path: root/fs/xfs
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /fs/xfs
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/Kconfig  85
-rw-r--r--  fs/xfs/Makefile  150
-rw-r--r--  fs/xfs/linux-2.6/kmem.c  134
-rw-r--r--  fs/xfs/linux-2.6/kmem.h  157
-rw-r--r--  fs/xfs/linux-2.6/mrlock.h  106
-rw-r--r--  fs/xfs/linux-2.6/mutex.h  53
-rw-r--r--  fs/xfs/linux-2.6/sema.h  67
-rw-r--r--  fs/xfs/linux-2.6/spin.h  56
-rw-r--r--  fs/xfs/linux-2.6/sv.h  89
-rw-r--r--  fs/xfs/linux-2.6/time.h  51
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c  1275
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c  1980
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h  591
-rw-r--r--  fs/xfs/linux-2.6/xfs_cred.h  50
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.c  205
-rw-r--r--  fs/xfs/linux-2.6/xfs_export.h  122
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c  573
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.c  124
-rw-r--r--  fs/xfs/linux-2.6/xfs_fs_subr.h  48
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.c  74
-rw-r--r--  fs/xfs/linux-2.6/xfs_globals.h  44
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl.c  1336
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.c  163
-rw-r--r--  fs/xfs/linux-2.6/xfs_ioctl32.h  34
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c  680
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.h  51
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h  374
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c  1082
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h  116
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c  132
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.h  166
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c  912
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h  138
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c  174
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.h  114
-rw-r--r--  fs/xfs/linux-2.6/xfs_version.h  44
-rw-r--r--  fs/xfs/linux-2.6/xfs_vfs.c  330
-rw-r--r--  fs/xfs/linux-2.6/xfs_vfs.h  223
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.c  455
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h  666
-rw-r--r--  fs/xfs/quota/xfs_dquot.c  1648
-rw-r--r--  fs/xfs/quota/xfs_dquot.h  224
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.c  715
-rw-r--r--  fs/xfs/quota/xfs_dquot_item.h  66
-rw-r--r--  fs/xfs/quota/xfs_qm.c  2848
-rw-r--r--  fs/xfs/quota/xfs_qm.h  236
-rw-r--r--  fs/xfs/quota/xfs_qm_bhv.c  410
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c  149
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.h  68
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c  1458
-rw-r--r--  fs/xfs/quota/xfs_quota_priv.h  192
-rw-r--r--  fs/xfs/quota/xfs_trans_dquot.c  941
-rw-r--r--  fs/xfs/support/debug.c  127
-rw-r--r--  fs/xfs/support/debug.h  72
-rw-r--r--  fs/xfs/support/ktrace.c  346
-rw-r--r--  fs/xfs/support/ktrace.h  101
-rw-r--r--  fs/xfs/support/move.c  66
-rw-r--r--  fs/xfs/support/move.h  84
-rw-r--r--  fs/xfs/support/qsort.c  155
-rw-r--r--  fs/xfs/support/qsort.h  41
-rw-r--r--  fs/xfs/support/uuid.c  151
-rw-r--r--  fs/xfs/support/uuid.h  48
-rw-r--r--  fs/xfs/xfs.h  40
-rw-r--r--  fs/xfs/xfs_acl.c  937
-rw-r--r--  fs/xfs/xfs_acl.h  116
-rw-r--r--  fs/xfs/xfs_ag.h  345
-rw-r--r--  fs/xfs/xfs_alloc.c  2623
-rw-r--r--  fs/xfs/xfs_alloc.h  203
-rw-r--r--  fs/xfs/xfs_alloc_btree.c  2204
-rw-r--r--  fs/xfs/xfs_alloc_btree.h  257
-rw-r--r--  fs/xfs/xfs_arch.h  213
-rw-r--r--  fs/xfs/xfs_attr.c  2660
-rw-r--r--  fs/xfs/xfs_attr.h  193
-rw-r--r--  fs/xfs/xfs_attr_leaf.c  3050
-rw-r--r--  fs/xfs/xfs_attr_leaf.h  308
-rw-r--r--  fs/xfs/xfs_attr_sf.h  149
-rw-r--r--  fs/xfs/xfs_behavior.c  218
-rw-r--r--  fs/xfs/xfs_behavior.h  204
-rw-r--r--  fs/xfs/xfs_bit.c  312
-rw-r--r--  fs/xfs/xfs_bit.h  85
-rw-r--r--  fs/xfs/xfs_bmap.c  6246
-rw-r--r--  fs/xfs/xfs_bmap.h  379
-rw-r--r--  fs/xfs/xfs_bmap_btree.c  2807
-rw-r--r--  fs/xfs/xfs_bmap_btree.h  701
-rw-r--r--  fs/xfs/xfs_btree.c  949
-rw-r--r--  fs/xfs/xfs_btree.h  592
-rw-r--r--  fs/xfs/xfs_buf_item.c  1221
-rw-r--r--  fs/xfs/xfs_buf_item.h  171
-rw-r--r--  fs/xfs/xfs_cap.h  84
-rw-r--r--  fs/xfs/xfs_clnt.h  110
-rw-r--r--  fs/xfs/xfs_da_btree.c  2648
-rw-r--r--  fs/xfs/xfs_da_btree.h  335
-rw-r--r--  fs/xfs/xfs_dfrag.c  387
-rw-r--r--  fs/xfs/xfs_dfrag.h  67
-rw-r--r--  fs/xfs/xfs_dinode.h  418
-rw-r--r--  fs/xfs/xfs_dir.c  1223
-rw-r--r--  fs/xfs/xfs_dir.h  154
-rw-r--r--  fs/xfs/xfs_dir2.c  859
-rw-r--r--  fs/xfs/xfs_dir2.h  109
-rw-r--r--  fs/xfs/xfs_dir2_block.c  1248
-rw-r--r--  fs/xfs/xfs_dir2_block.h  126
-rw-r--r--  fs/xfs/xfs_dir2_data.c  855
-rw-r--r--  fs/xfs/xfs_dir2_data.h  231
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c  1896
-rw-r--r--  fs/xfs/xfs_dir2_leaf.h  360
-rw-r--r--  fs/xfs/xfs_dir2_node.c  2020
-rw-r--r--  fs/xfs/xfs_dir2_node.h  159
-rw-r--r--  fs/xfs/xfs_dir2_sf.c  1317
-rw-r--r--  fs/xfs/xfs_dir2_sf.h  243
-rw-r--r--  fs/xfs/xfs_dir2_trace.c  235
-rw-r--r--  fs/xfs/xfs_dir2_trace.h  86
-rw-r--r--  fs/xfs/xfs_dir_leaf.c  2231
-rw-r--r--  fs/xfs/xfs_dir_leaf.h  248
-rw-r--r--  fs/xfs/xfs_dir_sf.h  172
-rw-r--r--  fs/xfs/xfs_dmapi.h  212
-rw-r--r--  fs/xfs/xfs_dmops.c  52
-rw-r--r--  fs/xfs/xfs_error.c  327
-rw-r--r--  fs/xfs/xfs_error.h  196
-rw-r--r--  fs/xfs/xfs_extfree_item.c  668
-rw-r--r--  fs/xfs/xfs_extfree_item.h  123
-rw-r--r--  fs/xfs/xfs_fs.h  527
-rw-r--r--  fs/xfs/xfs_fsops.c  616
-rw-r--r--  fs/xfs/xfs_fsops.h  67
-rw-r--r--  fs/xfs/xfs_ialloc.c  1401
-rw-r--r--  fs/xfs/xfs_ialloc.h  184
-rw-r--r--  fs/xfs/xfs_ialloc_btree.c  2094
-rw-r--r--  fs/xfs/xfs_ialloc_btree.h  314
-rw-r--r--  fs/xfs/xfs_iget.c  1022
-rw-r--r--  fs/xfs/xfs_imap.h  54
-rw-r--r--  fs/xfs/xfs_inode.c  3876
-rw-r--r--  fs/xfs/xfs_inode.h  554
-rw-r--r--  fs/xfs/xfs_inode_item.c  1092
-rw-r--r--  fs/xfs/xfs_inode_item.h  197
-rw-r--r--  fs/xfs/xfs_inum.h  173
-rw-r--r--  fs/xfs/xfs_iocore.c  133
-rw-r--r--  fs/xfs/xfs_iomap.c  1000
-rw-r--r--  fs/xfs/xfs_iomap.h  107
-rw-r--r--  fs/xfs/xfs_itable.c  858
-rw-r--r--  fs/xfs/xfs_itable.h  106
-rw-r--r--  fs/xfs/xfs_log.c  3560
-rw-r--r--  fs/xfs/xfs_log.h  182
-rw-r--r--  fs/xfs/xfs_log_priv.h  561
-rw-r--r--  fs/xfs/xfs_log_recover.c  4098
-rw-r--r--  fs/xfs/xfs_log_recover.h  81
-rw-r--r--  fs/xfs/xfs_mac.h  120
-rw-r--r--  fs/xfs/xfs_macros.c  2136
-rw-r--r--  fs/xfs/xfs_macros.h  104
-rw-r--r--  fs/xfs/xfs_mount.c  1586
-rw-r--r--  fs/xfs/xfs_mount.h  573
-rw-r--r--  fs/xfs/xfs_qmops.c  71
-rw-r--r--  fs/xfs/xfs_quota.h  356
-rw-r--r--  fs/xfs/xfs_refcache.h  66
-rw-r--r--  fs/xfs/xfs_rename.c  673
-rw-r--r--  fs/xfs/xfs_rtalloc.c  2469
-rw-r--r--  fs/xfs/xfs_rtalloc.h  187
-rw-r--r--  fs/xfs/xfs_rw.c  356
-rw-r--r--  fs/xfs/xfs_rw.h  154
-rw-r--r--  fs/xfs/xfs_sb.h  583
-rw-r--r--  fs/xfs/xfs_trans.c  1315
-rw-r--r--  fs/xfs/xfs_trans.h  1042
-rw-r--r--  fs/xfs/xfs_trans_ail.c  596
-rw-r--r--  fs/xfs/xfs_trans_buf.c  1093
-rw-r--r--  fs/xfs/xfs_trans_extfree.c  156
-rw-r--r--  fs/xfs/xfs_trans_inode.c  342
-rw-r--r--  fs/xfs/xfs_trans_item.c  553
-rw-r--r--  fs/xfs/xfs_trans_priv.h  73
-rw-r--r--  fs/xfs/xfs_trans_space.h  105
-rw-r--r--  fs/xfs/xfs_types.h  182
-rw-r--r--  fs/xfs/xfs_utils.c  488
-rw-r--r--  fs/xfs/xfs_utils.h  52
-rw-r--r--  fs/xfs/xfs_vfsops.c  1941
-rw-r--r--  fs/xfs/xfs_vnodeops.c  4712
172 files changed, 114893 insertions, 0 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
new file mode 100644
index 000000000000..c92306f0fdc5
--- /dev/null
+++ b/fs/xfs/Kconfig
@@ -0,0 +1,85 @@
menu "XFS support"

config XFS_FS
	tristate "XFS filesystem support"
	select EXPORTFS if NFSD!=n
	help
	  XFS is a high performance journaling filesystem which originated
	  on the SGI IRIX platform. It is completely multi-threaded, can
	  support large files and large filesystems, extended attributes,
	  variable block sizes, is extent based, and makes extensive use of
	  Btrees (directories, extents, free space) to aid both performance
	  and scalability.

	  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
	  for complete details. This implementation is on-disk compatible
	  with the IRIX version of XFS.

	  To compile this file system support as a module, choose M here: the
	  module will be called xfs. Be aware, however, that if the file
	  system of your root partition is compiled as a module, you'll need
	  to use an initial ramdisk (initrd) to boot.

config XFS_EXPORT
	bool
	default y if XFS_FS && EXPORTFS

config XFS_RT
	bool "Realtime support (EXPERIMENTAL)"
	depends on XFS_FS && EXPERIMENTAL
	help
	  If you say Y here you will be able to mount and use XFS filesystems
	  which contain a realtime subvolume. The realtime subvolume is a
	  separate area of disk space where only file data is stored. The
	  realtime subvolume is designed to provide very deterministic
	  data rates suitable for media streaming applications.

	  See the xfs man page in section 5 for a bit more information.

	  This feature is unsupported at this time, is not yet fully
	  functional, and may cause serious problems.

	  If unsure, say N.

config XFS_QUOTA
	bool "Quota support"
	depends on XFS_FS
	help
	  If you say Y here, you will be able to set limits for disk usage on
	  a per user and/or a per group basis under XFS. XFS considers quota
	  information as filesystem metadata and uses journaling to provide a
	  higher level guarantee of consistency. The on-disk data format for
	  quota is also compatible with the IRIX version of XFS, allowing a
	  filesystem to be migrated between Linux and IRIX without any need
	  for conversion.

	  If unsure, say N. More comprehensive documentation can be found in
	  README.quota in the xfsprogs package. XFS quota can be used either
	  with or without the generic quota support enabled (CONFIG_QUOTA) -
	  they are completely independent subsystems.

config XFS_SECURITY
	bool "Security Label support"
	depends on XFS_FS
	help
	  Security labels support alternative access control models
	  implemented by security modules like SELinux. This option
	  enables an extended attribute namespace for inode security
	  labels in the XFS filesystem.

	  If you are not using a security module that requires using
	  extended attributes for inode security labels, say N.

config XFS_POSIX_ACL
	bool "POSIX ACL support"
	depends on XFS_FS
	help
	  POSIX Access Control Lists (ACLs) support permissions for users and
	  groups beyond the owner/group/world scheme.

	  To learn more about Access Control Lists, visit the POSIX ACLs for
	  Linux website <http://acl.bestbits.at/>.

	  If you don't know what Access Control Lists are, say N.

endmenu
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
new file mode 100644
index 000000000000..554e4a18c152
--- /dev/null
+++ b/fs/xfs/Makefile
@@ -0,0 +1,150 @@
#
# Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like. Any license provided herein, whether implied or
# otherwise, applies only to this software file. Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write the Free Software Foundation, Inc., 59
# Temple Place - Suite 330, Boston MA 02111-1307, USA.
#
# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
# Mountain View, CA 94043, or:
#
# http://www.sgi.com
#
# For further information regarding this notice, see:
#
# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
#

EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char

ifeq ($(CONFIG_XFS_DEBUG),y)
	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
	EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
endif
ifeq ($(CONFIG_XFS_TRACE),y)
	EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
	EXTRA_CFLAGS += -DXFS_ATTR_TRACE
	EXTRA_CFLAGS += -DXFS_BLI_TRACE
	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
	EXTRA_CFLAGS += -DXFS_DIR_TRACE
	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
	EXTRA_CFLAGS += -DXFS_LOG_TRACE
	EXTRA_CFLAGS += -DXFS_RW_TRACE
	EXTRA_CFLAGS += -DPAGEBUF_TRACE
	# EXTRA_CFLAGS += -DXFS_VNODE_TRACE
endif

obj-$(CONFIG_XFS_FS) += xfs.o

xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \
	xfs_dquot.o \
	xfs_dquot_item.o \
	xfs_trans_dquot.o \
	xfs_qm_syscalls.o \
	xfs_qm_bhv.o \
	xfs_qm.o)
ifeq ($(CONFIG_XFS_QUOTA),y)
xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o
endif

xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += linux-2.6/xfs_stats.o
xfs-$(CONFIG_SYSCTL) += linux-2.6/xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += linux-2.6/xfs_ioctl32.o
xfs-$(CONFIG_XFS_EXPORT) += linux-2.6/xfs_export.o


xfs-y += xfs_alloc.o \
	xfs_alloc_btree.o \
	xfs_attr.o \
	xfs_attr_leaf.o \
	xfs_behavior.o \
	xfs_bit.o \
	xfs_bmap.o \
	xfs_bmap_btree.o \
	xfs_btree.o \
	xfs_buf_item.o \
	xfs_da_btree.o \
	xfs_dir.o \
	xfs_dir2.o \
	xfs_dir2_block.o \
	xfs_dir2_data.o \
	xfs_dir2_leaf.o \
	xfs_dir2_node.o \
	xfs_dir2_sf.o \
	xfs_dir_leaf.o \
	xfs_error.o \
	xfs_extfree_item.o \
	xfs_fsops.o \
	xfs_ialloc.o \
	xfs_ialloc_btree.o \
	xfs_iget.o \
	xfs_inode.o \
	xfs_inode_item.o \
	xfs_iocore.o \
	xfs_iomap.o \
	xfs_itable.o \
	xfs_dfrag.o \
	xfs_log.o \
	xfs_log_recover.o \
	xfs_macros.o \
	xfs_mount.o \
	xfs_rename.o \
	xfs_trans.o \
	xfs_trans_ail.o \
	xfs_trans_buf.o \
	xfs_trans_extfree.o \
	xfs_trans_inode.o \
	xfs_trans_item.o \
	xfs_utils.o \
	xfs_vfsops.o \
	xfs_vnodeops.o \
	xfs_rw.o \
	xfs_dmops.o \
	xfs_qmops.o

xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o

# Objects in linux-2.6/
xfs-y += $(addprefix linux-2.6/, \
	kmem.o \
	xfs_aops.o \
	xfs_buf.o \
	xfs_file.o \
	xfs_fs_subr.o \
	xfs_globals.o \
	xfs_ioctl.o \
	xfs_iops.o \
	xfs_lrw.o \
	xfs_super.o \
	xfs_vfs.o \
	xfs_vnode.o)

# Objects in support/
xfs-y += $(addprefix support/, \
	debug.o \
	move.o \
	qsort.o \
	uuid.o)

xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o

diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
new file mode 100644
index 000000000000..364ea8c386b1
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -0,0 +1,134 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/blkdev.h>

#include "time.h"
#include "kmem.h"

#define MAX_VMALLOCS	6
#define MAX_SLAB_SIZE	0x20000


void *
kmem_alloc(size_t size, int flags)
{
	int	retries = 0;
	int	lflags = kmem_flags_convert(flags);
	void	*ptr;

	do {
		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
			ptr = kmalloc(size, lflags);
		else
			ptr = __vmalloc(size, lflags, PAGE_KERNEL);
		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
			return ptr;
		if (!(++retries % 100))
			printk(KERN_ERR "XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, lflags);
		blk_congestion_wait(WRITE, HZ/50);
	} while (1);
}

void *
kmem_zalloc(size_t size, int flags)
{
	void	*ptr;

	ptr = kmem_alloc(size, flags);
	if (ptr)
		memset((char *)ptr, 0, (int)size);
	return ptr;
}

void
kmem_free(void *ptr, size_t size)
{
	if (((unsigned long)ptr < VMALLOC_START) ||
	    ((unsigned long)ptr >= VMALLOC_END)) {
		kfree(ptr);
	} else {
		vfree(ptr);
	}
}

void *
kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
{
	void	*new;

	new = kmem_alloc(newsize, flags);
	if (ptr) {
		if (new)
			memcpy(new, ptr,
				((oldsize < newsize) ? oldsize : newsize));
		kmem_free(ptr, oldsize);
	}
	return new;
}

void *
kmem_zone_alloc(kmem_zone_t *zone, int flags)
{
	int	retries = 0;
	int	lflags = kmem_flags_convert(flags);
	void	*ptr;

	do {
		ptr = kmem_cache_alloc(zone, lflags);
		if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
			return ptr;
		if (!(++retries % 100))
			printk(KERN_ERR "XFS: possible memory allocation "
					"deadlock in %s (mode:0x%x)\n",
					__FUNCTION__, lflags);
		blk_congestion_wait(WRITE, HZ/50);
	} while (1);
}

void *
kmem_zone_zalloc(kmem_zone_t *zone, int flags)
{
	void	*ptr;

	ptr = kmem_zone_alloc(zone, flags);
	if (ptr)
		memset((char *)ptr, 0, kmem_cache_size(zone));
	return ptr;
}
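
A minimal caller sketch for the wrappers above (the function and buffer here are hypothetical, not part of this commit): kmem_alloc() with plain KM_SLEEP retries forever rather than failing, so a caller that can tolerate failure passes KM_MAYFAIL and checks for NULL.

	/* Hedged example, assuming a caller that may fail gracefully. */
	static int example_scratch(size_t len)
	{
		/* KM_SLEEP | KM_MAYFAIL: may block, but NULL is possible. */
		void	*buf = kmem_zalloc(len, KM_SLEEP | KM_MAYFAIL);

		if (!buf)
			return -ENOMEM;
		/* ... use buf ... */
		kmem_free(buf, len);	/* size kept for the IRIX-style API;
					 * this Linux shim ignores it */
		return 0;
	}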
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
new file mode 100644
index 000000000000..1397b669b059
--- /dev/null
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -0,0 +1,157 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_KMEM_H__
#define __XFS_SUPPORT_KMEM_H__

#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>

/*
 * memory management routines
 */
#define KM_SLEEP	0x0001
#define KM_NOSLEEP	0x0002
#define KM_NOFS		0x0004
#define KM_MAYFAIL	0x0008

#define kmem_zone	kmem_cache_s
#define kmem_zone_t	kmem_cache_t

typedef unsigned long xfs_pflags_t;

#define PFLAGS_TEST_NOIO()	(current->flags & PF_NOIO)
#define PFLAGS_TEST_FSTRANS()	(current->flags & PF_FSTRANS)

#define PFLAGS_SET_NOIO() do {		\
	current->flags |= PF_NOIO;	\
} while (0)

#define PFLAGS_CLEAR_NOIO() do {	\
	current->flags &= ~PF_NOIO;	\
} while (0)

/* these could be nested, so we save state */
#define PFLAGS_SET_FSTRANS(STATEP) do {	\
	*(STATEP) = current->flags;	\
	current->flags |= PF_FSTRANS;	\
} while (0)

#define PFLAGS_CLEAR_FSTRANS(STATEP) do { \
	*(STATEP) = current->flags;	\
	current->flags &= ~PF_FSTRANS;	\
} while (0)

/* Restore the PF_FSTRANS state to what was saved in STATEP */
#define PFLAGS_RESTORE_FSTRANS(STATEP) do {		\
	current->flags = ((current->flags & ~PF_FSTRANS) |	\
			  (*(STATEP) & PF_FSTRANS));		\
} while (0)

#define PFLAGS_DUP(OSTATEP, NSTATEP) do { \
	*(NSTATEP) = *(OSTATEP);	\
} while (0)

static __inline unsigned int kmem_flags_convert(int flags)
{
	int lflags = __GFP_NOWARN;	/* we'll report problems, if need be */

#ifdef DEBUG
	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
		printk(KERN_WARNING
		    "XFS: memory allocation with wrong flags (%x)\n", flags);
		BUG();
	}
#endif

	if (flags & KM_NOSLEEP) {
		lflags |= GFP_ATOMIC;
	} else {
		lflags |= GFP_KERNEL;

		/* avoid recursive callbacks to filesystem during transactions */
		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
			lflags &= ~__GFP_FS;
	}

	return lflags;
}

static __inline kmem_zone_t *
kmem_zone_init(int size, char *zone_name)
{
	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
}

static __inline void
kmem_zone_free(kmem_zone_t *zone, void *ptr)
{
	kmem_cache_free(zone, ptr);
}

static __inline void
kmem_zone_destroy(kmem_zone_t *zone)
{
	if (zone && kmem_cache_destroy(zone))
		BUG();
}

extern void *kmem_zone_zalloc(kmem_zone_t *, int);
extern void *kmem_zone_alloc(kmem_zone_t *, int);

extern void *kmem_alloc(size_t, int);
extern void *kmem_realloc(void *, size_t, size_t, int);
extern void *kmem_zalloc(size_t, int);
extern void  kmem_free(void *, size_t);

typedef struct shrinker *kmem_shaker_t;
typedef int (*kmem_shake_func_t)(int, unsigned int);

static __inline kmem_shaker_t
kmem_shake_register(kmem_shake_func_t sfunc)
{
	return set_shrinker(DEFAULT_SEEKS, sfunc);
}

static __inline void
kmem_shake_deregister(kmem_shaker_t shrinker)
{
	remove_shrinker(shrinker);
}

static __inline int
kmem_shake_allow(unsigned int gfp_mask)
{
	return (gfp_mask & __GFP_WAIT);
}

#endif /* __XFS_SUPPORT_KMEM_H__ */
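
The PFLAGS_*_FSTRANS macros save the whole task-flags word precisely so transaction scopes can nest. A minimal sketch of the intended pattern (function name hypothetical, not from this commit):

	static void example_fstrans_scope(void)
	{
		xfs_pflags_t	saved;

		/* Sets PF_FSTRANS and remembers the previous flags word;
		 * kmem_flags_convert() now strips __GFP_FS from allocations. */
		PFLAGS_SET_FSTRANS(&saved);
		/* ... transaction work; allocations behave as GFP_NOFS ... */
		PFLAGS_RESTORE_FSTRANS(&saved);	/* nest-safe: restores prior state */
	}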
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
new file mode 100644
index 000000000000..d2c11a098ff2
--- /dev/null
+++ b/fs/xfs/linux-2.6/mrlock.h
@@ -0,0 +1,106 @@
/*
 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_MRLOCK_H__
#define __XFS_SUPPORT_MRLOCK_H__

#include <linux/rwsem.h>

enum { MR_NONE, MR_ACCESS, MR_UPDATE };

typedef struct {
	struct rw_semaphore	mr_lock;
	int			mr_writer;
} mrlock_t;

#define mrinit(mrp, name)	\
	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
#define mrfree(mrp)		do { } while (0)
#define mraccess(mrp)		mraccessf(mrp, 0)
#define mrupdate(mrp)		mrupdatef(mrp, 0)

static inline void mraccessf(mrlock_t *mrp, int flags)
{
	down_read(&mrp->mr_lock);
}

static inline void mrupdatef(mrlock_t *mrp, int flags)
{
	down_write(&mrp->mr_lock);
	mrp->mr_writer = 1;
}

static inline int mrtryaccess(mrlock_t *mrp)
{
	return down_read_trylock(&mrp->mr_lock);
}

static inline int mrtryupdate(mrlock_t *mrp)
{
	if (!down_write_trylock(&mrp->mr_lock))
		return 0;
	mrp->mr_writer = 1;
	return 1;
}

static inline void mrunlock(mrlock_t *mrp)
{
	if (mrp->mr_writer) {
		mrp->mr_writer = 0;
		up_write(&mrp->mr_lock);
	} else {
		up_read(&mrp->mr_lock);
	}
}

static inline void mrdemote(mrlock_t *mrp)
{
	mrp->mr_writer = 0;
	downgrade_write(&mrp->mr_lock);
}

#ifdef DEBUG
/*
 * Debug-only routine; without some platform-specific asm code we can
 * only answer requests about whether we hold the lock for write
 * (reader state is outside our visibility, we only track writer state).
 * Note: this means !ismrlocked would give false positives, so don't do that.
 */
static inline int ismrlocked(mrlock_t *mrp, int type)
{
	if (mrp && type == MR_UPDATE)
		return mrp->mr_writer;
	return 1;
}
#endif

#endif /* __XFS_SUPPORT_MRLOCK_H__ */
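
A sketch of the write-then-demote pattern these helpers support (the caller is hypothetical); mrunlock() keys off mr_writer to choose between up_write() and up_read():

	static void example_mrlock(mrlock_t *mrp)
	{
		mrupdate(mrp);	/* exclusive: down_write(), mr_writer = 1 */
		/* ... modify the protected structure ... */
		mrdemote(mrp);	/* mr_writer = 0, downgrade_write(): readers admitted */
		/* ... continue with read-only access ... */
		mrunlock(mrp);	/* mr_writer is 0 here, so this is up_read() */
	}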
diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h
new file mode 100644
index 000000000000..0b296bb944cb
--- /dev/null
+++ b/fs/xfs/linux-2.6/mutex.h
@@ -0,0 +1,53 @@
/*
 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_MUTEX_H__
#define __XFS_SUPPORT_MUTEX_H__

#include <linux/spinlock.h>
#include <asm/semaphore.h>

/*
 * Map the mutexes from IRIX to Linux semaphores.
 *
 * mutex_destroy simply initializes the count to -99, which should
 * block all other callers.
 */
#define MUTEX_DEFAULT		0x0
typedef struct semaphore	mutex_t;

#define mutex_init(lock, type, name)	sema_init(lock, 1)
#define mutex_destroy(lock)		sema_init(lock, -99)
#define mutex_lock(lock, num)		down(lock)
#define mutex_trylock(lock)		(down_trylock(lock) ? 0 : 1)
#define mutex_unlock(lock)		up(lock)

#endif /* __XFS_SUPPORT_MUTEX_H__ */
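
Since the IRIX-style mutex maps onto a counting semaphore initialised to 1, locking is plain down()/up(); a minimal sketch (names hypothetical):

	static mutex_t	example_lock;

	static void example_critical_section(void)
	{
		mutex_init(&example_lock, MUTEX_DEFAULT, "example");
		mutex_lock(&example_lock, 0);	/* second arg is discarded by the macro */
		/* ... critical section ... */
		mutex_unlock(&example_lock);
	}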
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
new file mode 100644
index 000000000000..30b67b4e1cbf
--- /dev/null
+++ b/fs/xfs/linux-2.6/sema.h
@@ -0,0 +1,67 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SEMA_H__
#define __XFS_SUPPORT_SEMA_H__

#include <linux/time.h>
#include <linux/wait.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>

/*
 * sema_t structure just maps to struct semaphore in the Linux kernel.
 */

typedef struct semaphore sema_t;

#define init_sema(sp, val, c, d)	sema_init(sp, val)
#define initsema(sp, val)		sema_init(sp, val)
#define initnsema(sp, val, name)	sema_init(sp, val)
#define psema(sp, b)			down(sp)
#define vsema(sp)			up(sp)
#define valusema(sp)			(atomic_read(&(sp)->count))
#define freesema(sema)

/*
 * Map cpsema (try to get the sema) to down_trylock. We need to switch
 * the return values, since cpsema returns 1 (acquired) / 0 (failed) and
 * down_trylock returns the reverse: 0 (acquired) / 1 (failed).
 */

#define cpsema(sp)	(down_trylock(sp) ? 0 : 1)

/*
 * Didn't do cvsema(sp). Not sure how to map this to up/down/...
 * It does a vsema if the value is < 0, otherwise nothing.
 */

#endif /* __XFS_SUPPORT_SEMA_H__ */
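
The cpsema() inversion matters at call sites: 1 means acquired, 0 means busy. A minimal sketch (hypothetical caller):

	static void example_try_acquire(sema_t *sp)
	{
		if (cpsema(sp)) {	/* down_trylock() succeeded */
			/* ... work under the semaphore ... */
			vsema(sp);	/* release */
		}
		/* else: semaphore busy, caller does not block */
	}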
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
new file mode 100644
index 000000000000..bcf60a0b8df0
--- /dev/null
+++ b/fs/xfs/linux-2.6/spin.h
@@ -0,0 +1,56 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SPIN_H__
#define __XFS_SUPPORT_SPIN_H__

#include <linux/sched.h>	/* preempt needs this */
#include <linux/spinlock.h>

/*
 * Map lock_t from IRIX to Linux spinlocks.
 *
 * We do not make use of lock_t from interrupt context, so we do not
 * have to worry about disabling interrupts at all (unlike IRIX).
 */

typedef spinlock_t lock_t;

#define SPLDECL(s)			unsigned long s

#define spinlock_init(lock, name)	spin_lock_init(lock)
#define spinlock_destroy(lock)
#define mutex_spinlock(lock)		({ spin_lock(lock); 0; })
#define mutex_spinunlock(lock, s)	do { spin_unlock(lock); (void)s; } while (0)
#define nested_spinlock(lock)		spin_lock(lock)
#define nested_spinunlock(lock)		spin_unlock(lock)

#endif /* __XFS_SUPPORT_SPIN_H__ */
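
mutex_spinlock() evaluates to a dummy 0 standing in for the saved interrupt state that IRIX callers expect, and SPLDECL() declares the variable receiving it; a minimal sketch (hypothetical caller):

	static void example_spin(lock_t *lock)
	{
		SPLDECL(s);			/* expands to: unsigned long s */

		s = mutex_spinlock(lock);	/* just spin_lock(); s becomes 0 */
		/* ... critical section, no interrupt masking needed ... */
		mutex_spinunlock(lock, s);	/* spin_unlock(); s is ignored */
	}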
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
new file mode 100644
index 000000000000..821d3167e05b
--- /dev/null
+++ b/fs/xfs/linux-2.6/sv.h
@@ -0,0 +1,89 @@
/*
 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_SV_H__
#define __XFS_SUPPORT_SV_H__

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/*
 * Synchronisation variables.
 *
 * (Parameters "pri", "svf" and "rts" are not implemented)
 */

typedef struct sv_s {
	wait_queue_head_t waiters;
} sv_t;

#define SV_FIFO		0x0		/* sv_t is FIFO type */
#define SV_LIFO		0x2		/* sv_t is LIFO type */
#define SV_PRIO		0x4		/* sv_t is PRIO type */
#define SV_KEYED	0x6		/* sv_t is KEYED type */
#define SV_DEFAULT	SV_FIFO


static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
			    unsigned long timeout)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sv->waiters, &wait);
	__set_current_state(state);
	spin_unlock(lock);

	schedule_timeout(timeout);

	remove_wait_queue(&sv->waiters, &wait);
}

#define init_sv(sv,type,name,flag) \
	init_waitqueue_head(&(sv)->waiters)
#define sv_init(sv,flag,name) \
	init_waitqueue_head(&(sv)->waiters)
#define sv_destroy(sv) \
	/*NOTHING*/
#define sv_wait(sv, pri, lock, s) \
	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
#define sv_wait_sig(sv, pri, lock, s) \
	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
#define sv_signal(sv) \
	wake_up(&(sv)->waiters)
#define sv_broadcast(sv) \
	wake_up_all(&(sv)->waiters)

#endif /* __XFS_SUPPORT_SV_H__ */
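
Because _sv_wait() drops the caller's spinlock before sleeping, the classic usage re-takes the lock and re-checks the condition in a loop; a minimal sketch of that pattern (names hypothetical):

	static void example_wait_for(sv_t *sv, spinlock_t *lock, int *cond)
	{
		spin_lock(lock);
		while (!*cond) {
			sv_wait(sv, 0, lock, 0);	/* unlocks, then sleeps */
			spin_lock(lock);		/* re-acquire after wakeup */
		}
		spin_unlock(lock);
	}

	/* Waker side: set *cond under the lock, then call sv_signal(sv). */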
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h
new file mode 100644
index 000000000000..6c6fd0faa8e1
--- /dev/null
+++ b/fs/xfs/linux-2.6/time.h
@@ -0,0 +1,51 @@
/*
 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
#ifndef __XFS_SUPPORT_TIME_H__
#define __XFS_SUPPORT_TIME_H__

#include <linux/sched.h>
#include <linux/time.h>

typedef struct timespec timespec_t;

static inline void delay(long ticks)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_timeout(ticks);
}

static inline void nanotime(struct timespec *tvp)
{
	*tvp = CURRENT_TIME;
}

#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
new file mode 100644
index 000000000000..76a84758073a
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -0,0 +1,1275 @@
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like. Any license provided herein, whether implied or
 * otherwise, applies only to this software file. Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include "xfs.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_trans.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_rw.h"
#include "xfs_iomap.h"
#include <linux/mpage.h>
#include <linux/writeback.h>

STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *,
		struct writeback_control *wbc, void *, int, int);

#if defined(XFS_RW_TRACE)
void
xfs_page_trace(
	int		tag,
	struct inode	*inode,
	struct page	*page,
	int		mask)
{
	xfs_inode_t	*ip;
	bhv_desc_t	*bdp;
	vnode_t		*vp = LINVFS_GET_VP(inode);
	loff_t		isize = i_size_read(inode);
	loff_t		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
	int		delalloc = -1, unmapped = -1, unwritten = -1;

	if (page_has_buffers(page))
		xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);

	bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
	ip = XFS_BHVTOI(bdp);
	if (!ip->i_rwtrace)
		return;

	ktrace_enter(ip->i_rwtrace,
		(void *)((unsigned long)tag),
		(void *)ip,
		(void *)inode,
		(void *)page,
		(void *)((unsigned long)mask),
		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
		(void *)((unsigned long)(isize & 0xffffffff)),
		(void *)((unsigned long)((offset >> 32) & 0xffffffff)),
		(void *)((unsigned long)(offset & 0xffffffff)),
		(void *)((unsigned long)delalloc),
		(void *)((unsigned long)unmapped),
		(void *)((unsigned long)unwritten),
		(void *)NULL,
		(void *)NULL);
}
#else
#define xfs_page_trace(tag, inode, page, mask)
#endif

void
linvfs_unwritten_done(
	struct buffer_head	*bh,
	int			uptodate)
{
	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;

	ASSERT(buffer_unwritten(bh));
	bh->b_end_io = NULL;
	clear_buffer_unwritten(bh);
	if (!uptodate)
		pagebuf_ioerror(pb, EIO);
	if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
		pagebuf_iodone(pb, 1, 1);
	}
	end_buffer_async_write(bh, uptodate);
}

/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (buffered IO).
 */
STATIC void
linvfs_unwritten_convert(
	xfs_buf_t	*bp)
{
	vnode_t		*vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
	int		error;

	BUG_ON(atomic_read(&bp->pb_hold) < 1);
	VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
			BMAPI_UNWRITTEN, NULL, NULL, error);
	XFS_BUF_SET_FSPRIVATE(bp, NULL);
	XFS_BUF_CLR_IODONE_FUNC(bp);
	XFS_BUF_UNDATAIO(bp);
	iput(LINVFS_GET_IP(vp));
	pagebuf_iodone(bp, 0, 0);
}

/*
 * Issue transactions to convert a buffer range from unwritten
 * to written extents (direct IO).
 */
STATIC void
linvfs_unwritten_convert_direct(
	struct inode	*inode,
	loff_t		offset,
	ssize_t		size,
	void		*private)
{
	ASSERT(!private || inode == (struct inode *)private);

	/* private indicates an unwritten extent lay beneath this IO */
	if (private && size > 0) {
		vnode_t	*vp = LINVFS_GET_VP(inode);
		int	error;

		VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
	}
}

STATIC int
xfs_map_blocks(
	struct inode		*inode,
	loff_t			offset,
	ssize_t			count,
	xfs_iomap_t		*mapp,
	int			flags)
{
	vnode_t			*vp = LINVFS_GET_VP(inode);
	int			error, nmaps = 1;

	VOP_BMAP(vp, offset, count, flags, mapp, &nmaps, error);
	if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
		VMODIFY(vp);
	return -error;
}

/*
 * Finds the mapping in the @iomapp block map that corresponds to the
 * given @offset within @page.
 */
STATIC xfs_iomap_t *
xfs_offset_to_map(
	struct page		*page,
	xfs_iomap_t		*iomapp,
	unsigned long		offset)
{
	loff_t			full_offset;	/* offset from start of file */

	ASSERT(offset < PAGE_CACHE_SIZE);

	full_offset = page->index;		/* NB: using 64bit number */
	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
	full_offset += offset;			/* offset from page start */

	if (full_offset < iomapp->iomap_offset)
		return NULL;
	if (iomapp->iomap_offset + (iomapp->iomap_bsize - 1) >= full_offset)
		return iomapp;
	return NULL;
}

STATIC void
xfs_map_at_offset(
	struct page		*page,
	struct buffer_head	*bh,
	unsigned long		offset,
	int			block_bits,
	xfs_iomap_t		*iomapp)
{
	xfs_daddr_t		bn;
	loff_t			delta;
	int			sector_shift;

	ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
	ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
	ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);

	delta = page->index;
	delta <<= PAGE_CACHE_SHIFT;
	delta += offset;
	delta -= iomapp->iomap_offset;
	delta >>= block_bits;

	sector_shift = block_bits - BBSHIFT;
	bn = iomapp->iomap_bn >> sector_shift;
	bn += delta;
	BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME));
	ASSERT((bn << sector_shift) >= iomapp->iomap_bn);

	lock_buffer(bh);
	bh->b_blocknr = bn;
	bh->b_bdev = iomapp->iomap_target->pbr_bdev;
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
}
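
/*
 * A worked example of the arithmetic above, with illustrative numbers
 * (not taken from this commit): for 4K filesystem blocks, block_bits = 12
 * and sector_shift = 12 - BBSHIFT = 3, since iomap_bn is kept in 512-byte
 * basic blocks. A buffer located 4096 bytes past iomap_offset gives
 * delta = 4096 >> 12 = 1, so b_blocknr = (iomap_bn >> 3) + 1: the extent's
 * start address converted to 4K units, plus one block.
 */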
244
245/*
246 * Look for a page at index which is unlocked and contains our
247 * unwritten extent flagged buffers at its head. Returns page
248 * locked and with an extra reference count, and length of the
249 * unwritten extent component on this page that we can write,
250 * in units of filesystem blocks.
251 */
252STATIC struct page *
253xfs_probe_unwritten_page(
254 struct address_space *mapping,
255 pgoff_t index,
256 xfs_iomap_t *iomapp,
257 xfs_buf_t *pb,
258 unsigned long max_offset,
259 unsigned long *fsbs,
260 unsigned int bbits)
261{
262 struct page *page;
263
264 page = find_trylock_page(mapping, index);
265 if (!page)
266 return NULL;
267 if (PageWriteback(page))
268 goto out;
269
270 if (page->mapping && page_has_buffers(page)) {
271 struct buffer_head *bh, *head;
272 unsigned long p_offset = 0;
273
274 *fsbs = 0;
275 bh = head = page_buffers(page);
276 do {
277 if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
278 break;
279 if (!xfs_offset_to_map(page, iomapp, p_offset))
280 break;
281 if (p_offset >= max_offset)
282 break;
283 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
284 set_buffer_unwritten_io(bh);
285 bh->b_private = pb;
286 p_offset += bh->b_size;
287 (*fsbs)++;
288 } while ((bh = bh->b_this_page) != head);
289
290 if (p_offset)
291 return page;
292 }
293
294out:
295 unlock_page(page);
296 return NULL;
297}
298
299/*
300 * Look for a page at index which is unlocked and not mapped
301 * yet - clustering for mmap write case.
302 */
303STATIC unsigned int
304xfs_probe_unmapped_page(
305 struct address_space *mapping,
306 pgoff_t index,
307 unsigned int pg_offset)
308{
309 struct page *page;
310 int ret = 0;
311
312 page = find_trylock_page(mapping, index);
313 if (!page)
314 return 0;
315 if (PageWriteback(page))
316 goto out;
317
318 if (page->mapping && PageDirty(page)) {
319 if (page_has_buffers(page)) {
320 struct buffer_head *bh, *head;
321
322 bh = head = page_buffers(page);
323 do {
324 if (buffer_mapped(bh) || !buffer_uptodate(bh))
325 break;
326 ret += bh->b_size;
327 if (ret >= pg_offset)
328 break;
329 } while ((bh = bh->b_this_page) != head);
330 } else
331 ret = PAGE_CACHE_SIZE;
332 }
333
334out:
335 unlock_page(page);
336 return ret;
337}
338
339STATIC unsigned int
340xfs_probe_unmapped_cluster(
341 struct inode *inode,
342 struct page *startpage,
343 struct buffer_head *bh,
344 struct buffer_head *head)
345{
346 pgoff_t tindex, tlast, tloff;
347 unsigned int pg_offset, len, total = 0;
348 struct address_space *mapping = inode->i_mapping;
349
350 /* First sum forwards in this page */
351 do {
352 if (buffer_mapped(bh))
353 break;
354 total += bh->b_size;
355 } while ((bh = bh->b_this_page) != head);
356
357 /* If we reached the end of the page, sum forwards in
358 * following pages.
359 */
360 if (bh == head) {
361 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
362 /* Prune this back to avoid pathological behavior */
363 tloff = min(tlast, startpage->index + 64);
364 for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
365 len = xfs_probe_unmapped_page(mapping, tindex,
366 PAGE_CACHE_SIZE);
367 if (!len)
368 return total;
369 total += len;
370 }
371 if (tindex == tlast &&
372 (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
373 total += xfs_probe_unmapped_page(mapping,
374 tindex, pg_offset);
375 }
376 }
377 return total;
378}
379
380/*
381 * Probe for a given page (index) in the inode and test if it is delayed
382 * and without unwritten buffers. Returns page locked and with an extra
383 * reference count.
384 */
385STATIC struct page *
386xfs_probe_delalloc_page(
387 struct inode *inode,
388 pgoff_t index)
389{
390 struct page *page;
391
392 page = find_trylock_page(inode->i_mapping, index);
393 if (!page)
394 return NULL;
395 if (PageWriteback(page))
396 goto out;
397
398 if (page->mapping && page_has_buffers(page)) {
399 struct buffer_head *bh, *head;
400 int acceptable = 0;
401
402 bh = head = page_buffers(page);
403 do {
404 if (buffer_unwritten(bh)) {
405 acceptable = 0;
406 break;
407 } else if (buffer_delay(bh)) {
408 acceptable = 1;
409 }
410 } while ((bh = bh->b_this_page) != head);
411
412 if (acceptable)
413 return page;
414 }
415
416out:
417 unlock_page(page);
418 return NULL;
419}
420
421STATIC int
422xfs_map_unwritten(
423 struct inode *inode,
424 struct page *start_page,
425 struct buffer_head *head,
426 struct buffer_head *curr,
427 unsigned long p_offset,
428 int block_bits,
429 xfs_iomap_t *iomapp,
430 struct writeback_control *wbc,
431 int startio,
432 int all_bh)
433{
434 struct buffer_head *bh = curr;
435 xfs_iomap_t *tmp;
436 xfs_buf_t *pb;
437 loff_t offset, size;
438 unsigned long nblocks = 0;
439
440 offset = start_page->index;
441 offset <<= PAGE_CACHE_SHIFT;
442 offset += p_offset;
443
444 /* get an "empty" pagebuf to manage IO completion
445 * Proper values will be set before returning */
446 pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
447 if (!pb)
448 return -EAGAIN;
449
450 /* Take a reference to the inode to prevent it from
451 * being reclaimed while we have outstanding unwritten
452 * extent IO on it.
453 */
454 if ((igrab(inode)) != inode) {
455 pagebuf_free(pb);
456 return -EAGAIN;
457 }
458
459 /* Set the count to 1 initially, this will stop an I/O
460 * completion callout which happens before we have started
461 * all the I/O from calling pagebuf_iodone too early.
462 */
463 atomic_set(&pb->pb_io_remaining, 1);
464
465 /* First map forwards in the page consecutive buffers
466 * covering this unwritten extent
467 */
468 do {
469 if (!buffer_unwritten(bh))
470 break;
471 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
472 if (!tmp)
473 break;
474 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
475 set_buffer_unwritten_io(bh);
476 bh->b_private = pb;
477 p_offset += bh->b_size;
478 nblocks++;
479 } while ((bh = bh->b_this_page) != head);
480
481 atomic_add(nblocks, &pb->pb_io_remaining);
482
483 /* If we reached the end of the page, map forwards in any
484 * following pages which are also covered by this extent.
485 */
486 if (bh == head) {
487 struct address_space *mapping = inode->i_mapping;
488 pgoff_t tindex, tloff, tlast;
489 unsigned long bs;
490 unsigned int pg_offset, bbits = inode->i_blkbits;
491 struct page *page;
492
493 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
494 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
495 tloff = min(tlast, tloff);
496 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
497 page = xfs_probe_unwritten_page(mapping,
498 tindex, iomapp, pb,
499 PAGE_CACHE_SIZE, &bs, bbits);
500 if (!page)
501 break;
502 nblocks += bs;
503 atomic_add(bs, &pb->pb_io_remaining);
504 xfs_convert_page(inode, page, iomapp, wbc, pb,
505 startio, all_bh);
506 /* stop if converting the next page might add
507 * enough blocks that the corresponding byte
508 * count won't fit in our ulong page buf length */
509 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
510 goto enough;
511 }
512
513 if (tindex == tlast &&
514 (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
515 page = xfs_probe_unwritten_page(mapping,
516 tindex, iomapp, pb,
517 pg_offset, &bs, bbits);
518 if (page) {
519 nblocks += bs;
520 atomic_add(bs, &pb->pb_io_remaining);
521 xfs_convert_page(inode, page, iomapp, wbc, pb,
522 startio, all_bh);
523 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
524 goto enough;
525 }
526 }
527 }
528
529enough:
530 size = nblocks; /* NB: using 64bit number here */
531 size <<= block_bits; /* convert fsb's to byte range */
532
533 XFS_BUF_DATAIO(pb);
534 XFS_BUF_ASYNC(pb);
535 XFS_BUF_SET_SIZE(pb, size);
536 XFS_BUF_SET_COUNT(pb, size);
537 XFS_BUF_SET_OFFSET(pb, offset);
538 XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
539 XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
540
541 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
542 pagebuf_iodone(pb, 1, 1);
543 }
544
545 return 0;
546}
547
548STATIC void
549xfs_submit_page(
550 struct page *page,
551 struct writeback_control *wbc,
552 struct buffer_head *bh_arr[],
553 int bh_count,
554 int probed_page,
555 int clear_dirty)
556{
557 struct buffer_head *bh;
558 int i;
559
560 BUG_ON(PageWriteback(page));
561 set_page_writeback(page);
562 if (clear_dirty)
563 clear_page_dirty(page);
564 unlock_page(page);
565
566 if (bh_count) {
567 for (i = 0; i < bh_count; i++) {
568 bh = bh_arr[i];
569 mark_buffer_async_write(bh);
570 if (buffer_unwritten(bh))
571 set_buffer_unwritten_io(bh);
572 set_buffer_uptodate(bh);
573 clear_buffer_dirty(bh);
574 }
575
576 for (i = 0; i < bh_count; i++)
577 submit_bh(WRITE, bh_arr[i]);
578
579 if (probed_page && clear_dirty)
580 wbc->nr_to_write--; /* Wrote an "extra" page */
581 } else {
582 end_page_writeback(page);
583 wbc->pages_skipped++; /* We didn't write this page */
584 }
585}
586
587/*
588 * Allocate & map buffers for page given the extent map. Write it out.
589 * except for the original page of a writepage, this is called on
590 * delalloc/unwritten pages only, for the original page it is possible
591 * that the page has no mapping at all.
592 */
593STATIC void
594xfs_convert_page(
595 struct inode *inode,
596 struct page *page,
597 xfs_iomap_t *iomapp,
598 struct writeback_control *wbc,
599 void *private,
600 int startio,
601 int all_bh)
602{
603 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
604 xfs_iomap_t *mp = iomapp, *tmp;
605 unsigned long end, offset;
606 pgoff_t end_index;
607 int i = 0, index = 0;
608 int bbits = inode->i_blkbits;
609
610 end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
611 if (page->index < end_index) {
612 end = PAGE_CACHE_SIZE;
613 } else {
614 end = i_size_read(inode) & (PAGE_CACHE_SIZE-1);
615 }
616 bh = head = page_buffers(page);
617 do {
618 offset = i << bbits;
619 if (offset >= end)
620 break;
621 if (!(PageUptodate(page) || buffer_uptodate(bh)))
622 continue;
623 if (buffer_mapped(bh) && all_bh &&
624 !(buffer_unwritten(bh) || buffer_delay(bh))) {
625 if (startio) {
626 lock_buffer(bh);
627 bh_arr[index++] = bh;
628 }
629 continue;
630 }
631 tmp = xfs_offset_to_map(page, mp, offset);
632 if (!tmp)
633 continue;
634 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
635 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
636
637 /* If this is a new unwritten extent buffer (i.e. one
638 * that we haven't passed in private data for, we must
639 * now map this buffer too.
640 */
641 if (buffer_unwritten(bh) && !bh->b_end_io) {
642 ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
643 xfs_map_unwritten(inode, page, head, bh, offset,
644 bbits, tmp, wbc, startio, all_bh);
645 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
646 xfs_map_at_offset(page, bh, offset, bbits, tmp);
647 if (buffer_unwritten(bh)) {
648 set_buffer_unwritten_io(bh);
649 bh->b_private = private;
650 ASSERT(private);
651 }
652 }
653 if (startio) {
654 bh_arr[index++] = bh;
655 } else {
656 set_buffer_dirty(bh);
657 unlock_buffer(bh);
658 mark_buffer_dirty(bh);
659 }
660 } while (i++, (bh = bh->b_this_page) != head);
661
662 if (startio) {
663 xfs_submit_page(page, wbc, bh_arr, index, 1, index == i);
664 } else {
665 unlock_page(page);
666 }
667}
668
669/*
670 * Convert & write out a cluster of pages in the same extent as defined
671 * by mp and following the start page.
672 */
673STATIC void
674xfs_cluster_write(
675 struct inode *inode,
676 pgoff_t tindex,
677 xfs_iomap_t *iomapp,
678 struct writeback_control *wbc,
679 int startio,
680 int all_bh,
681 pgoff_t tlast)
682{
683 struct page *page;
684
685 for (; tindex <= tlast; tindex++) {
686 page = xfs_probe_delalloc_page(inode, tindex);
687 if (!page)
688 break;
689 xfs_convert_page(inode, page, iomapp, wbc, NULL,
690 startio, all_bh);
691 }
692}
693
694/*
695 * Calling this without startio set means we are being asked to make a dirty
696 * page ready for freeing its buffers.  When called with startio set then
697 * we are coming from writepage.
698 *
699 * When called with startio set it is important that we write the WHOLE
700 * page if possible.
701 * The bh->b_state's cannot know if any of the blocks or which block for
702 * that matter are dirty due to mmap writes, and therefore bh uptodate is
703 * only valid if the page itself isn't completely uptodate.  Some layers
704 * may clear the page dirty flag prior to calling writepage, under the
705 * assumption the entire page will be written out; by not writing out the
706 * whole page the page can be reused before all valid dirty data is
707 * written out.  Note: in the case of a page that has been dirtied by
708 * mmap write but only partially set up by block_prepare_write, the
709 * bh->b_state's will not agree and only the ones set up by BPW/BCW will
710 * have valid state; thus the whole page must be written out.
711 */
712
713STATIC int
714xfs_page_state_convert(
715 struct inode *inode,
716 struct page *page,
717 struct writeback_control *wbc,
718 int startio,
719 int unmapped) /* also implies page uptodate */
720{
721 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
722 xfs_iomap_t *iomp, iomap;
723 loff_t offset;
724 unsigned long p_offset = 0;
725 __uint64_t end_offset;
726 pgoff_t end_index, last_index, tlast;
727 int len, err, i, cnt = 0, uptodate = 1;
728 int flags = startio ? 0 : BMAPI_TRYLOCK;
729 int page_dirty, delalloc = 0;
730
731 /* Is this page beyond the end of the file? */
732 offset = i_size_read(inode);
733 end_index = offset >> PAGE_CACHE_SHIFT;
734 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
735 if (page->index >= end_index) {
736 if ((page->index >= end_index + 1) ||
737 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
738 err = -EIO;
739 goto error;
740 }
741 }
742
743 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
744 end_offset = min_t(unsigned long long,
745 offset + PAGE_CACHE_SIZE, i_size_read(inode));
746
747 bh = head = page_buffers(page);
748 iomp = NULL;
749
750 /*
751 * page_dirty is initially a count of buffers on the page and
752 * is decremented as we move each into a cleanable state.
753 */
754 len = bh->b_size;
755 page_dirty = PAGE_CACHE_SIZE / len;
756
757 do {
758 if (offset >= end_offset)
759 break;
760 if (!buffer_uptodate(bh))
761 uptodate = 0;
762 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio)
763 continue;
764
765 if (iomp) {
766 iomp = xfs_offset_to_map(page, &iomap, p_offset);
767 }
768
769 /*
770 * First case, map an unwritten extent and prepare for
771 * extent state conversion transaction on completion.
772 */
773 if (buffer_unwritten(bh)) {
774 if (!startio)
775 continue;
776 if (!iomp) {
777 err = xfs_map_blocks(inode, offset, len, &iomap,
778 BMAPI_READ|BMAPI_IGNSTATE);
779 if (err) {
780 goto error;
781 }
782 iomp = xfs_offset_to_map(page, &iomap,
783 p_offset);
784 }
785 if (iomp) {
786 if (!bh->b_end_io) {
787 err = xfs_map_unwritten(inode, page,
788 head, bh, p_offset,
789 inode->i_blkbits, iomp,
790 wbc, startio, unmapped);
791 if (err) {
792 goto error;
793 }
794 } else {
795 set_bit(BH_Lock, &bh->b_state);
796 }
797 BUG_ON(!buffer_locked(bh));
798 bh_arr[cnt++] = bh;
799 page_dirty--;
800 }
801 /*
802 * Second case, allocate space for a delalloc buffer.
803 * We can return EAGAIN here in the release page case.
804 */
805 } else if (buffer_delay(bh)) {
806 if (!iomp) {
807 delalloc = 1;
808 err = xfs_map_blocks(inode, offset, len, &iomap,
809 BMAPI_ALLOCATE | flags);
810 if (err) {
811 goto error;
812 }
813 iomp = xfs_offset_to_map(page, &iomap,
814 p_offset);
815 }
816 if (iomp) {
817 xfs_map_at_offset(page, bh, p_offset,
818 inode->i_blkbits, iomp);
819 if (startio) {
820 bh_arr[cnt++] = bh;
821 } else {
822 set_buffer_dirty(bh);
823 unlock_buffer(bh);
824 mark_buffer_dirty(bh);
825 }
826 page_dirty--;
827 }
828 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
829 (unmapped || startio)) {
830
831 if (!buffer_mapped(bh)) {
832 int size;
833
834 /*
835 * Getting here implies an unmapped buffer
836 * was found, and we are in a path where we
837 * need to write the whole page out.
838 */
839 if (!iomp) {
840 size = xfs_probe_unmapped_cluster(
841 inode, page, bh, head);
842 err = xfs_map_blocks(inode, offset,
843 size, &iomap,
844 BMAPI_WRITE|BMAPI_MMAP);
845 if (err) {
846 goto error;
847 }
848 iomp = xfs_offset_to_map(page, &iomap,
849 p_offset);
850 }
851 if (iomp) {
852 xfs_map_at_offset(page,
853 bh, p_offset,
854 inode->i_blkbits, iomp);
855 if (startio) {
856 bh_arr[cnt++] = bh;
857 } else {
858 set_buffer_dirty(bh);
859 unlock_buffer(bh);
860 mark_buffer_dirty(bh);
861 }
862 page_dirty--;
863 }
864 } else if (startio) {
865 if (buffer_uptodate(bh) &&
866 !test_and_set_bit(BH_Lock, &bh->b_state)) {
867 bh_arr[cnt++] = bh;
868 page_dirty--;
869 }
870 }
871 }
872 } while (offset += len, p_offset += len,
873 ((bh = bh->b_this_page) != head));
874
875 if (uptodate && bh == head)
876 SetPageUptodate(page);
877
878 if (startio)
879 xfs_submit_page(page, wbc, bh_arr, cnt, 0, 1);
880
881 if (iomp) {
882 tlast = (iomp->iomap_offset + iomp->iomap_bsize - 1) >>
883 PAGE_CACHE_SHIFT;
884 if (delalloc && (tlast > last_index))
885 tlast = last_index;
886 xfs_cluster_write(inode, page->index + 1, iomp, wbc,
887 startio, unmapped, tlast);
888 }
889
890 return page_dirty;
891
892error:
893 for (i = 0; i < cnt; i++) {
894 unlock_buffer(bh_arr[i]);
895 }
896
897 /*
898 * If it's delalloc and we have nowhere to put it,
899 * throw it away, unless the lower layers told
900 * us to try again.
901 */
902 if (err != -EAGAIN) {
903 if (!unmapped) {
904 block_invalidatepage(page, 0);
905 }
906 ClearPageUptodate(page);
907 }
908 return err;
909}
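/*
 * Example of the page_dirty accounting, assuming 4k pages and 512 byte
 * blocks: the count starts at PAGE_CACHE_SIZE / len = 4096 / 512 = 8,
 * one per buffer_head, and is decremented for each buffer moved to a
 * cleanable state.  A return value of 0 therefore tells
 * linvfs_release_page below that every buffer was handled and the
 * page may be freed.
 */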
910
911STATIC int
912__linvfs_get_block(
913 struct inode *inode,
914 sector_t iblock,
915 unsigned long blocks,
916 struct buffer_head *bh_result,
917 int create,
918 int direct,
919 bmapi_flags_t flags)
920{
921 vnode_t *vp = LINVFS_GET_VP(inode);
922 xfs_iomap_t iomap;
923 int retpbbm = 1;
924 int error;
925 ssize_t size;
926 loff_t offset = (loff_t)iblock << inode->i_blkbits;
927
928 if (blocks)
929 size = blocks << inode->i_blkbits;
930 else
931 size = 1 << inode->i_blkbits;
932
933 VOP_BMAP(vp, offset, size,
934 create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
935 if (error)
936 return -error;
937
938 if (retpbbm == 0)
939 return 0;
940
941 if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
942 xfs_daddr_t bn;
943 loff_t delta;
944
945 /* For unwritten extents do not report a disk address on
946 * the read case (treat as if we're reading into a hole).
947 */
948 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
949 delta = offset - iomap.iomap_offset;
950 delta >>= inode->i_blkbits;
951
952 bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
953 bn += delta;
954 BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
955 bh_result->b_blocknr = bn;
956 set_buffer_mapped(bh_result);
957 }
958 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
959 if (direct)
960 bh_result->b_private = inode;
961 set_buffer_unwritten(bh_result);
962 set_buffer_delay(bh_result);
963 }
964 }
965
966 /* If this is a realtime file, data might be on a new device */
967 bh_result->b_bdev = iomap.iomap_target->pbr_bdev;
968
969 /* If we previously allocated a block out beyond eof and
970 * we are now coming back to use it then we will need to
971 * flag it as new even if it has a disk address.
972 */
973 if (create &&
974 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
975 (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) {
976 set_buffer_new(bh_result);
977 }
978
979 if (iomap.iomap_flags & IOMAP_DELAY) {
980 BUG_ON(direct);
981 if (create) {
982 set_buffer_uptodate(bh_result);
983 set_buffer_mapped(bh_result);
984 set_buffer_delay(bh_result);
985 }
986 }
987
988 if (blocks) {
989 bh_result->b_size = (ssize_t)min(
990 (loff_t)(iomap.iomap_bsize - iomap.iomap_delta),
991 (loff_t)(blocks << inode->i_blkbits));
992 }
993
994 return 0;
995}
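/*
 * Worked example of the block number mapping above: iomap_bn is in
 * 512 byte basic blocks (BBSHIFT == 9).  Assuming 4k filesystem blocks
 * (i_blkbits == 12), iomap_bn = 80, iomap_offset = 0x4000 and
 * offset = 0x6000:
 *
 *	delta = (0x6000 - 0x4000) >> 12 = 2
 *	bn    = (80 >> 3) + 2 = 12
 *
 * so b_blocknr is expressed in filesystem blocks, not basic blocks.
 */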
996
997int
998linvfs_get_block(
999 struct inode *inode,
1000 sector_t iblock,
1001 struct buffer_head *bh_result,
1002 int create)
1003{
1004 return __linvfs_get_block(inode, iblock, 0, bh_result,
1005 create, 0, BMAPI_WRITE);
1006}
1007
1008STATIC int
1009linvfs_get_blocks_direct(
1010 struct inode *inode,
1011 sector_t iblock,
1012 unsigned long max_blocks,
1013 struct buffer_head *bh_result,
1014 int create)
1015{
1016 return __linvfs_get_block(inode, iblock, max_blocks, bh_result,
1017 create, 1, BMAPI_WRITE|BMAPI_DIRECT);
1018}
1019
1020STATIC ssize_t
1021linvfs_direct_IO(
1022 int rw,
1023 struct kiocb *iocb,
1024 const struct iovec *iov,
1025 loff_t offset,
1026 unsigned long nr_segs)
1027{
1028 struct file *file = iocb->ki_filp;
1029 struct inode *inode = file->f_mapping->host;
1030 vnode_t *vp = LINVFS_GET_VP(inode);
1031 xfs_iomap_t iomap;
1032 int maps = 1;
1033 int error;
1034
1035 VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
1036 if (error)
1037 return -error;
1038
1039 return blockdev_direct_IO_own_locking(rw, iocb, inode,
1040 iomap.iomap_target->pbr_bdev,
1041 iov, offset, nr_segs,
1042 linvfs_get_blocks_direct,
1043 linvfs_unwritten_convert_direct);
1044}
1045
1046
1047STATIC sector_t
1048linvfs_bmap(
1049 struct address_space *mapping,
1050 sector_t block)
1051{
1052 struct inode *inode = (struct inode *)mapping->host;
1053 vnode_t *vp = LINVFS_GET_VP(inode);
1054 int error;
1055
1056 vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
1057
1058 VOP_RWLOCK(vp, VRWLOCK_READ);
1059 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
1060 VOP_RWUNLOCK(vp, VRWLOCK_READ);
1061 return generic_block_bmap(mapping, block, linvfs_get_block);
1062}
1063
1064STATIC int
1065linvfs_readpage(
1066 struct file *unused,
1067 struct page *page)
1068{
1069 return mpage_readpage(page, linvfs_get_block);
1070}
1071
1072STATIC int
1073linvfs_readpages(
1074 struct file *unused,
1075 struct address_space *mapping,
1076 struct list_head *pages,
1077 unsigned nr_pages)
1078{
1079 return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
1080}
1081
1082STATIC void
1083xfs_count_page_state(
1084 struct page *page,
1085 int *delalloc,
1086 int *unmapped,
1087 int *unwritten)
1088{
1089 struct buffer_head *bh, *head;
1090
1091 *delalloc = *unmapped = *unwritten = 0;
1092
1093 bh = head = page_buffers(page);
1094 do {
1095 if (buffer_uptodate(bh) && !buffer_mapped(bh))
1096 (*unmapped) = 1;
1097 else if (buffer_unwritten(bh) && !buffer_delay(bh))
1098 clear_buffer_unwritten(bh);
1099 else if (buffer_unwritten(bh))
1100 (*unwritten) = 1;
1101 else if (buffer_delay(bh))
1102 (*delalloc) = 1;
1103 } while ((bh = bh->b_this_page) != head);
1104}
1105
1106
1107/*
1108 * writepage: Called from one of two places:
1109 *
1110 * 1. we are flushing a delalloc buffer head.
1111 *
1112 * 2. we are writing out a dirty page. Typically the page dirty
1113 * state is cleared before we get here. In this case it is
1114 * conceivable we have no buffer heads.
1115 *
1116 * For delalloc space on the page we need to allocate space and
1117 * flush it. For unmapped buffer heads on the page we should
1118 * allocate space if the page is uptodate. For any other dirty
1119 * buffer heads on the page we should flush them.
1120 *
1121 * If we detect that a transaction would be required to flush
1122 * the page, we have to check the process flags first, if we
1123 * are already in a transaction or disk I/O during allocations
1124 * is off, we need to fail the writepage and redirty the page.
1125 */
1126
1127STATIC int
1128linvfs_writepage(
1129 struct page *page,
1130 struct writeback_control *wbc)
1131{
1132 int error;
1133 int need_trans;
1134 int delalloc, unmapped, unwritten;
1135 struct inode *inode = page->mapping->host;
1136
1137 xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1138
1139 /*
1140 * We need a transaction if:
1141 * 1. There are delalloc buffers on the page
1142 * 2. The page is uptodate and we have unmapped buffers
1143 * 3. The page is uptodate and we have no buffers
1144 * 4. There are unwritten buffers on the page
1145 */
1146
1147 if (!page_has_buffers(page)) {
1148 unmapped = 1;
1149 need_trans = 1;
1150 } else {
1151 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1152 if (!PageUptodate(page))
1153 unmapped = 0;
1154 need_trans = delalloc + unmapped + unwritten;
1155 }
1156
1157 /*
1158 * If we need a transaction and the process flags say
1159 * we are already in a transaction, or no IO is allowed
1160 * then mark the page dirty again and leave the page
1161 * as is.
1162 */
1163 if (PFLAGS_TEST_FSTRANS() && need_trans)
1164 goto out_fail;
1165
1166 /*
1167 * Delay hooking up buffer heads until we have
1168 * made our go/no-go decision.
1169 */
1170 if (!page_has_buffers(page))
1171 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1172
1173 /*
1174 * Convert delayed allocate, unwritten or unmapped space
1175 * to real space and flush out to disk.
1176 */
1177 error = xfs_page_state_convert(inode, page, wbc, 1, unmapped);
1178 if (error == -EAGAIN)
1179 goto out_fail;
1180 if (unlikely(error < 0))
1181 goto out_unlock;
1182
1183 return 0;
1184
1185out_fail:
1186 redirty_page_for_writepage(wbc, page);
1187 unlock_page(page);
1188 return 0;
1189out_unlock:
1190 unlock_page(page);
1191 return error;
1192}
1193
1194/*
1195 * Called to move a page into cleanable state - and from there
1196 * to be released. Possibly the page is already clean. We always
1197 * have buffer heads in this call.
1198 *
1199 * Returns 0 if the page is ok to release, 1 otherwise.
1200 *
1201 * Possible scenarios are:
1202 *
1203 * 1. We are being called to release a page which has been written
1204 * to via regular I/O.  Buffer heads will be dirty and possibly
1205 * delalloc.  If there are no delalloc buffer heads in this case
1206 * then we can just return zero.
1207 *
1208 * 2. We are called to release a page which has been written via
1209 * mmap; all we need to do is ensure there is no delalloc
1210 * state in the buffer heads.  If there is none we can let the
1211 * caller free them, and we should come back later via writepage.
1212 */
1213STATIC int
1214linvfs_release_page(
1215 struct page *page,
1216 int gfp_mask)
1217{
1218 struct inode *inode = page->mapping->host;
1219 int dirty, delalloc, unmapped, unwritten;
1220 struct writeback_control wbc = {
1221 .sync_mode = WB_SYNC_ALL,
1222 .nr_to_write = 1,
1223 };
1224
1225 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
1226
1227 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1228 if (!delalloc && !unwritten)
1229 goto free_buffers;
1230
1231 if (!(gfp_mask & __GFP_FS))
1232 return 0;
1233
1234 /* If we are already inside a transaction or the thread cannot
1235 * do I/O, we cannot release this page.
1236 */
1237 if (PFLAGS_TEST_FSTRANS())
1238 return 0;
1239
1240 /*
1241 * Convert delalloc space to real space, do not flush the
1242 * data out to disk, that will be done by the caller.
1243 * Never need to allocate space here - we will always
1244 * come back to writepage in that case.
1245 */
1246 dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0);
1247 if (dirty == 0 && !unwritten)
1248 goto free_buffers;
1249 return 0;
1250
1251free_buffers:
1252 return try_to_free_buffers(page);
1253}
1254
1255STATIC int
1256linvfs_prepare_write(
1257 struct file *file,
1258 struct page *page,
1259 unsigned int from,
1260 unsigned int to)
1261{
1262 return block_prepare_write(page, from, to, linvfs_get_block);
1263}
1264
1265struct address_space_operations linvfs_aops = {
1266 .readpage = linvfs_readpage,
1267 .readpages = linvfs_readpages,
1268 .writepage = linvfs_writepage,
1269 .sync_page = block_sync_page,
1270 .releasepage = linvfs_release_page,
1271 .prepare_write = linvfs_prepare_write,
1272 .commit_write = generic_commit_write,
1273 .bmap = linvfs_bmap,
1274 .direct_IO = linvfs_direct_IO,
1275};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
new file mode 100644
index 000000000000..23e0eb67fc25
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -0,0 +1,1980 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * The xfs_buf.c code provides an abstract buffer cache model on top
35 * of the Linux page cache. Cached metadata blocks for a file system
36 * are hashed to the inode for the block device. xfs_buf.c assembles
37 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
38 *
39 * Written by Steve Lord, Jim Mostek, Russell Cattelan
40 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
41 *
42 */
43
44#include <linux/stddef.h>
45#include <linux/errno.h>
46#include <linux/slab.h>
47#include <linux/pagemap.h>
48#include <linux/init.h>
49#include <linux/vmalloc.h>
50#include <linux/bio.h>
51#include <linux/sysctl.h>
52#include <linux/proc_fs.h>
53#include <linux/workqueue.h>
54#include <linux/percpu.h>
55#include <linux/blkdev.h>
56#include <linux/hash.h>
57
58#include "xfs_linux.h"
59
60/*
61 * File wide globals
62 */
63
64STATIC kmem_cache_t *pagebuf_cache;
65STATIC kmem_shaker_t pagebuf_shake;
66STATIC int pagebuf_daemon_wakeup(int, unsigned int);
67STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
68STATIC struct workqueue_struct *pagebuf_logio_workqueue;
69STATIC struct workqueue_struct *pagebuf_dataio_workqueue;
70
71/*
72 * Pagebuf debugging
73 */
74
75#ifdef PAGEBUF_TRACE
76void
77pagebuf_trace(
78 xfs_buf_t *pb,
79 char *id,
80 void *data,
81 void *ra)
82{
83 ktrace_enter(pagebuf_trace_buf,
84 pb, id,
85 (void *)(unsigned long)pb->pb_flags,
86 (void *)(unsigned long)pb->pb_hold.counter,
87 (void *)(unsigned long)pb->pb_sema.count.counter,
88 (void *)current,
89 data, ra,
90 (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
91 (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
92 (void *)(unsigned long)pb->pb_buffer_length,
93 NULL, NULL, NULL, NULL, NULL);
94}
95ktrace_t *pagebuf_trace_buf;
96#define PAGEBUF_TRACE_SIZE 4096
97#define PB_TRACE(pb, id, data) \
98 pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
99#else
100#define PB_TRACE(pb, id, data) do { } while (0)
101#endif
102
103#ifdef PAGEBUF_LOCK_TRACKING
104# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
105# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
106# define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
107#else
108# define PB_SET_OWNER(pb) do { } while (0)
109# define PB_CLEAR_OWNER(pb) do { } while (0)
110# define PB_GET_OWNER(pb) do { } while (0)
111#endif
112
113/*
114 * Pagebuf allocation / freeing.
115 */
116
117#define pb_to_gfp(flags) \
118 ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \
119 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
120
121#define pb_to_km(flags) \
122 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
123
124
125#define pagebuf_allocate(flags) \
126 kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
127#define pagebuf_deallocate(pb) \
128 kmem_zone_free(pagebuf_cache, (pb));
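/*
 * How the mappings above expand, case by case:
 *
 *	pb_to_gfp(PBF_READ_AHEAD)	__GFP_NORETRY | __GFP_NOWARN
 *	pb_to_gfp(PBF_DONT_BLOCK)	GFP_NOFS | __GFP_NOWARN
 *	pb_to_gfp(0)			GFP_KERNEL | __GFP_NOWARN
 *	pb_to_km(PBF_DONT_BLOCK)	KM_NOFS
 *	pb_to_km(0)			KM_SLEEP
 *
 * Readahead allocations are allowed to fail quietly; PBF_DONT_BLOCK
 * avoids recursing into the filesystem from the allocator while
 * locks are held.
 */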
129
130/*
131 * Page Region interfaces.
132 *
133 * For pages in filesystems where the blocksize is smaller than the
134 * pagesize, we use the page->private field (long) to hold a bitmap
135 * of uptodate regions within the page.
136 *
137 * Each such region is "bytes per page / bits per long" bytes long.
138 *
139 * NBPPR == number-of-bytes-per-page-region
140 * BTOPR == bytes-to-page-region (rounded up)
141 * BTOPRT == bytes-to-page-region-truncated (rounded down)
142 */
143#if (BITS_PER_LONG == 32)
144#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
145#elif (BITS_PER_LONG == 64)
146#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
147#else
148#error BITS_PER_LONG must be 32 or 64
149#endif
150#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
151#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
152#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
153
154STATIC unsigned long
155page_region_mask(
156 size_t offset,
157 size_t length)
158{
159 unsigned long mask;
160 int first, final;
161
162 first = BTOPR(offset);
163 final = BTOPRT(offset + length - 1);
164 first = min(first, final);
165
166 mask = ~0UL;
167 mask <<= BITS_PER_LONG - (final - first + 1);
168 mask >>= BITS_PER_LONG - (final + 1);
169
170 ASSERT(offset + length <= PAGE_CACHE_SIZE);
171 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
172
173 return mask;
174}
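/*
 * Worked example, assuming 4k pages and 64 bit longs (64 regions of
 * 64 bytes each): for offset = 512, length = 1024,
 *
 *	first = BTOPR(512)   = (512 + 63) >> 6 = 8
 *	final = BTOPRT(1535) = 1535 >> 6       = 23
 *
 * and the returned mask has bits 8..23 set.  A full page (offset 0,
 * length 4096) yields ~0UL, which is what set_page_region below
 * compares against before calling SetPageUptodate.
 */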
175
176STATIC inline void
177set_page_region(
178 struct page *page,
179 size_t offset,
180 size_t length)
181{
182 page->private |= page_region_mask(offset, length);
183 if (page->private == ~0UL)
184 SetPageUptodate(page);
185}
186
187STATIC inline int
188test_page_region(
189 struct page *page,
190 size_t offset,
191 size_t length)
192{
193 unsigned long mask = page_region_mask(offset, length);
194
195 return (mask && (page->private & mask) == mask);
196}
197
198/*
199 * Mapping of multi-page buffers into contiguous virtual space
200 */
201
202typedef struct a_list {
203 void *vm_addr;
204 struct a_list *next;
205} a_list_t;
206
207STATIC a_list_t *as_free_head;
208STATIC int as_list_len;
209STATIC DEFINE_SPINLOCK(as_lock);
210
211/*
212 * Try to batch vunmaps because they are costly.
213 */
214STATIC void
215free_address(
216 void *addr)
217{
218 a_list_t *aentry;
219
220 aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
221 if (likely(aentry)) {
222 spin_lock(&as_lock);
223 aentry->next = as_free_head;
224 aentry->vm_addr = addr;
225 as_free_head = aentry;
226 as_list_len++;
227 spin_unlock(&as_lock);
228 } else {
229 vunmap(addr);
230 }
231}
232
233STATIC void
234purge_addresses(void)
235{
236 a_list_t *aentry, *old;
237
238 if (as_free_head == NULL)
239 return;
240
241 spin_lock(&as_lock);
242 aentry = as_free_head;
243 as_free_head = NULL;
244 as_list_len = 0;
245 spin_unlock(&as_lock);
246
247 while ((old = aentry) != NULL) {
248 vunmap(aentry->vm_addr);
249 aentry = aentry->next;
250 kfree(old);
251 }
252}
253
254/*
255 * Internal pagebuf object manipulation
256 */
257
258STATIC void
259_pagebuf_initialize(
260 xfs_buf_t *pb,
261 xfs_buftarg_t *target,
262 loff_t range_base,
263 size_t range_length,
264 page_buf_flags_t flags)
265{
266 /*
267 * We don't want certain flags to appear in pb->pb_flags.
268 */
269 flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
270
271 memset(pb, 0, sizeof(xfs_buf_t));
272 atomic_set(&pb->pb_hold, 1);
273 init_MUTEX_LOCKED(&pb->pb_iodonesema);
274 INIT_LIST_HEAD(&pb->pb_list);
275 INIT_LIST_HEAD(&pb->pb_hash_list);
276 init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
277 PB_SET_OWNER(pb);
278 pb->pb_target = target;
279 pb->pb_file_offset = range_base;
280 /*
281 * Set buffer_length and count_desired to the same value initially.
282 * I/O routines should use count_desired, which will be the same in
283 * most cases but may be reset (e.g. XFS recovery).
284 */
285 pb->pb_buffer_length = pb->pb_count_desired = range_length;
286 pb->pb_flags = flags | PBF_NONE;
287 pb->pb_bn = XFS_BUF_DADDR_NULL;
288 atomic_set(&pb->pb_pin_count, 0);
289 init_waitqueue_head(&pb->pb_waiters);
290
291 XFS_STATS_INC(pb_create);
292 PB_TRACE(pb, "initialize", target);
293}
294
295/*
296 * Allocate a page array capable of holding a specified number
297 * of pages, and point the page buf at it.
298 */
299STATIC int
300_pagebuf_get_pages(
301 xfs_buf_t *pb,
302 int page_count,
303 page_buf_flags_t flags)
304{
305 /* Make sure that we have a page list */
306 if (pb->pb_pages == NULL) {
307 pb->pb_offset = page_buf_poff(pb->pb_file_offset);
308 pb->pb_page_count = page_count;
309 if (page_count <= PB_PAGES) {
310 pb->pb_pages = pb->pb_page_array;
311 } else {
312 pb->pb_pages = kmem_alloc(sizeof(struct page *) *
313 page_count, pb_to_km(flags));
314 if (pb->pb_pages == NULL)
315 return -ENOMEM;
316 }
317 memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
318 }
319 return 0;
320}
321
322/*
323 * Frees pb_pages if it was malloced.
324 */
325STATIC void
326_pagebuf_free_pages(
327 xfs_buf_t *bp)
328{
329 if (bp->pb_pages != bp->pb_page_array) {
330 kmem_free(bp->pb_pages,
331 bp->pb_page_count * sizeof(struct page *));
332 }
333}
334
335/*
336 * Releases the specified buffer.
337 *
338 * The modification state of any associated pages is left unchanged.
339 * The buffer must not be on any hash - use pagebuf_rele instead for
340 * hashed and refcounted buffers.
341 */
342void
343pagebuf_free(
344 xfs_buf_t *bp)
345{
346 PB_TRACE(bp, "free", 0);
347
348 ASSERT(list_empty(&bp->pb_hash_list));
349
350 if (bp->pb_flags & _PBF_PAGE_CACHE) {
351 uint i;
352
353 if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
354 free_address(bp->pb_addr - bp->pb_offset);
355
356 for (i = 0; i < bp->pb_page_count; i++)
357 page_cache_release(bp->pb_pages[i]);
358 _pagebuf_free_pages(bp);
359 } else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
360 /*
361 * XXX(hch): bp->pb_count_desired might be incorrect (see
362 * pagebuf_associate_memory for details), but fortunately
363 * the Linux version of kmem_free ignores the len argument..
364 */
365 kmem_free(bp->pb_addr, bp->pb_count_desired);
366 _pagebuf_free_pages(bp);
367 }
368
369 pagebuf_deallocate(bp);
370}
371
372/*
373 * Finds all pages for buffer in question and builds its page list.
374 */
375STATIC int
376_pagebuf_lookup_pages(
377 xfs_buf_t *bp,
378 uint flags)
379{
380 struct address_space *mapping = bp->pb_target->pbr_mapping;
381 size_t blocksize = bp->pb_target->pbr_bsize;
382 size_t size = bp->pb_count_desired;
383 size_t nbytes, offset;
384 int gfp_mask = pb_to_gfp(flags);
385 unsigned short page_count, i;
386 pgoff_t first;
387 loff_t end;
388 int error;
389
390 end = bp->pb_file_offset + bp->pb_buffer_length;
391 page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
392
393 error = _pagebuf_get_pages(bp, page_count, flags);
394 if (unlikely(error))
395 return error;
396 bp->pb_flags |= _PBF_PAGE_CACHE;
397
398 offset = bp->pb_offset;
399 first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
400
401 for (i = 0; i < bp->pb_page_count; i++) {
402 struct page *page;
403 uint retries = 0;
404
405 retry:
406 page = find_or_create_page(mapping, first + i, gfp_mask);
407 if (unlikely(page == NULL)) {
408 if (flags & PBF_READ_AHEAD) {
409 bp->pb_page_count = i;
410 for (i = 0; i < bp->pb_page_count; i++)
411 unlock_page(bp->pb_pages[i]);
412 return -ENOMEM;
413 }
414
415 /*
416 * This could deadlock.
417 *
418 * But until all the XFS lowlevel code is revamped to
419 * handle buffer allocation failures we can't do much.
420 */
421 if (!(++retries % 100))
422 printk(KERN_ERR
423 "XFS: possible memory allocation "
424 "deadlock in %s (mode:0x%x)\n",
425 __FUNCTION__, gfp_mask);
426
427 XFS_STATS_INC(pb_page_retries);
428 pagebuf_daemon_wakeup(0, gfp_mask);
429 blk_congestion_wait(WRITE, HZ/50);
430 goto retry;
431 }
432
433 XFS_STATS_INC(pb_page_found);
434
435 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
436 size -= nbytes;
437
438 if (!PageUptodate(page)) {
439 page_count--;
440 if (blocksize >= PAGE_CACHE_SIZE) {
441 if (flags & PBF_READ)
442 bp->pb_locked = 1;
443 } else if (!PagePrivate(page)) {
444 if (test_page_region(page, offset, nbytes))
445 page_count++;
446 }
447 }
448
449 bp->pb_pages[i] = page;
450 offset = 0;
451 }
452
453 if (!bp->pb_locked) {
454 for (i = 0; i < bp->pb_page_count; i++)
455 unlock_page(bp->pb_pages[i]);
456 }
457
458 if (page_count) {
459 /* if we have any uptodate pages, mark that in the buffer */
460 bp->pb_flags &= ~PBF_NONE;
461
462 /* if some pages aren't uptodate, mark that in the buffer */
463 if (page_count != bp->pb_page_count)
464 bp->pb_flags |= PBF_PARTIAL;
465 }
466
467 PB_TRACE(bp, "lookup_pages", (long)page_count);
468 return error;
469}
470
471/*
472 * Map buffer into kernel address-space if necessary.
473 */
474STATIC int
475_pagebuf_map_pages(
476 xfs_buf_t *bp,
477 uint flags)
478{
479 /* A single page buffer is always mappable */
480 if (bp->pb_page_count == 1) {
481 bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
482 bp->pb_flags |= PBF_MAPPED;
483 } else if (flags & PBF_MAPPED) {
484 if (as_list_len > 64)
485 purge_addresses();
486 bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
487 VM_MAP, PAGE_KERNEL);
488 if (unlikely(bp->pb_addr == NULL))
489 return -ENOMEM;
490 bp->pb_addr += bp->pb_offset;
491 bp->pb_flags |= PBF_MAPPED;
492 }
493
494 return 0;
495}
496
497/*
498 * Finding and Reading Buffers
499 */
500
501/*
502 * _pagebuf_find
503 *
504 * Looks up, and creates if absent, a lockable buffer for
505 * a given range of an inode. The buffer is returned
506 * locked. If other overlapping buffers exist, they are
507 * released before the new buffer is created and locked,
508 * which may imply that this call will block until those buffers
509 * are unlocked. No I/O is implied by this call.
510 */
511xfs_buf_t *
512_pagebuf_find(
513 xfs_buftarg_t *btp, /* block device target */
514 loff_t ioff, /* starting offset of range */
515 size_t isize, /* length of range */
516 page_buf_flags_t flags, /* PBF_TRYLOCK */
517 xfs_buf_t *new_pb)/* newly allocated buffer */
518{
519 loff_t range_base;
520 size_t range_length;
521 xfs_bufhash_t *hash;
522 xfs_buf_t *pb, *n;
523
524 range_base = (ioff << BBSHIFT);
525 range_length = (isize << BBSHIFT);
526
527 /* Check for IOs smaller than the sector size / not sector aligned */
528 ASSERT(!(range_length < (1 << btp->pbr_sshift)));
529 ASSERT(!(range_base & (loff_t)btp->pbr_smask));
530
531 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
532
533 spin_lock(&hash->bh_lock);
534
535 list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
536 ASSERT(btp == pb->pb_target);
537 if (pb->pb_file_offset == range_base &&
538 pb->pb_buffer_length == range_length) {
539 /*
540 * If we look at something, bring it to the
541 * front of the list for next time.
542 */
543 atomic_inc(&pb->pb_hold);
544 list_move(&pb->pb_hash_list, &hash->bh_list);
545 goto found;
546 }
547 }
548
549 /* No match found */
550 if (new_pb) {
551 _pagebuf_initialize(new_pb, btp, range_base,
552 range_length, flags);
553 new_pb->pb_hash = hash;
554 list_add(&new_pb->pb_hash_list, &hash->bh_list);
555 } else {
556 XFS_STATS_INC(pb_miss_locked);
557 }
558
559 spin_unlock(&hash->bh_lock);
560 return new_pb;
561
562found:
563 spin_unlock(&hash->bh_lock);
564
565 /* Attempt to get the semaphore without sleeping,
566 * if this does not work then we need to drop the
567 * spinlock and do a hard attempt on the semaphore.
568 */
569 if (down_trylock(&pb->pb_sema)) {
570 if (!(flags & PBF_TRYLOCK)) {
571 /* wait for buffer ownership */
572 PB_TRACE(pb, "get_lock", 0);
573 pagebuf_lock(pb);
574 XFS_STATS_INC(pb_get_locked_waited);
575 } else {
576 /* We asked for a trylock and failed; no need
577 * to look at file offset and length here - we
578 * know that this pagebuf at least overlaps our
579 * pagebuf and is locked, therefore our buffer
580 * either does not exist or is this buffer.
581 */
582
583 pagebuf_rele(pb);
584 XFS_STATS_INC(pb_busy_locked);
585 return (NULL);
586 }
587 } else {
588 /* trylock worked */
589 PB_SET_OWNER(pb);
590 }
591
592 if (pb->pb_flags & PBF_STALE)
593 pb->pb_flags &= PBF_MAPPED;
594 PB_TRACE(pb, "got_lock", 0);
595 XFS_STATS_INC(pb_get_locked);
596 return (pb);
597}
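/*
 * Bucket selection example: for the data device the table (see
 * xfs_alloc_bufhash below) has 256 buckets, so a buffer starting at
 * basic block ioff hashes to
 *
 *	btp->bt_hash[hash_long((unsigned long)ioff, 8)]
 *
 * and only that bucket's spinlock is taken during the lookup, keeping
 * contention between concurrent lookups low.
 */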
598
599/*
600 * xfs_buf_get_flags assembles a buffer covering the specified range.
601 *
602 * Storage in memory for all portions of the buffer will be allocated,
603 * although backing storage may not be.
604 */
605xfs_buf_t *
606xfs_buf_get_flags( /* allocate a buffer */
607 xfs_buftarg_t *target,/* target for buffer */
608 loff_t ioff, /* starting offset of range */
609 size_t isize, /* length of range */
610 page_buf_flags_t flags) /* PBF_TRYLOCK */
611{
612 xfs_buf_t *pb, *new_pb;
613 int error = 0, i;
614
615 new_pb = pagebuf_allocate(flags);
616 if (unlikely(!new_pb))
617 return NULL;
618
619 pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
620 if (pb == new_pb) {
621 error = _pagebuf_lookup_pages(pb, flags);
622 if (error)
623 goto no_buffer;
624 } else {
625 pagebuf_deallocate(new_pb);
626 if (unlikely(pb == NULL))
627 return NULL;
628 }
629
630 for (i = 0; i < pb->pb_page_count; i++)
631 mark_page_accessed(pb->pb_pages[i]);
632
633 if (!(pb->pb_flags & PBF_MAPPED)) {
634 error = _pagebuf_map_pages(pb, flags);
635 if (unlikely(error)) {
636 printk(KERN_WARNING "%s: failed to map pages\n",
637 __FUNCTION__);
638 goto no_buffer;
639 }
640 }
641
642 XFS_STATS_INC(pb_get);
643
644 /*
645 * Always fill in the block number now, the mapped cases can do
646 * their own overlay of this later.
647 */
648 pb->pb_bn = ioff;
649 pb->pb_count_desired = pb->pb_buffer_length;
650
651 PB_TRACE(pb, "get", (unsigned long)flags);
652 return pb;
653
654 no_buffer:
655 if (flags & (PBF_LOCK | PBF_TRYLOCK))
656 pagebuf_unlock(pb);
657 pagebuf_rele(pb);
658 return NULL;
659}
660
661xfs_buf_t *
662xfs_buf_read_flags(
663 xfs_buftarg_t *target,
664 loff_t ioff,
665 size_t isize,
666 page_buf_flags_t flags)
667{
668 xfs_buf_t *pb;
669
670 flags |= PBF_READ;
671
672 pb = xfs_buf_get_flags(target, ioff, isize, flags);
673 if (pb) {
674 if (PBF_NOT_DONE(pb)) {
675 PB_TRACE(pb, "read", (unsigned long)flags);
676 XFS_STATS_INC(pb_get_read);
677 pagebuf_iostart(pb, flags);
678 } else if (flags & PBF_ASYNC) {
679 PB_TRACE(pb, "read_async", (unsigned long)flags);
680 /*
681 * Read ahead call which is already satisfied,
682 * drop the buffer
683 */
684 goto no_buffer;
685 } else {
686 PB_TRACE(pb, "read_done", (unsigned long)flags);
687 /* We do not want read in the flags */
688 pb->pb_flags &= ~PBF_READ;
689 }
690 }
691
692 return pb;
693
694 no_buffer:
695 if (flags & (PBF_LOCK | PBF_TRYLOCK))
696 pagebuf_unlock(pb);
697 pagebuf_rele(pb);
698 return NULL;
699}
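/*
 * Typical synchronous read usage, sketched with made-up offset and
 * length values:
 *
 *	bp = xfs_buf_read_flags(target, ioff, isize, PBF_LOCK);
 *	if (bp) {
 *		if (!bp->pb_error)
 *			... data is at pagebuf_offset(bp, 0) ...
 *		pagebuf_unlock(bp);
 *		pagebuf_rele(bp);
 *	}
 *
 * Without PBF_ASYNC the read completes before the call returns.
 */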
700
701/*
702 * Create a skeletal pagebuf (no pages associated with it).
703 */
704xfs_buf_t *
705pagebuf_lookup(
706 xfs_buftarg_t *target,
707 loff_t ioff,
708 size_t isize,
709 page_buf_flags_t flags)
710{
711 xfs_buf_t *pb;
712
713 pb = pagebuf_allocate(flags);
714 if (pb) {
715 _pagebuf_initialize(pb, target, ioff, isize, flags);
716 }
717 return pb;
718}
719
720/*
721 * If we are not low on memory then do the readahead in a deadlock
722 * safe manner.
723 */
724void
725pagebuf_readahead(
726 xfs_buftarg_t *target,
727 loff_t ioff,
728 size_t isize,
729 page_buf_flags_t flags)
730{
731 struct backing_dev_info *bdi;
732
733 bdi = target->pbr_mapping->backing_dev_info;
734 if (bdi_read_congested(bdi))
735 return;
736
737 flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
738 xfs_buf_read_flags(target, ioff, isize, flags);
739}
740
741xfs_buf_t *
742pagebuf_get_empty(
743 size_t len,
744 xfs_buftarg_t *target)
745{
746 xfs_buf_t *pb;
747
748 pb = pagebuf_allocate(0);
749 if (pb)
750 _pagebuf_initialize(pb, target, 0, len, 0);
751 return pb;
752}
753
754static inline struct page *
755mem_to_page(
756 void *addr)
757{
758 if (((unsigned long)addr < VMALLOC_START) ||
759 ((unsigned long)addr >= VMALLOC_END)) {
760 return virt_to_page(addr);
761 } else {
762 return vmalloc_to_page(addr);
763 }
764}
765
766int
767pagebuf_associate_memory(
768 xfs_buf_t *pb,
769 void *mem,
770 size_t len)
771{
772 int rval;
773 int i = 0;
774 size_t ptr;
775 size_t end, end_cur;
776 off_t offset;
777 int page_count;
778
779 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
780 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
781 if (offset && (len > PAGE_CACHE_SIZE))
782 page_count++;
783
784 /* Free any previous set of page pointers */
785 if (pb->pb_pages)
786 _pagebuf_free_pages(pb);
787
788 pb->pb_pages = NULL;
789 pb->pb_addr = mem;
790
791 rval = _pagebuf_get_pages(pb, page_count, 0);
792 if (rval)
793 return rval;
794
795 pb->pb_offset = offset;
796 ptr = (size_t) mem & PAGE_CACHE_MASK;
797 end = PAGE_CACHE_ALIGN((size_t) mem + len);
798 end_cur = end;
799 /* set up first page */
800 pb->pb_pages[0] = mem_to_page(mem);
801
802 ptr += PAGE_CACHE_SIZE;
803 pb->pb_page_count = ++i;
804 while (ptr < end) {
805 pb->pb_pages[i] = mem_to_page((void *)ptr);
806 pb->pb_page_count = ++i;
807 ptr += PAGE_CACHE_SIZE;
808 }
809 pb->pb_locked = 0;
810
811 pb->pb_count_desired = pb->pb_buffer_length = len;
812 pb->pb_flags |= PBF_MAPPED;
813
814 return 0;
815}
816
817xfs_buf_t *
818pagebuf_get_no_daddr(
819 size_t len,
820 xfs_buftarg_t *target)
821{
822 size_t malloc_len = len;
823 xfs_buf_t *bp;
824 void *data;
825 int error;
826
827 bp = pagebuf_allocate(0);
828 if (unlikely(bp == NULL))
829 goto fail;
830 _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
831
832 try_again:
833 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
834 if (unlikely(data == NULL))
835 goto fail_free_buf;
836
837 /* check whether alignment matches.. */
838 if ((__psunsigned_t)data !=
839 ((__psunsigned_t)data & ~target->pbr_smask)) {
840 /* .. else double the size and try again */
841 kmem_free(data, malloc_len);
842 malloc_len <<= 1;
843 goto try_again;
844 }
845
846 error = pagebuf_associate_memory(bp, data, len);
847 if (error)
848 goto fail_free_mem;
849 bp->pb_flags |= _PBF_KMEM_ALLOC;
850
851 pagebuf_unlock(bp);
852
853 PB_TRACE(bp, "no_daddr", data);
854 return bp;
855 fail_free_mem:
856 kmem_free(data, malloc_len);
857 fail_free_buf:
858 pagebuf_free(bp);
859 fail:
860 return NULL;
861}
862
863/*
864 * pagebuf_hold
865 *
866 * Increment reference count on buffer, to hold the buffer concurrently
867 * with another thread which may release (free) the buffer asynchronously.
868 *
869 * Must hold the buffer already to call this function.
870 */
871void
872pagebuf_hold(
873 xfs_buf_t *pb)
874{
875 atomic_inc(&pb->pb_hold);
876 PB_TRACE(pb, "hold", 0);
877}
878
879/*
880 * pagebuf_rele
881 *
882 * pagebuf_rele releases a hold on the specified buffer.  If the
883 * hold count is 1, pagebuf_rele calls pagebuf_free.
884 */
885void
886pagebuf_rele(
887 xfs_buf_t *pb)
888{
889 xfs_bufhash_t *hash = pb->pb_hash;
890
891 PB_TRACE(pb, "rele", pb->pb_relse);
892
893 /*
894 * pagebuf_lookup buffers are not hashed, not delayed write,
895 * and don't have their own release routines. Special case.
896 */
897 if (unlikely(!hash)) {
898 ASSERT(!pb->pb_relse);
899 if (atomic_dec_and_test(&pb->pb_hold))
900 xfs_buf_free(pb);
901 return;
902 }
903
904 if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
905 int do_free = 1;
906
907 if (pb->pb_relse) {
908 atomic_inc(&pb->pb_hold);
909 spin_unlock(&hash->bh_lock);
910 (*(pb->pb_relse)) (pb);
911 spin_lock(&hash->bh_lock);
912 do_free = 0;
913 }
914
915 if (pb->pb_flags & PBF_DELWRI) {
916 pb->pb_flags |= PBF_ASYNC;
917 atomic_inc(&pb->pb_hold);
918 pagebuf_delwri_queue(pb, 0);
919 do_free = 0;
920 } else if (pb->pb_flags & PBF_FS_MANAGED) {
921 do_free = 0;
922 }
923
924 if (do_free) {
925 list_del_init(&pb->pb_hash_list);
926 spin_unlock(&hash->bh_lock);
927 pagebuf_free(pb);
928 } else {
929 spin_unlock(&hash->bh_lock);
930 }
931 }
932}
933
934
935/*
936 * Mutual exclusion on buffers. Locking model:
937 *
938 * Buffers associated with inodes for which buffer locking
939 * is not enabled are not protected by semaphores, and are
940 * assumed to be exclusively owned by the caller. There is a
941 * spinlock in the buffer, used by the caller when concurrent
942 * access is possible.
943 */
944
945/*
946 * pagebuf_cond_lock
947 *
948 * pagebuf_cond_lock locks a buffer object, if it is not already locked.
949 * Note that this in no way
950 * locks the underlying pages, so it is only useful for synchronizing
951 * concurrent use of page buffer objects, not for synchronizing independent
952 * access to the underlying pages.
953 */
954int
955pagebuf_cond_lock( /* lock buffer, if not locked */
956 /* returns -EBUSY if locked */
957 xfs_buf_t *pb)
958{
959 int locked;
960
961 locked = down_trylock(&pb->pb_sema) == 0;
962 if (locked) {
963 PB_SET_OWNER(pb);
964 }
965 PB_TRACE(pb, "cond_lock", (long)locked);
966 return(locked ? 0 : -EBUSY);
967}
968
969#if defined(DEBUG) || defined(XFS_BLI_TRACE)
970/*
971 * pagebuf_lock_value
972 *
973 * Return lock value for a pagebuf
974 */
975int
976pagebuf_lock_value(
977 xfs_buf_t *pb)
978{
979 return(atomic_read(&pb->pb_sema.count));
980}
981#endif
982
983/*
984 * pagebuf_lock
985 *
986 * pagebuf_lock locks a buffer object. Note that this in no way
987 * locks the underlying pages, so it is only useful for synchronizing
988 * concurrent use of page buffer objects, not for synchronizing independent
989 * access to the underlying pages.
990 */
991int
992pagebuf_lock(
993 xfs_buf_t *pb)
994{
995 PB_TRACE(pb, "lock", 0);
996 if (atomic_read(&pb->pb_io_remaining))
997 blk_run_address_space(pb->pb_target->pbr_mapping);
998 down(&pb->pb_sema);
999 PB_SET_OWNER(pb);
1000 PB_TRACE(pb, "locked", 0);
1001 return 0;
1002}
1003
1004/*
1005 * pagebuf_unlock
1006 *
1007 * pagebuf_unlock releases the lock on the buffer object created by
1008 * pagebuf_lock or pagebuf_cond_lock (not any
1009 * pinning of underlying pages created by pagebuf_pin).
1010 */
1011void
1012pagebuf_unlock( /* unlock buffer */
1013 xfs_buf_t *pb) /* buffer to unlock */
1014{
1015 PB_CLEAR_OWNER(pb);
1016 up(&pb->pb_sema);
1017 PB_TRACE(pb, "unlock", 0);
1018}
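/*
 * Usage sketch: the non-blocking and blocking paths pair with
 * pagebuf_unlock as follows.
 *
 *	if (pagebuf_cond_lock(pb) == 0) {
 *		... got it without sleeping ...
 *		pagebuf_unlock(pb);
 *	} else {
 *		pagebuf_lock(pb);	sleeps until available
 *		...
 *		pagebuf_unlock(pb);
 *	}
 */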
1019
1020
1021/*
1022 * Pinning Buffer Storage in Memory
1023 */
1024
1025/*
1026 * pagebuf_pin
1027 *
1028 * pagebuf_pin locks all of the memory represented by a buffer in
1029 * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
1030 * the same or different buffers affecting a given page, will
1031 * properly count the number of outstanding "pin" requests. The
1032 * buffer may be released after the pagebuf_pin and a different
1033 * buffer used when calling pagebuf_unpin, if desired.
1034 * pagebuf_pin should be used by the file system when it wants to be
1035 * assured that no attempt will be made to force the affected
1036 * memory to disk. It does not assure that a given logical page
1037 * will not be moved to a different physical page.
1038 */
1039void
1040pagebuf_pin(
1041 xfs_buf_t *pb)
1042{
1043 atomic_inc(&pb->pb_pin_count);
1044 PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
1045}
1046
1047/*
1048 * pagebuf_unpin
1049 *
1050 * pagebuf_unpin reverses the locking of memory performed by
1051 * pagebuf_pin.  Note that both functions affect the logical
1052 * pages associated with the buffer, not the buffer itself.
1053 */
1054void
1055pagebuf_unpin(
1056 xfs_buf_t *pb)
1057{
1058 if (atomic_dec_and_test(&pb->pb_pin_count)) {
1059 wake_up_all(&pb->pb_waiters);
1060 }
1061 PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
1062}
1063
1064int
1065pagebuf_ispin(
1066 xfs_buf_t *pb)
1067{
1068 return atomic_read(&pb->pb_pin_count);
1069}
1070
1071/*
1072 * pagebuf_wait_unpin
1073 *
1074 * pagebuf_wait_unpin waits until all of the memory associated
1075 * with the buffer is no longer locked in memory.  It returns
1076 * immediately if none of the affected pages are locked.
1077 */
1078static inline void
1079_pagebuf_wait_unpin(
1080 xfs_buf_t *pb)
1081{
1082 DECLARE_WAITQUEUE (wait, current);
1083
1084 if (atomic_read(&pb->pb_pin_count) == 0)
1085 return;
1086
1087 add_wait_queue(&pb->pb_waiters, &wait);
1088 for (;;) {
1089 set_current_state(TASK_UNINTERRUPTIBLE);
1090 if (atomic_read(&pb->pb_pin_count) == 0)
1091 break;
1092 if (atomic_read(&pb->pb_io_remaining))
1093 blk_run_address_space(pb->pb_target->pbr_mapping);
1094 schedule();
1095 }
1096 remove_wait_queue(&pb->pb_waiters, &wait);
1097 set_current_state(TASK_RUNNING);
1098}
1099
1100/*
1101 * Buffer Utility Routines
1102 */
1103
1104/*
1105 * pagebuf_iodone
1106 *
1107 * pagebuf_iodone marks a buffer for which I/O is in progress
1108 * done with respect to that I/O. The pb_iodone routine, if
1109 * present, will be called as a side-effect.
1110 */
1111STATIC void
1112pagebuf_iodone_work(
1113 void *v)
1114{
1115 xfs_buf_t *bp = (xfs_buf_t *)v;
1116
1117 if (bp->pb_iodone)
1118 (*(bp->pb_iodone))(bp);
1119 else if (bp->pb_flags & PBF_ASYNC)
1120 xfs_buf_relse(bp);
1121}
1122
1123void
1124pagebuf_iodone(
1125 xfs_buf_t *pb,
1126 int dataio,
1127 int schedule)
1128{
1129 pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
1130 if (pb->pb_error == 0) {
1131 pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
1132 }
1133
1134 PB_TRACE(pb, "iodone", pb->pb_iodone);
1135
1136 if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
1137 if (schedule) {
1138 INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb);
1139 queue_work(dataio ? pagebuf_dataio_workqueue :
1140 pagebuf_logio_workqueue, &pb->pb_iodone_work);
1141 } else {
1142 pagebuf_iodone_work(pb);
1143 }
1144 } else {
1145 up(&pb->pb_iodonesema);
1146 }
1147}
1148
1149/*
1150 * pagebuf_ioerror
1151 *
1152 * pagebuf_ioerror sets the error code for a buffer.
1153 */
1154void
1155pagebuf_ioerror( /* mark/clear buffer error flag */
1156 xfs_buf_t *pb, /* buffer to mark */
1157 int error) /* error to store (0 if none) */
1158{
1159 ASSERT(error >= 0 && error <= 0xffff);
1160 pb->pb_error = (unsigned short)error;
1161 PB_TRACE(pb, "ioerror", (unsigned long)error);
1162}
1163
1164/*
1165 * pagebuf_iostart
1166 *
1167 * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1168 * If necessary, it will arrange for any disk space allocation required,
1169 * and it will break up the request if the block mappings require it.
1170 * The pb_iodone routine in the buffer supplied will only be called
1171 * when all of the subsidiary I/O requests, if any, have been completed.
1172 * pagebuf_iostart calls the pagebuf_ioinitiate routine or
1173 * pagebuf_iorequest, if the former routine is not defined, to start
1174 * the I/O on a given low-level request.
1175 */
1176int
1177pagebuf_iostart( /* start I/O on a buffer */
1178 xfs_buf_t *pb, /* buffer to start */
1179 page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
1180 /* PBF_WRITE, PBF_DELWRI, */
1181 /* PBF_DONT_BLOCK */
1182{
1183 int status = 0;
1184
1185 PB_TRACE(pb, "iostart", (unsigned long)flags);
1186
1187 if (flags & PBF_DELWRI) {
1188 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
1189 pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
1190 pagebuf_delwri_queue(pb, 1);
1191 return status;
1192 }
1193
1194 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
1195 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1196 pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
1197 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1198
1199 BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
1200
1201 /* For writes allow an alternate strategy routine to precede
1202 * the actual I/O request (which may not be issued at all in
1203 * a shutdown situation, for example).
1204 */
1205 status = (flags & PBF_WRITE) ?
1206 pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
1207
1208 /* Wait for I/O if we are not an async request.
1209 * Note: async I/O request completion will release the buffer,
1210 * and that can already be done by this point. So using the
1211 * buffer pointer from here on, after async I/O, is invalid.
1212 */
1213 if (!status && !(flags & PBF_ASYNC))
1214 status = pagebuf_iowait(pb);
1215
1216 return status;
1217}
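/*
 * Usage sketch: a synchronous write waits for completion inside
 * pagebuf_iostart, an asynchronous one does not.
 *
 *	error = pagebuf_iostart(pb, PBF_WRITE);
 *		returns the I/O result once the write is done
 *
 *	error = pagebuf_iostart(pb, PBF_WRITE | PBF_ASYNC);
 *		returns immediately; the iodone path releases the
 *		buffer, so pb must not be touched afterwards
 */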
1218
1219/*
1220 * Helper routine for pagebuf_iorequest
1221 */
1222
1223STATIC __inline__ int
1224_pagebuf_iolocked(
1225 xfs_buf_t *pb)
1226{
1227 ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1228 if (pb->pb_flags & PBF_READ)
1229 return pb->pb_locked;
1230 return 0;
1231}
1232
1233STATIC __inline__ void
1234_pagebuf_iodone(
1235 xfs_buf_t *pb,
1236 int schedule)
1237{
1238 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
1239 pb->pb_locked = 0;
1240 pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
1241 }
1242}
1243
1244STATIC int
1245bio_end_io_pagebuf(
1246 struct bio *bio,
1247 unsigned int bytes_done,
1248 int error)
1249{
1250 xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private;
1251 unsigned int i, blocksize = pb->pb_target->pbr_bsize;
1252 struct bio_vec *bvec = bio->bi_io_vec;
1253
1254 if (bio->bi_size)
1255 return 1;
1256
1257 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1258 pb->pb_error = EIO;
1259
1260 for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
1261 struct page *page = bvec->bv_page;
1262
1263 if (pb->pb_error) {
1264 SetPageError(page);
1265 } else if (blocksize == PAGE_CACHE_SIZE) {
1266 SetPageUptodate(page);
1267 } else if (!PagePrivate(page) &&
1268 (pb->pb_flags & _PBF_PAGE_CACHE)) {
1269 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1270 }
1271
1272 if (_pagebuf_iolocked(pb)) {
1273 unlock_page(page);
1274 }
1275 }
1276
1277 _pagebuf_iodone(pb, 1);
1278 bio_put(bio);
1279 return 0;
1280}
1281
1282STATIC void
1283_pagebuf_ioapply(
1284 xfs_buf_t *pb)
1285{
1286 int i, rw, map_i, total_nr_pages, nr_pages;
1287 struct bio *bio;
1288 int offset = pb->pb_offset;
1289 int size = pb->pb_count_desired;
1290 sector_t sector = pb->pb_bn;
1291 unsigned int blocksize = pb->pb_target->pbr_bsize;
1292 int locking = _pagebuf_iolocked(pb);
1293
1294 total_nr_pages = pb->pb_page_count;
1295 map_i = 0;
1296
1297 if (pb->pb_flags & _PBF_RUN_QUEUES) {
1298 pb->pb_flags &= ~_PBF_RUN_QUEUES;
1299 rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC;
1300 } else {
1301 rw = (pb->pb_flags & PBF_READ) ? READ : WRITE;
1302 }
1303
1304 /* Special code path for reading in a sub page size pagebuf --
1305 * we populate the whole page, and hence the other metadata
1306 * in the same page. This optimization is only valid when the
1307 * filesystem block size and the page size are equal.
1308 */
1309 if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
1310 (pb->pb_flags & PBF_READ) && locking &&
1311 (blocksize == PAGE_CACHE_SIZE)) {
1312 bio = bio_alloc(GFP_NOIO, 1);
1313
1314 bio->bi_bdev = pb->pb_target->pbr_bdev;
1315 bio->bi_sector = sector - (offset >> BBSHIFT);
1316 bio->bi_end_io = bio_end_io_pagebuf;
1317 bio->bi_private = pb;
1318
1319 bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0);
1320 size = 0;
1321
1322 atomic_inc(&pb->pb_io_remaining);
1323
1324 goto submit_io;
1325 }
1326
1327 /* Lock down the pages which we need to for the request */
1328 if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) {
1329 for (i = 0; size; i++) {
1330 int nbytes = PAGE_CACHE_SIZE - offset;
1331 struct page *page = pb->pb_pages[i];
1332
1333 if (nbytes > size)
1334 nbytes = size;
1335
1336 lock_page(page);
1337
1338 size -= nbytes;
1339 offset = 0;
1340 }
1341 offset = pb->pb_offset;
1342 size = pb->pb_count_desired;
1343 }
1344
1345next_chunk:
1346 atomic_inc(&pb->pb_io_remaining);
1347 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1348 if (nr_pages > total_nr_pages)
1349 nr_pages = total_nr_pages;
1350
1351 bio = bio_alloc(GFP_NOIO, nr_pages);
1352 bio->bi_bdev = pb->pb_target->pbr_bdev;
1353 bio->bi_sector = sector;
1354 bio->bi_end_io = bio_end_io_pagebuf;
1355 bio->bi_private = pb;
1356
1357 for (; size && nr_pages; nr_pages--, map_i++) {
1358 int nbytes = PAGE_CACHE_SIZE - offset;
1359
1360 if (nbytes > size)
1361 nbytes = size;
1362
1363 if (bio_add_page(bio, pb->pb_pages[map_i],
1364 nbytes, offset) < nbytes)
1365 break;
1366
1367 offset = 0;
1368 sector += nbytes >> BBSHIFT;
1369 size -= nbytes;
1370 total_nr_pages--;
1371 }
1372
1373submit_io:
1374 if (likely(bio->bi_size)) {
1375 submit_bio(rw, bio);
1376 if (size)
1377 goto next_chunk;
1378 } else {
1379 bio_put(bio);
1380 pagebuf_ioerror(pb, EIO);
1381 }
1382}
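/*
 * Chunking arithmetic above, worked through for 4k pages:
 * PAGE_SHIFT - BBSHIFT == 3, so each bio carries at most
 * BIO_MAX_SECTORS / 8 pages.  Larger buffers loop through next_chunk;
 * every chunk bumps pb_io_remaining and bio_end_io_pagebuf drops it
 * again, so pagebuf_iodone runs only after the last chunk completes.
 */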
1383
1384/*
1385 * pagebuf_iorequest -- the core I/O request routine.
1386 */
1387int
1388pagebuf_iorequest( /* start real I/O */
1389 xfs_buf_t *pb) /* buffer to convey to device */
1390{
1391 PB_TRACE(pb, "iorequest", 0);
1392
1393 if (pb->pb_flags & PBF_DELWRI) {
1394 pagebuf_delwri_queue(pb, 1);
1395 return 0;
1396 }
1397
1398 if (pb->pb_flags & PBF_WRITE) {
1399 _pagebuf_wait_unpin(pb);
1400 }
1401
1402 pagebuf_hold(pb);
1403
1404 /* Set the count to 1 initially, this will stop an I/O
1405 * completion callout which happens before we have started
1406 * all the I/O from calling pagebuf_iodone too early.
1407 */
1408 atomic_set(&pb->pb_io_remaining, 1);
1409 _pagebuf_ioapply(pb);
1410 _pagebuf_iodone(pb, 0);
1411
1412 pagebuf_rele(pb);
1413 return 0;
1414}
1415
1416/*
1417 * pagebuf_iowait
1418 *
1419 * pagebuf_iowait waits for I/O to complete on the buffer supplied.
1420 * It returns immediately if no I/O is pending. In any case, it returns
1421 * the error code, if any, or 0 if there is no error.
1422 */
1423int
1424pagebuf_iowait(
1425 xfs_buf_t *pb)
1426{
1427 PB_TRACE(pb, "iowait", 0);
1428 if (atomic_read(&pb->pb_io_remaining))
1429 blk_run_address_space(pb->pb_target->pbr_mapping);
1430 down(&pb->pb_iodonesema);
1431 PB_TRACE(pb, "iowaited", (long)pb->pb_error);
1432 return pb->pb_error;
1433}
1434
1435caddr_t
1436pagebuf_offset(
1437 xfs_buf_t *pb,
1438 size_t offset)
1439{
1440 struct page *page;
1441
1442 offset += pb->pb_offset;
1443
1444 page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
1445 return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
1446}
1447
1448/*
1449 * pagebuf_iomove
1450 *
1451 * Move data into or out of a buffer.
1452 */
1453void
1454pagebuf_iomove(
1455 xfs_buf_t *pb, /* buffer to process */
1456 size_t boff, /* starting buffer offset */
1457 size_t bsize, /* length to copy */
1458 caddr_t data, /* data address */
1459 page_buf_rw_t mode) /* read/write flag */
1460{
1461 size_t bend, cpoff, csize;
1462 struct page *page;
1463
1464 bend = boff + bsize;
1465 while (boff < bend) {
1466 page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
1467 cpoff = page_buf_poff(boff + pb->pb_offset);
1468 csize = min_t(size_t,
1469 PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
1470
1471 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1472
1473 switch (mode) {
1474 case PBRW_ZERO:
1475 memset(page_address(page) + cpoff, 0, csize);
1476 break;
1477 case PBRW_READ:
1478 memcpy(data, page_address(page) + cpoff, csize);
1479 break;
1480 case PBRW_WRITE:
1481 memcpy(page_address(page) + cpoff, data, csize);
1482 }
1483
1484 boff += csize;
1485 data += csize;
1486 }
1487}
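/*
 * Usage sketch, with hdr standing in for some caller-supplied
 * structure: zero a region, then copy data into the buffer.
 *
 *	pagebuf_iomove(pb, 0, 512, NULL, PBRW_ZERO);
 *	pagebuf_iomove(pb, 0, sizeof(hdr), (caddr_t)&hdr, PBRW_WRITE);
 *
 * The loop above walks page by page, so the copy works even when the
 * buffer spans discontiguous pages.
 */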
1488
1489/*
1490 * Handling of buftargs.
1491 */
1492
1493/*
1494 * Wait for any bufs with callbacks that have been submitted but
1495 * have not yet returned... walk the hash list for the target.
1496 */
1497void
1498xfs_wait_buftarg(
1499 xfs_buftarg_t *btp)
1500{
1501 xfs_buf_t *bp, *n;
1502 xfs_bufhash_t *hash;
1503 uint i;
1504
1505 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1506 hash = &btp->bt_hash[i];
1507again:
1508 spin_lock(&hash->bh_lock);
1509 list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
1510 ASSERT(btp == bp->pb_target);
1511 if (!(bp->pb_flags & PBF_FS_MANAGED)) {
1512 spin_unlock(&hash->bh_lock);
1513 delay(100);
1514 goto again;
1515 }
1516 }
1517 spin_unlock(&hash->bh_lock);
1518 }
1519}
1520
1521/*
1522 * Allocate buffer hash table for a given target.
1523 * For devices containing metadata (i.e. not the log/realtime devices)
1524 * we need to allocate a much larger hash table.
1525 */
1526STATIC void
1527xfs_alloc_bufhash(
1528 xfs_buftarg_t *btp,
1529 int external)
1530{
1531 unsigned int i;
1532
1533 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1534 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1535 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
1536 sizeof(xfs_bufhash_t), KM_SLEEP);
1537 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1538 spin_lock_init(&btp->bt_hash[i].bh_lock);
1539 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1540 }
1541}
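
To make the sizing above concrete: a hash shift of 3 yields 8 buckets for external (log/realtime) devices, a shift of 8 yields 256 buckets for metadata devices, and (1 << shift) - 1 is the mask that folds a hash value into the bucket range. A small standalone C check of that arithmetic, for illustration only:

#include <assert.h>

int main(void)
{
	unsigned int shift = 8;			/* metadata device case */
	unsigned int nbuckets = 1u << shift;	/* 256 buckets */
	unsigned int mask = nbuckets - 1;	/* 0xff */

	assert(nbuckets == 256);
	assert(((1u << 3) - 1) == 7);		/* external case: 8 buckets */
	assert((0x12345678u & mask) < nbuckets);/* any hash lands in range */
	return 0;
}
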
1542
1543STATIC void
1544xfs_free_bufhash(
1545 xfs_buftarg_t *btp)
1546{
1547 kmem_free(btp->bt_hash,
1548 (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
1549 btp->bt_hash = NULL;
1550}
1551
1552void
1553xfs_free_buftarg(
1554 xfs_buftarg_t *btp,
1555 int external)
1556{
1557 xfs_flush_buftarg(btp, 1);
1558 if (external)
1559 xfs_blkdev_put(btp->pbr_bdev);
1560 xfs_free_bufhash(btp);
1561 iput(btp->pbr_mapping->host);
1562 kmem_free(btp, sizeof(*btp));
1563}
1564
1565void
1566xfs_incore_relse(
1567 xfs_buftarg_t *btp,
1568 int delwri_only,
1569 int wait)
1570{
1571 invalidate_bdev(btp->pbr_bdev, 1);
1572 truncate_inode_pages(btp->pbr_mapping, 0LL);
1573}
1574
1575STATIC int
1576xfs_setsize_buftarg_flags(
1577 xfs_buftarg_t *btp,
1578 unsigned int blocksize,
1579 unsigned int sectorsize,
1580 int verbose)
1581{
1582 btp->pbr_bsize = blocksize;
1583 btp->pbr_sshift = ffs(sectorsize) - 1;
1584 btp->pbr_smask = sectorsize - 1;
1585
1586 if (set_blocksize(btp->pbr_bdev, sectorsize)) {
1587 printk(KERN_WARNING
1588 "XFS: Cannot set_blocksize to %u on device %s\n",
1589 sectorsize, XFS_BUFTARG_NAME(btp));
1590 return EINVAL;
1591 }
1592
1593 if (verbose &&
1594 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1595 printk(KERN_WARNING
1596 "XFS: %u byte sectors in use on device %s. "
1597 "This is suboptimal; %u or greater is ideal.\n",
1598 sectorsize, XFS_BUFTARG_NAME(btp),
1599 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1600 }
1601
1602 return 0;
1603}
1604
1605/*
1606 * When allocating the initial buffer target we have not yet
1607 * read in the superblock, so we don't know what size sectors
1608 * are being used at this early stage. Play safe.
1609 */
1610STATIC int
1611xfs_setsize_buftarg_early(
1612 xfs_buftarg_t *btp,
1613 struct block_device *bdev)
1614{
1615 return xfs_setsize_buftarg_flags(btp,
1616 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
1617}
1618
1619int
1620xfs_setsize_buftarg(
1621 xfs_buftarg_t *btp,
1622 unsigned int blocksize,
1623 unsigned int sectorsize)
1624{
1625 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1626}
1627
1628STATIC int
1629xfs_mapping_buftarg(
1630 xfs_buftarg_t *btp,
1631 struct block_device *bdev)
1632{
1633 struct backing_dev_info *bdi;
1634 struct inode *inode;
1635 struct address_space *mapping;
1636 static struct address_space_operations mapping_aops = {
1637 .sync_page = block_sync_page,
1638 };
1639
1640 inode = new_inode(bdev->bd_inode->i_sb);
1641 if (!inode) {
1642 printk(KERN_WARNING
1643 "XFS: Cannot allocate mapping inode for device %s\n",
1644 XFS_BUFTARG_NAME(btp));
1645 return ENOMEM;
1646 }
1647 inode->i_mode = S_IFBLK;
1648 inode->i_bdev = bdev;
1649 inode->i_rdev = bdev->bd_dev;
1650 bdi = blk_get_backing_dev_info(bdev);
1651 if (!bdi)
1652 bdi = &default_backing_dev_info;
1653 mapping = &inode->i_data;
1654 mapping->a_ops = &mapping_aops;
1655 mapping->backing_dev_info = bdi;
1656 mapping_set_gfp_mask(mapping, GFP_NOFS);
1657 btp->pbr_mapping = mapping;
1658 return 0;
1659}
1660
1661xfs_buftarg_t *
1662xfs_alloc_buftarg(
1663 struct block_device *bdev,
1664 int external)
1665{
1666 xfs_buftarg_t *btp;
1667
1668 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1669
1670 btp->pbr_dev = bdev->bd_dev;
1671 btp->pbr_bdev = bdev;
1672 if (xfs_setsize_buftarg_early(btp, bdev))
1673 goto error;
1674 if (xfs_mapping_buftarg(btp, bdev))
1675 goto error;
1676 xfs_alloc_bufhash(btp, external);
1677 return btp;
1678
1679error:
1680 kmem_free(btp, sizeof(*btp));
1681 return NULL;
1682}
1683
1684
1685/*
1686 * Pagebuf delayed write buffer handling
1687 */
1688
1689STATIC LIST_HEAD(pbd_delwrite_queue);
1690STATIC DEFINE_SPINLOCK(pbd_delwrite_lock);
1691
1692STATIC void
1693pagebuf_delwri_queue(
1694 xfs_buf_t *pb,
1695 int unlock)
1696{
1697 PB_TRACE(pb, "delwri_q", (long)unlock);
1698 ASSERT(pb->pb_flags & PBF_DELWRI);
1699
1700 spin_lock(&pbd_delwrite_lock);
1701 /* If already in the queue, dequeue and place at tail */
1702 if (!list_empty(&pb->pb_list)) {
1703 if (unlock) {
1704 atomic_dec(&pb->pb_hold);
1705 }
1706 list_del(&pb->pb_list);
1707 }
1708
1709 list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1710 pb->pb_queuetime = jiffies;
1711 spin_unlock(&pbd_delwrite_lock);
1712
1713 if (unlock)
1714 pagebuf_unlock(pb);
1715}
1716
1717void
1718pagebuf_delwri_dequeue(
1719 xfs_buf_t *pb)
1720{
1721 int dequeued = 0;
1722
1723 spin_lock(&pbd_delwrite_lock);
1724 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1725 list_del_init(&pb->pb_list);
1726 dequeued = 1;
1727 }
1728 pb->pb_flags &= ~PBF_DELWRI;
1729 spin_unlock(&pbd_delwrite_lock);
1730
1731 if (dequeued)
1732 pagebuf_rele(pb);
1733
1734 PB_TRACE(pb, "delwri_dq", (long)dequeued);
1735}
1736
1737STATIC void
1738pagebuf_runall_queues(
1739 struct workqueue_struct *queue)
1740{
1741 flush_workqueue(queue);
1742}
1743
1744/* Defines for pagebuf daemon */
1745STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
1746STATIC struct task_struct *pagebuf_daemon_task;
1747STATIC int pagebuf_daemon_active;
1748STATIC int force_flush;
1749
1750
1751STATIC int
1752pagebuf_daemon_wakeup(
1753 int priority,
1754 unsigned int mask)
1755{
1756 force_flush = 1;
1757 barrier();
1758 wake_up_process(pagebuf_daemon_task);
1759 return 0;
1760}
1761
1762STATIC int
1763pagebuf_daemon(
1764 void *data)
1765{
1766 struct list_head tmp;
1767 unsigned long age;
1768 xfs_buftarg_t *target;
1769 xfs_buf_t *pb, *n;
1770
1771 /* Set up the thread */
1772 daemonize("xfsbufd");
1773 current->flags |= PF_MEMALLOC;
1774
1775 pagebuf_daemon_task = current;
1776 pagebuf_daemon_active = 1;
1777 barrier();
1778
1779 INIT_LIST_HEAD(&tmp);
1780 do {
1781 try_to_freeze(PF_FREEZE);
1782
1783 set_current_state(TASK_INTERRUPTIBLE);
1784 schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
1785
1786 age = (xfs_buf_age_centisecs * HZ) / 100;
1787 spin_lock(&pbd_delwrite_lock);
1788 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
1789 PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
1790 ASSERT(pb->pb_flags & PBF_DELWRI);
1791
1792 if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
1793 if (!force_flush &&
1794 time_before(jiffies,
1795 pb->pb_queuetime + age)) {
1796 pagebuf_unlock(pb);
1797 break;
1798 }
1799
1800 pb->pb_flags &= ~PBF_DELWRI;
1801 pb->pb_flags |= PBF_WRITE;
1802 list_move(&pb->pb_list, &tmp);
1803 }
1804 }
1805 spin_unlock(&pbd_delwrite_lock);
1806
1807 while (!list_empty(&tmp)) {
1808 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
1809 target = pb->pb_target;
1810
1811 list_del_init(&pb->pb_list);
1812 pagebuf_iostrategy(pb);
1813
1814 blk_run_address_space(target->pbr_mapping);
1815 }
1816
1817 if (as_list_len > 0)
1818 purge_addresses();
1819
1820 force_flush = 0;
1821 } while (pagebuf_daemon_active);
1822
1823 complete_and_exit(&pagebuf_daemon_done, 0);
1824}
1825
1826/*
1827 * Go through all incore buffers, and release buffers if they belong to
1828 * the given device. This is used in filesystem error handling to
1829 * preserve the consistency of its metadata.
1830 */
1831int
1832xfs_flush_buftarg(
1833 xfs_buftarg_t *target,
1834 int wait)
1835{
1836 struct list_head tmp;
1837 xfs_buf_t *pb, *n;
1838 int pincount = 0;
1839
1840 pagebuf_runall_queues(pagebuf_dataio_workqueue);
1841 pagebuf_runall_queues(pagebuf_logio_workqueue);
1842
1843 INIT_LIST_HEAD(&tmp);
1844 spin_lock(&pbd_delwrite_lock);
1845 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
1846
1847 if (pb->pb_target != target)
1848 continue;
1849
1850 ASSERT(pb->pb_flags & PBF_DELWRI);
1851 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
1852 if (pagebuf_ispin(pb)) {
1853 pincount++;
1854 continue;
1855 }
1856
1857 pb->pb_flags &= ~PBF_DELWRI;
1858 pb->pb_flags |= PBF_WRITE;
1859 list_move(&pb->pb_list, &tmp);
1860 }
1861 spin_unlock(&pbd_delwrite_lock);
1862
1863 /*
 1864	 * Dropped the delayed write list lock; now walk the temporary list
1865 */
1866 list_for_each_entry_safe(pb, n, &tmp, pb_list) {
1867 if (wait)
1868 pb->pb_flags &= ~PBF_ASYNC;
1869 else
1870 list_del_init(&pb->pb_list);
1871
1872 pagebuf_lock(pb);
1873 pagebuf_iostrategy(pb);
1874 }
1875
1876 /*
1877 * Remaining list items must be flushed before returning
1878 */
1879 while (!list_empty(&tmp)) {
1880 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
1881
1882 list_del_init(&pb->pb_list);
1883 xfs_iowait(pb);
1884 xfs_buf_relse(pb);
1885 }
1886
1887 if (wait)
1888 blk_run_address_space(target->pbr_mapping);
1889
1890 return pincount;
1891}
1892
1893STATIC int
1894pagebuf_daemon_start(void)
1895{
1896 int rval;
1897
1898 pagebuf_logio_workqueue = create_workqueue("xfslogd");
1899 if (!pagebuf_logio_workqueue)
1900 return -ENOMEM;
1901
1902 pagebuf_dataio_workqueue = create_workqueue("xfsdatad");
1903 if (!pagebuf_dataio_workqueue) {
1904 destroy_workqueue(pagebuf_logio_workqueue);
1905 return -ENOMEM;
1906 }
1907
1908 rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES);
1909 if (rval < 0) {
1910 destroy_workqueue(pagebuf_logio_workqueue);
1911 destroy_workqueue(pagebuf_dataio_workqueue);
1912 }
1913
1914 return rval;
1915}
1916
1917/*
1918 * pagebuf_daemon_stop
1919 *
1920 * Note: do not mark as __exit, it is called from pagebuf_terminate.
1921 */
1922STATIC void
1923pagebuf_daemon_stop(void)
1924{
1925 pagebuf_daemon_active = 0;
1926 barrier();
1927 wait_for_completion(&pagebuf_daemon_done);
1928
1929 destroy_workqueue(pagebuf_logio_workqueue);
1930 destroy_workqueue(pagebuf_dataio_workqueue);
1931}
1932
1933/*
1934 * Initialization and Termination
1935 */
1936
1937int __init
1938pagebuf_init(void)
1939{
1940 pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
1941 SLAB_HWCACHE_ALIGN, NULL, NULL);
1942 if (pagebuf_cache == NULL) {
1943 printk("XFS: couldn't init xfs_buf_t cache\n");
1944 pagebuf_terminate();
1945 return -ENOMEM;
1946 }
1947
1948#ifdef PAGEBUF_TRACE
1949 pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
1950#endif
1951
1952 pagebuf_daemon_start();
1953
1954 pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
1955 if (pagebuf_shake == NULL) {
1956 pagebuf_terminate();
1957 return -ENOMEM;
1958 }
1959
1960 return 0;
1961}
1962
1963
1964/*
1965 * pagebuf_terminate.
1966 *
1967 * Note: do not mark as __exit, this is also called from the __init code.
1968 */
1969void
1970pagebuf_terminate(void)
1971{
1972 pagebuf_daemon_stop();
1973
1974#ifdef PAGEBUF_TRACE
1975 ktrace_free(pagebuf_trace_buf);
1976#endif
1977
1978 kmem_zone_destroy(pagebuf_cache);
1979 kmem_shake_deregister(pagebuf_shake);
1980}
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
new file mode 100644
index 000000000000..74deed8e6d90
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -0,0 +1,591 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
35 */
36
37#ifndef __XFS_BUF_H__
38#define __XFS_BUF_H__
39
40#include <linux/config.h>
41#include <linux/list.h>
42#include <linux/types.h>
43#include <linux/spinlock.h>
44#include <asm/system.h>
45#include <linux/mm.h>
46#include <linux/fs.h>
47#include <linux/buffer_head.h>
48#include <linux/uio.h>
49
50/*
51 * Base types
52 */
53
54#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
55
56#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
57#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
58#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
59#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
60
61typedef enum page_buf_rw_e {
62 PBRW_READ = 1, /* transfer into target memory */
63 PBRW_WRITE = 2, /* transfer from target memory */
64 PBRW_ZERO = 3 /* Zero target memory */
65} page_buf_rw_t;
66
67
68typedef enum page_buf_flags_e { /* pb_flags values */
69 PBF_READ = (1 << 0), /* buffer intended for reading from device */
70 PBF_WRITE = (1 << 1), /* buffer intended for writing to device */
71 PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */
72 PBF_PARTIAL = (1 << 3), /* buffer partially read */
73 PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
74 PBF_NONE = (1 << 5), /* buffer not read at all */
75 PBF_DELWRI = (1 << 6), /* buffer has dirty pages */
76 PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
77 PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
78 PBF_FS_DATAIOD = (1 << 9), /* schedule IO completion on fs datad */
79 PBF_FORCEIO = (1 << 10), /* ignore any cache state */
80 PBF_FLUSH = (1 << 11), /* flush disk write cache */
81 PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
82
83 /* flags used only as arguments to access routines */
84 PBF_LOCK = (1 << 14), /* lock requested */
85 PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
86 PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
87
88 /* flags used only internally */
89 _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
90 _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */
91 _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
92} page_buf_flags_t;
93
94#define PBF_UPDATE (PBF_READ | PBF_WRITE)
95#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
96#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
97
98typedef struct xfs_bufhash {
99 struct list_head bh_list;
100 spinlock_t bh_lock;
101} xfs_bufhash_t;
102
103typedef struct xfs_buftarg {
104 dev_t pbr_dev;
105 struct block_device *pbr_bdev;
106 struct address_space *pbr_mapping;
107 unsigned int pbr_bsize;
108 unsigned int pbr_sshift;
109 size_t pbr_smask;
110
111 /* per-device buffer hash table */
112 uint bt_hashmask;
113 uint bt_hashshift;
114 xfs_bufhash_t *bt_hash;
115} xfs_buftarg_t;
116
117/*
118 * xfs_buf_t: Buffer structure for page cache-based buffers
119 *
120 * This buffer structure is used by the page cache buffer management routines
121 * to refer to an assembly of pages forming a logical buffer. The actual I/O
122 * is performed with buffer_head structures, as required by drivers.
123 *
 124 * The buffer structure is used on a temporary basis only, and discarded when
125 * released. The real data storage is recorded in the page cache. Metadata is
126 * hashed to the block device on which the file system resides.
127 */
128
129struct xfs_buf;
130
131/* call-back function on I/O completion */
132typedef void (*page_buf_iodone_t)(struct xfs_buf *);
133/* call-back function on I/O completion */
134typedef void (*page_buf_relse_t)(struct xfs_buf *);
135/* pre-write function */
136typedef int (*page_buf_bdstrat_t)(struct xfs_buf *);
137
138#define PB_PAGES 2
139
140typedef struct xfs_buf {
141 struct semaphore pb_sema; /* semaphore for lockables */
142 unsigned long pb_queuetime; /* time buffer was queued */
143 atomic_t pb_pin_count; /* pin count */
144 wait_queue_head_t pb_waiters; /* unpin waiters */
145 struct list_head pb_list;
146 page_buf_flags_t pb_flags; /* status flags */
147 struct list_head pb_hash_list; /* hash table list */
148 xfs_bufhash_t *pb_hash; /* hash table list start */
149 xfs_buftarg_t *pb_target; /* buffer target (device) */
150 atomic_t pb_hold; /* reference count */
151 xfs_daddr_t pb_bn; /* block number for I/O */
152 loff_t pb_file_offset; /* offset in file */
153 size_t pb_buffer_length; /* size of buffer in bytes */
154 size_t pb_count_desired; /* desired transfer size */
155 void *pb_addr; /* virtual address of buffer */
156 struct work_struct pb_iodone_work;
157 atomic_t pb_io_remaining;/* #outstanding I/O requests */
158 page_buf_iodone_t pb_iodone; /* I/O completion function */
159 page_buf_relse_t pb_relse; /* releasing function */
160 page_buf_bdstrat_t pb_strat; /* pre-write function */
161 struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */
162 void *pb_fspriv;
163 void *pb_fspriv2;
164 void *pb_fspriv3;
165 unsigned short pb_error; /* error code on I/O */
166 unsigned short pb_locked; /* page array is locked */
167 unsigned int pb_page_count; /* size of page array */
168 unsigned int pb_offset; /* page offset in first page */
169 struct page **pb_pages; /* array of page pointers */
170 struct page *pb_page_array[PB_PAGES]; /* inline pages */
171#ifdef PAGEBUF_LOCK_TRACKING
172 int pb_last_holder;
173#endif
174} xfs_buf_t;
175
176
177/* Finding and Reading Buffers */
178
179extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */
180 /* the block is in memory */
181 xfs_buftarg_t *, /* inode for block */
182 loff_t, /* starting offset of range */
183 size_t, /* length of range */
184 page_buf_flags_t, /* PBF_LOCK */
185 xfs_buf_t *); /* newly allocated buffer */
186
187#define xfs_incore(buftarg,blkno,len,lockit) \
188 _pagebuf_find(buftarg, blkno ,len, lockit, NULL)
189
190extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */
191 xfs_buftarg_t *, /* inode for buffer */
192 loff_t, /* starting offset of range */
193 size_t, /* length of range */
194 page_buf_flags_t); /* PBF_LOCK, PBF_READ, */
195 /* PBF_ASYNC */
196
197#define xfs_buf_get(target, blkno, len, flags) \
198 xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
199
200extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */
201 xfs_buftarg_t *, /* inode for buffer */
202 loff_t, /* starting offset of range */
203 size_t, /* length of range */
204 page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */
205
206#define xfs_buf_read(target, blkno, len, flags) \
207 xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
208
209extern xfs_buf_t *pagebuf_lookup(
210 xfs_buftarg_t *,
211 loff_t, /* starting offset of range */
212 size_t, /* length of range */
213 page_buf_flags_t); /* PBF_READ, PBF_WRITE, */
214 /* PBF_FORCEIO, */
215
216extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */
217 /* no memory or disk address */
218 size_t len,
219 xfs_buftarg_t *); /* mount point "fake" inode */
220
221extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */
222 /* without disk address */
223 size_t len,
224 xfs_buftarg_t *); /* mount point "fake" inode */
225
226extern int pagebuf_associate_memory(
227 xfs_buf_t *,
228 void *,
229 size_t);
230
231extern void pagebuf_hold( /* increment reference count */
232 xfs_buf_t *); /* buffer to hold */
233
234extern void pagebuf_readahead( /* read ahead into cache */
235 xfs_buftarg_t *, /* target for buffer (or NULL) */
236 loff_t, /* starting offset of range */
237 size_t, /* length of range */
238 page_buf_flags_t); /* additional read flags */
239
240/* Releasing Buffers */
241
242extern void pagebuf_free( /* deallocate a buffer */
243 xfs_buf_t *); /* buffer to deallocate */
244
245extern void pagebuf_rele( /* release hold on a buffer */
246 xfs_buf_t *); /* buffer to release */
247
248/* Locking and Unlocking Buffers */
249
250extern int pagebuf_cond_lock( /* lock buffer, if not locked */
251 /* (returns -EBUSY if locked) */
252 xfs_buf_t *); /* buffer to lock */
253
254extern int pagebuf_lock_value( /* return count on lock */
255 xfs_buf_t *); /* buffer to check */
256
257extern int pagebuf_lock( /* lock buffer */
258 xfs_buf_t *); /* buffer to lock */
259
260extern void pagebuf_unlock( /* unlock buffer */
261 xfs_buf_t *); /* buffer to unlock */
262
263/* Buffer Read and Write Routines */
264
265extern void pagebuf_iodone( /* mark buffer I/O complete */
266 xfs_buf_t *, /* buffer to mark */
267 int, /* use data/log helper thread. */
268 int); /* run completion locally, or in
269 * a helper thread. */
270
271extern void pagebuf_ioerror( /* mark buffer in error (or not) */
272 xfs_buf_t *, /* buffer to mark */
273 int); /* error to store (0 if none) */
274
275extern int pagebuf_iostart( /* start I/O on a buffer */
276 xfs_buf_t *, /* buffer to start */
277 page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */
278 /* PBF_READ, PBF_WRITE, */
279 /* PBF_DELWRI */
280
281extern int pagebuf_iorequest( /* start real I/O */
282 xfs_buf_t *); /* buffer to convey to device */
283
284extern int pagebuf_iowait( /* wait for buffer I/O done */
285 xfs_buf_t *); /* buffer to wait on */
286
287extern void pagebuf_iomove( /* move data in/out of pagebuf */
288 xfs_buf_t *, /* buffer to manipulate */
289 size_t, /* starting buffer offset */
290 size_t, /* length in buffer */
291 caddr_t, /* data pointer */
292 page_buf_rw_t); /* direction */
293
294static inline int pagebuf_iostrategy(xfs_buf_t *pb)
295{
296 return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb);
297}
298
299static inline int pagebuf_geterror(xfs_buf_t *pb)
300{
301 return pb ? pb->pb_error : ENOMEM;
302}
303
304/* Buffer Utility Routines */
305
306extern caddr_t pagebuf_offset( /* pointer at offset in buffer */
307 xfs_buf_t *, /* buffer to offset into */
308 size_t); /* offset */
309
310/* Pinning Buffer Storage in Memory */
311
312extern void pagebuf_pin( /* pin buffer in memory */
313 xfs_buf_t *); /* buffer to pin */
314
315extern void pagebuf_unpin( /* unpin buffered data */
316 xfs_buf_t *); /* buffer to unpin */
317
318extern int pagebuf_ispin( /* check if buffer is pinned */
319 xfs_buf_t *); /* buffer to check */
320
321/* Delayed Write Buffer Routines */
322
323extern void pagebuf_delwri_dequeue(xfs_buf_t *);
324
325/* Buffer Daemon Setup Routines */
326
327extern int pagebuf_init(void);
328extern void pagebuf_terminate(void);
329
330
331#ifdef PAGEBUF_TRACE
332extern ktrace_t *pagebuf_trace_buf;
333extern void pagebuf_trace(
334 xfs_buf_t *, /* buffer being traced */
335 char *, /* description of operation */
336 void *, /* arbitrary diagnostic value */
337 void *); /* return address */
338#else
339# define pagebuf_trace(pb, id, ptr, ra) do { } while (0)
340#endif
341
342#define pagebuf_target_name(target) \
343 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; })
344
345
346
347
348
349/* These are just for xfs_syncsub... it sets an internal variable
350 * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly acquired buf_t
351 */
352#define XFS_B_ASYNC PBF_ASYNC
353#define XFS_B_DELWRI PBF_DELWRI
354#define XFS_B_READ PBF_READ
355#define XFS_B_WRITE PBF_WRITE
356#define XFS_B_STALE PBF_STALE
357
358#define XFS_BUF_TRYLOCK PBF_TRYLOCK
359#define XFS_INCORE_TRYLOCK PBF_TRYLOCK
360#define XFS_BUF_LOCK PBF_LOCK
361#define XFS_BUF_MAPPED PBF_MAPPED
362
363#define BUF_BUSY PBF_DONT_BLOCK
364
365#define XFS_BUF_BFLAGS(x) ((x)->pb_flags)
366#define XFS_BUF_ZEROFLAGS(x) \
367 ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI))
368
369#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE)
370#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE)
371#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE)
372#define XFS_BUF_SUPER_STALE(x) do { \
373 XFS_BUF_STALE(x); \
374 pagebuf_delwri_dequeue(x); \
375 XFS_BUF_DONE(x); \
376 } while (0)
377
378#define XFS_BUF_MANAGE PBF_FS_MANAGED
379#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED)
380
381#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI)
382#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x)
383#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI)
384
385#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no)
386#define XFS_BUF_GETERROR(x) pagebuf_geterror(x)
387#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0)
388
389#define XFS_BUF_DONE(x) ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
390#define XFS_BUF_UNDONE(x) ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
391#define XFS_BUF_ISDONE(x) (!(PBF_NOT_DONE(x)))
392
393#define XFS_BUF_BUSY(x) ((x)->pb_flags |= PBF_FORCEIO)
394#define XFS_BUF_UNBUSY(x) ((x)->pb_flags &= ~PBF_FORCEIO)
395#define XFS_BUF_ISBUSY(x) (1)
396
397#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC)
398#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC)
399#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC)
400
401#define XFS_BUF_FLUSH(x) ((x)->pb_flags |= PBF_FLUSH)
402#define XFS_BUF_UNFLUSH(x) ((x)->pb_flags &= ~PBF_FLUSH)
403#define XFS_BUF_ISFLUSH(x) ((x)->pb_flags & PBF_FLUSH)
404
405#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n")
406#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n")
407#define XFS_BUF_ISSHUT(x) (0)
408
409#define XFS_BUF_HOLD(x) pagebuf_hold(x)
410#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ)
411#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ)
412#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ)
413
414#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE)
415#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE)
416#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE)
417
418#define XFS_BUF_ISUNINITIAL(x) (0)
419#define XFS_BUF_UNUNINITIAL(x) (0)
420
421#define XFS_BUF_BP_ISMAPPED(bp) 1
422
423#define XFS_BUF_DATAIO(x) ((x)->pb_flags |= PBF_FS_DATAIOD)
424#define XFS_BUF_UNDATAIO(x) ((x)->pb_flags &= ~PBF_FS_DATAIOD)
425
426#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone
427#define XFS_BUF_SET_IODONE_FUNC(buf, func) \
428 (buf)->pb_iodone = (func)
429#define XFS_BUF_CLR_IODONE_FUNC(buf) \
430 (buf)->pb_iodone = NULL
431#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \
432 (buf)->pb_strat = (func)
433#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \
434 (buf)->pb_strat = NULL
435
436#define XFS_BUF_FSPRIVATE(buf, type) \
437 ((type)(buf)->pb_fspriv)
438#define XFS_BUF_SET_FSPRIVATE(buf, value) \
439 (buf)->pb_fspriv = (void *)(value)
440#define XFS_BUF_FSPRIVATE2(buf, type) \
441 ((type)(buf)->pb_fspriv2)
442#define XFS_BUF_SET_FSPRIVATE2(buf, value) \
443 (buf)->pb_fspriv2 = (void *)(value)
444#define XFS_BUF_FSPRIVATE3(buf, type) \
445 ((type)(buf)->pb_fspriv3)
446#define XFS_BUF_SET_FSPRIVATE3(buf, value) \
447 (buf)->pb_fspriv3 = (void *)(value)
448#define XFS_BUF_SET_START(buf)
449
450#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
451 (buf)->pb_relse = (value)
452
453#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr)
454
455extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
456{
457 if (bp->pb_flags & PBF_MAPPED)
458 return XFS_BUF_PTR(bp) + offset;
459 return (xfs_caddr_t) pagebuf_offset(bp, offset);
460}
461
462#define XFS_BUF_SET_PTR(bp, val, count) \
463 pagebuf_associate_memory(bp, val, count)
464#define XFS_BUF_ADDR(bp) ((bp)->pb_bn)
465#define XFS_BUF_SET_ADDR(bp, blk) \
466 ((bp)->pb_bn = (xfs_daddr_t)(blk))
467#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset)
468#define XFS_BUF_SET_OFFSET(bp, off) \
469 ((bp)->pb_file_offset = (off))
470#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired)
471#define XFS_BUF_SET_COUNT(bp, cnt) \
472 ((bp)->pb_count_desired = (cnt))
473#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length)
474#define XFS_BUF_SET_SIZE(bp, cnt) \
475 ((bp)->pb_buffer_length = (cnt))
476#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
477#define XFS_BUF_SET_VTYPE(bp, type)
478#define XFS_BUF_SET_REF(bp, ref)
479
480#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp)
481
482#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp)
483#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0)
484#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp)
485#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp)
486#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
487
488/* setup the buffer target from a buftarg structure */
489#define XFS_BUF_SET_TARGET(bp, target) \
490 (bp)->pb_target = (target)
491#define XFS_BUF_TARGET(bp) ((bp)->pb_target)
492#define XFS_BUFTARG_NAME(target) \
493 pagebuf_target_name(target)
494
495#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
496#define XFS_BUF_SET_VTYPE(bp, type)
497#define XFS_BUF_SET_REF(bp, ref)
498
499static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
500{
501 bp->pb_fspriv3 = mp;
502 bp->pb_strat = xfs_bdstrat_cb;
503 pagebuf_delwri_dequeue(bp);
504 return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES);
505}
506
507static inline void xfs_buf_relse(xfs_buf_t *bp)
508{
509 if (!bp->pb_relse)
510 pagebuf_unlock(bp);
511 pagebuf_rele(bp);
512}
513
514#define xfs_bpin(bp) pagebuf_pin(bp)
515#define xfs_bunpin(bp) pagebuf_unpin(bp)
516
517#define xfs_buftrace(id, bp) \
518 pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
519
520#define xfs_biodone(pb) \
521 pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), 0)
522
523#define xfs_biomove(pb, off, len, data, rw) \
524 pagebuf_iomove((pb), (off), (len), (data), \
525 ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
526
527#define xfs_biozero(pb, off, len) \
528 pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
529
530
531static inline int XFS_bwrite(xfs_buf_t *pb)
532{
533 int iowait = (pb->pb_flags & PBF_ASYNC) == 0;
534 int error = 0;
535
536 if (!iowait)
537 pb->pb_flags |= _PBF_RUN_QUEUES;
538
539 pagebuf_delwri_dequeue(pb);
540 pagebuf_iostrategy(pb);
541 if (iowait) {
542 error = pagebuf_iowait(pb);
543 xfs_buf_relse(pb);
544 }
545 return error;
546}
547
548#define XFS_bdwrite(pb) \
549 pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
550
551static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
552{
553 bp->pb_strat = xfs_bdstrat_cb;
554 bp->pb_fspriv3 = mp;
555
556 return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
557}
558
559#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
560
561#define xfs_iowait(pb) pagebuf_iowait(pb)
562
563#define xfs_baread(target, rablkno, ralen) \
564 pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK)
565
566#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target))
567#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target))
568#define xfs_buf_free(bp) pagebuf_free(bp)
569
570
571/*
572 * Handling of buftargs.
573 */
574
575extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
576extern void xfs_free_buftarg(xfs_buftarg_t *, int);
577extern void xfs_wait_buftarg(xfs_buftarg_t *);
578extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
579extern void xfs_incore_relse(xfs_buftarg_t *, int, int);
580extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
581
582#define xfs_getsize_buftarg(buftarg) \
583 block_size((buftarg)->pbr_bdev)
584#define xfs_readonly_buftarg(buftarg) \
585 bdev_read_only((buftarg)->pbr_bdev)
586#define xfs_binval(buftarg) \
587 xfs_flush_buftarg(buftarg, 1)
588#define XFS_bflush(buftarg) \
589 xfs_flush_buftarg(buftarg, 1)
590
591#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
new file mode 100644
index 000000000000..00c45849d41a
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -0,0 +1,50 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CRED_H__
33#define __XFS_CRED_H__
34
35/*
36 * Credentials
37 */
38typedef struct cred {
39 /* EMPTY */
40} cred_t;
41
42extern struct cred *sys_cred;
43
 44/* This is a hack (assumes sys_cred is the only cred_t in the system) */
45static __inline int capable_cred(cred_t *cr, int cid)
46{
47 return (cr == sys_cred) ? 1 : capable(cid);
48}
49
50#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
new file mode 100644
index 000000000000..f372a1a5e168
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -0,0 +1,205 @@
1/*
2 * Copyright (c) 2004-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_types.h"
35#include "xfs_dmapi.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_mount.h"
41#include "xfs_export.h"
42
43/*
 44 * XFS encodes and decodes the fileid portion of NFS filehandles
45 * itself instead of letting the generic NFS code do it. This
46 * allows filesystems with 64 bit inode numbers to be exported.
47 *
48 * Note that a side effect is that xfs_vget() won't be passed a
 49 * zero inode/generation pair under normal circumstances. However,
 50 * as a malicious client could send us such data, the check
 51 * remains in that code.
52 */
53
54
55STATIC struct dentry *
56linvfs_decode_fh(
57 struct super_block *sb,
58 __u32 *fh,
59 int fh_len,
60 int fileid_type,
61 int (*acceptable)(
62 void *context,
63 struct dentry *de),
64 void *context)
65{
66 xfs_fid2_t ifid;
67 xfs_fid2_t pfid;
68 void *parent = NULL;
69 int is64 = 0;
70 __u32 *p = fh;
71
72#if XFS_BIG_INUMS
73 is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG);
74 fileid_type &= ~XFS_FILEID_TYPE_64FLAG;
75#endif
76
77 /*
78 * Note that we only accept fileids which are long enough
79 * rather than allow the parent generation number to default
 80 * to zero. XFS considers zero a valid generation number, not
81 * an invalid/wildcard value. There's little point printk'ing
82 * a warning here as we don't have the client information
83 * which would make such a warning useful.
84 */
85 if (fileid_type > 2 ||
86 fh_len < xfs_fileid_length((fileid_type == 2), is64))
87 return NULL;
88
89 p = xfs_fileid_decode_fid2(p, &ifid, is64);
90
91 if (fileid_type == 2) {
92 p = xfs_fileid_decode_fid2(p, &pfid, is64);
93 parent = &pfid;
94 }
95
96 fh = (__u32 *)&ifid;
97 return find_exported_dentry(sb, fh, parent, acceptable, context);
98}
99
100
101STATIC int
102linvfs_encode_fh(
103 struct dentry *dentry,
104 __u32 *fh,
105 int *max_len,
106 int connectable)
107{
108 struct inode *inode = dentry->d_inode;
109 int type = 1;
110 __u32 *p = fh;
111 int len;
112 int is64 = 0;
113#if XFS_BIG_INUMS
114 vfs_t *vfs = LINVFS_GET_VFS(inode->i_sb);
115 xfs_mount_t *mp = XFS_VFSTOM(vfs);
116
117 if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT)) {
118 /* filesystem may contain 64bit inode numbers */
119 is64 = XFS_FILEID_TYPE_64FLAG;
120 }
121#endif
122
 123	/* Directories don't need their parent encoded; they have ".." */
124 if (S_ISDIR(inode->i_mode))
125 connectable = 0;
126
127 /*
128 * Only encode if there is enough space given. In practice
129 * this means we can't export a filesystem with 64bit inodes
130 * over NFSv2 with the subtree_check export option; the other
131 * seven combinations work. The real answer is "don't use v2".
132 */
133 len = xfs_fileid_length(connectable, is64);
134 if (*max_len < len)
135 return 255;
136 *max_len = len;
137
138 p = xfs_fileid_encode_inode(p, inode, is64);
139 if (connectable) {
140 spin_lock(&dentry->d_lock);
141 p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64);
142 spin_unlock(&dentry->d_lock);
143 type = 2;
144 }
145 BUG_ON((p - fh) != len);
146 return type | is64;
147}
148
149STATIC struct dentry *
150linvfs_get_dentry(
151 struct super_block *sb,
152 void *data)
153{
154 vnode_t *vp;
155 struct inode *inode;
156 struct dentry *result;
157 vfs_t *vfsp = LINVFS_GET_VFS(sb);
158 int error;
159
160 VFS_VGET(vfsp, &vp, (fid_t *)data, error);
161 if (error || vp == NULL)
 162		return ERR_PTR(-ESTALE);
163
164 inode = LINVFS_GET_IP(vp);
165 result = d_alloc_anon(inode);
166 if (!result) {
167 iput(inode);
168 return ERR_PTR(-ENOMEM);
169 }
170 return result;
171}
172
173STATIC struct dentry *
174linvfs_get_parent(
175 struct dentry *child)
176{
177 int error;
178 vnode_t *vp, *cvp;
179 struct dentry *parent;
180 struct dentry dotdot;
181
182 dotdot.d_name.name = "..";
183 dotdot.d_name.len = 2;
184 dotdot.d_inode = NULL;
185
186 cvp = NULL;
187 vp = LINVFS_GET_VP(child->d_inode);
188 VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
189 if (unlikely(error))
190 return ERR_PTR(-error);
191
192 parent = d_alloc_anon(LINVFS_GET_IP(cvp));
193 if (unlikely(!parent)) {
194 VN_RELE(cvp);
195 return ERR_PTR(-ENOMEM);
196 }
197 return parent;
198}
199
200struct export_operations linvfs_export_ops = {
201 .decode_fh = linvfs_decode_fh,
202 .encode_fh = linvfs_encode_fh,
203 .get_parent = linvfs_get_parent,
204 .get_dentry = linvfs_get_dentry,
205};
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h
new file mode 100644
index 000000000000..60b2abac1c18
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.h
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_EXPORT_H__
33#define __XFS_EXPORT_H__
34
35/*
36 * Common defines for code related to exporting XFS filesystems over NFS.
37 *
38 * The NFS fileid goes out on the wire as an array of
39 * 32bit unsigned ints in host order. There are 5 possible
40 * formats.
41 *
42 * (1) fileid_type=0x00
43 * (no fileid data; handled by the generic code)
44 *
45 * (2) fileid_type=0x01
46 * inode-num
47 * generation
48 *
49 * (3) fileid_type=0x02
50 * inode-num
51 * generation
52 * parent-inode-num
53 * parent-generation
54 *
55 * (4) fileid_type=0x81
56 * inode-num-lo32
57 * inode-num-hi32
58 * generation
59 *
60 * (5) fileid_type=0x82
61 * inode-num-lo32
62 * inode-num-hi32
63 * generation
64 * parent-inode-num-lo32
65 * parent-inode-num-hi32
66 * parent-generation
67 *
68 * Note, the NFS filehandle also includes an fsid portion which
69 * may have an inode number in it. That number is hardcoded to
70 * 32bits and there is no way for XFS to intercept it. In
 71 * practice this means when exporting an XFS filesystem with 64bit
72 * inodes you should either export the mountpoint (rather than
73 * a subdirectory) or use the "fsid" export option.
74 */
75
76/* This flag goes on the wire. Don't play with it. */
77#define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */
78
79/* Calculate the length in u32 units of the fileid data */
80static inline int
81xfs_fileid_length(int hasparent, int is64)
82{
83 return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
84}
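
Working through xfs_fileid_length() against the format table above gives the expected word count for each on-the-wire variant. A standalone C sketch verifying those counts; the local fileid_length copy below simply mirrors the inline above for illustration:

#include <assert.h>

static int fileid_length(int hasparent, int is64)
{
	return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2);
}

int main(void)
{
	assert(fileid_length(0, 0) == 2);	/* type 0x01: ino, gen */
	assert(fileid_length(1, 0) == 4);	/* type 0x02: + parent ino, gen */
	assert(fileid_length(0, 1) == 3);	/* type 0x81: ino-lo32, ino-hi32, gen */
	assert(fileid_length(1, 1) == 6);	/* type 0x82: + parent lo32/hi32, gen */
	return 0;
}
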
85
86/*
87 * Decode encoded inode information (either for the inode itself
88 * or the parent) into an xfs_fid2_t structure. Advances and
 90 * returns the new data pointer.
90 */
91static inline __u32 *
92xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64)
93{
94 fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len);
95 fid->fid_pad = 0;
96 fid->fid_ino = *p++;
97#if XFS_BIG_INUMS
98 if (is64)
99 fid->fid_ino |= (((__u64)(*p++)) << 32);
100#endif
101 fid->fid_gen = *p++;
102 return p;
103}
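
The 64-bit branch above splices two 32-bit wire words back into a single inode number: the low word first, then the high word shifted up by 32. A small standalone C example of that recombination, using made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t wire[3] = { 0x89abcdefu, 0x01234567u, 42u };	/* ino-lo, ino-hi, gen */
	uint64_t ino;

	ino = (uint64_t)wire[0] | ((uint64_t)wire[1] << 32);
	assert(ino == 0x0123456789abcdefULL);
	assert(wire[2] == 42u);		/* generation follows the inode words */
	return 0;
}
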
104
105/*
106 * Encode inode information (either for the inode itself or the
107 * parent) into a fileid buffer. Advances and returns the new
108 * data pointer.
109 */
110static inline __u32 *
111xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64)
112{
113 *p++ = (__u32)inode->i_ino;
114#if XFS_BIG_INUMS
115 if (is64)
116 *p++ = (__u32)(inode->i_ino >> 32);
117#endif
118 *p++ = inode->i_generation;
119 return p;
120}
121
122#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
new file mode 100644
index 000000000000..9f057a4a5b06
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -0,0 +1,573 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_inum.h"
35#include "xfs_log.h"
36#include "xfs_sb.h"
37#include "xfs_dir.h"
38#include "xfs_dir2.h"
39#include "xfs_trans.h"
40#include "xfs_dmapi.h"
41#include "xfs_mount.h"
42#include "xfs_bmap_btree.h"
43#include "xfs_alloc_btree.h"
44#include "xfs_ialloc_btree.h"
45#include "xfs_alloc.h"
46#include "xfs_btree.h"
47#include "xfs_attr_sf.h"
48#include "xfs_dir_sf.h"
49#include "xfs_dir2_sf.h"
50#include "xfs_dinode.h"
51#include "xfs_inode.h"
52#include "xfs_error.h"
53#include "xfs_rw.h"
54#include "xfs_ioctl32.h"
55
56#include <linux/dcache.h>
57#include <linux/smp_lock.h>
58
59static struct vm_operations_struct linvfs_file_vm_ops;
60
61
62STATIC inline ssize_t
63__linvfs_read(
64 struct kiocb *iocb,
65 char __user *buf,
66 int ioflags,
67 size_t count,
68 loff_t pos)
69{
70 struct iovec iov = {buf, count};
71 struct file *file = iocb->ki_filp;
72 vnode_t *vp = LINVFS_GET_VP(file->f_dentry->d_inode);
73 ssize_t rval;
74
75 BUG_ON(iocb->ki_pos != pos);
76
77 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT;
79 VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
80 return rval;
81}
82
83
84STATIC ssize_t
85linvfs_aio_read(
86 struct kiocb *iocb,
87 char __user *buf,
88 size_t count,
89 loff_t pos)
90{
91 return __linvfs_read(iocb, buf, IO_ISAIO, count, pos);
92}
93
94STATIC ssize_t
95linvfs_aio_read_invis(
96 struct kiocb *iocb,
97 char __user *buf,
98 size_t count,
99 loff_t pos)
100{
101 return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
102}
103
104
105STATIC inline ssize_t
106__linvfs_write(
107 struct kiocb *iocb,
108 const char __user *buf,
109 int ioflags,
110 size_t count,
111 loff_t pos)
112{
113 struct iovec iov = {(void __user *)buf, count};
114 struct file *file = iocb->ki_filp;
115 struct inode *inode = file->f_mapping->host;
116 vnode_t *vp = LINVFS_GET_VP(inode);
117 ssize_t rval;
118
119 BUG_ON(iocb->ki_pos != pos);
120 if (unlikely(file->f_flags & O_DIRECT))
121 ioflags |= IO_ISDIRECT;
122
123 VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, rval);
124 return rval;
125}
126
127
128STATIC ssize_t
129linvfs_aio_write(
130 struct kiocb *iocb,
131 const char __user *buf,
132 size_t count,
133 loff_t pos)
134{
135 return __linvfs_write(iocb, buf, IO_ISAIO, count, pos);
136}
137
138STATIC ssize_t
139linvfs_aio_write_invis(
140 struct kiocb *iocb,
141 const char __user *buf,
142 size_t count,
143 loff_t pos)
144{
145 return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
146}
147
148
149STATIC inline ssize_t
150__linvfs_readv(
151 struct file *file,
152 const struct iovec *iov,
153 int ioflags,
154 unsigned long nr_segs,
155 loff_t *ppos)
156{
157 struct inode *inode = file->f_mapping->host;
158 vnode_t *vp = LINVFS_GET_VP(inode);
159 struct kiocb kiocb;
160 ssize_t rval;
161
162 init_sync_kiocb(&kiocb, file);
163 kiocb.ki_pos = *ppos;
164
165 if (unlikely(file->f_flags & O_DIRECT))
166 ioflags |= IO_ISDIRECT;
167 VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
168
169 *ppos = kiocb.ki_pos;
170 return rval;
171}
172
173STATIC ssize_t
174linvfs_readv(
175 struct file *file,
176 const struct iovec *iov,
177 unsigned long nr_segs,
178 loff_t *ppos)
179{
180 return __linvfs_readv(file, iov, 0, nr_segs, ppos);
181}
182
183STATIC ssize_t
184linvfs_readv_invis(
185 struct file *file,
186 const struct iovec *iov,
187 unsigned long nr_segs,
188 loff_t *ppos)
189{
190 return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos);
191}
192
193
194STATIC inline ssize_t
195__linvfs_writev(
196 struct file *file,
197 const struct iovec *iov,
198 int ioflags,
199 unsigned long nr_segs,
200 loff_t *ppos)
201{
202 struct inode *inode = file->f_mapping->host;
203 vnode_t *vp = LINVFS_GET_VP(inode);
204 struct kiocb kiocb;
205 ssize_t rval;
206
207 init_sync_kiocb(&kiocb, file);
208 kiocb.ki_pos = *ppos;
209 if (unlikely(file->f_flags & O_DIRECT))
210 ioflags |= IO_ISDIRECT;
211
212 VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, rval);
213
214 *ppos = kiocb.ki_pos;
215 return rval;
216}
217
218
219STATIC ssize_t
220linvfs_writev(
221 struct file *file,
222 const struct iovec *iov,
223 unsigned long nr_segs,
224 loff_t *ppos)
225{
226 return __linvfs_writev(file, iov, 0, nr_segs, ppos);
227}
228
229STATIC ssize_t
230linvfs_writev_invis(
231 struct file *file,
232 const struct iovec *iov,
233 unsigned long nr_segs,
234 loff_t *ppos)
235{
236 return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos);
237}
238
239STATIC ssize_t
240linvfs_sendfile(
241 struct file *filp,
242 loff_t *ppos,
243 size_t count,
244 read_actor_t actor,
245 void *target)
246{
247 vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
248 ssize_t rval;
249
250 VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval);
251 return rval;
252}
253
254
255STATIC int
256linvfs_open(
257 struct inode *inode,
258 struct file *filp)
259{
260 vnode_t *vp = LINVFS_GET_VP(inode);
261 int error;
262
263 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
264 return -EFBIG;
265
266 ASSERT(vp);
267 VOP_OPEN(vp, NULL, error);
268 return -error;
269}
270
271
272STATIC int
273linvfs_release(
274 struct inode *inode,
275 struct file *filp)
276{
277 vnode_t *vp = LINVFS_GET_VP(inode);
278 int error = 0;
279
280 if (vp)
281 VOP_RELEASE(vp, error);
282 return -error;
283}
284
285
286STATIC int
287linvfs_fsync(
288 struct file *filp,
289 struct dentry *dentry,
290 int datasync)
291{
292 struct inode *inode = dentry->d_inode;
293 vnode_t *vp = LINVFS_GET_VP(inode);
294 int error;
295 int flags = FSYNC_WAIT;
296
297 if (datasync)
298 flags |= FSYNC_DATA;
299
300 ASSERT(vp);
301 VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error);
302 return -error;
303}
304
305/*
306 * linvfs_readdir maps to VOP_READDIR().
307 * We need to build a uio, cred, ...
308 */
309
310#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
311
312STATIC int
313linvfs_readdir(
314 struct file *filp,
315 void *dirent,
316 filldir_t filldir)
317{
318 int error = 0;
319 vnode_t *vp;
320 uio_t uio;
321 iovec_t iov;
322 int eof = 0;
323 caddr_t read_buf;
324 int namelen, size = 0;
325 size_t rlen = PAGE_CACHE_SIZE;
326 xfs_off_t start_offset, curr_offset;
327 xfs_dirent_t *dbp = NULL;
328
329 vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
330 ASSERT(vp);
331
332 /* Try fairly hard to get memory */
333 do {
334 if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
335 break;
336 rlen >>= 1;
337 } while (rlen >= 1024);
338
339 if (read_buf == NULL)
340 return -ENOMEM;
341
342 uio.uio_iov = &iov;
343 uio.uio_segflg = UIO_SYSSPACE;
344 curr_offset = filp->f_pos;
345 if (filp->f_pos != 0x7fffffff)
346 uio.uio_offset = filp->f_pos;
347 else
348 uio.uio_offset = 0xffffffff;
349
350 while (!eof) {
351 uio.uio_resid = iov.iov_len = rlen;
352 iov.iov_base = read_buf;
353 uio.uio_iovcnt = 1;
354
355 start_offset = uio.uio_offset;
356
357 VOP_READDIR(vp, &uio, NULL, &eof, error);
358 if ((uio.uio_offset == start_offset) || error) {
359 size = 0;
360 break;
361 }
362
363 size = rlen - uio.uio_resid;
364 dbp = (xfs_dirent_t *)read_buf;
365 while (size > 0) {
366 namelen = strlen(dbp->d_name);
367
368 if (filldir(dirent, dbp->d_name, namelen,
369 (loff_t) curr_offset & 0x7fffffff,
370 (ino_t) dbp->d_ino,
371 DT_UNKNOWN)) {
372 goto done;
373 }
374 size -= dbp->d_reclen;
375 curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */;
376 dbp = nextdp(dbp);
377 }
378 }
379done:
380 if (!error) {
381 if (size == 0)
382 filp->f_pos = uio.uio_offset & 0x7fffffff;
383 else if (dbp)
384 filp->f_pos = curr_offset;
385 }
386
387 kfree(read_buf);
388 return -error;
389}
390
391
392STATIC int
393linvfs_file_mmap(
394 struct file *filp,
395 struct vm_area_struct *vma)
396{
397 struct inode *ip = filp->f_dentry->d_inode;
398 vnode_t *vp = LINVFS_GET_VP(ip);
399 vattr_t va = { .va_mask = XFS_AT_UPDATIME };
400 int error;
401
402 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
403 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
404
405 error = -XFS_SEND_MMAP(mp, vma, 0);
406 if (error)
407 return error;
408 }
409
410 vma->vm_ops = &linvfs_file_vm_ops;
411
412 VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
413 if (!error)
414 vn_revalidate(vp); /* update Linux inode flags */
415 return 0;
416}
417
418
419STATIC long
420linvfs_ioctl(
421 struct file *filp,
422 unsigned int cmd,
423 unsigned long arg)
424{
425 int error;
426 struct inode *inode = filp->f_dentry->d_inode;
427 vnode_t *vp = LINVFS_GET_VP(inode);
428
429 VOP_IOCTL(vp, inode, filp, 0, cmd, (void __user *)arg, error);
430 VMODIFY(vp);
431
 432	/* NOTE: some of the ioctls return positive numbers as a
433 * byte count indicating success, such as
434 * readlink_by_handle. So we don't "sign flip"
435 * like most other routines. This means true
436 * errors need to be returned as a negative value.
437 */
438 return error;
439}
440
441STATIC long
442linvfs_ioctl_invis(
443 struct file *filp,
444 unsigned int cmd,
445 unsigned long arg)
446{
447 int error;
448 struct inode *inode = filp->f_dentry->d_inode;
449 vnode_t *vp = LINVFS_GET_VP(inode);
450
451 ASSERT(vp);
452 VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error);
453 VMODIFY(vp);
454
 455	/* NOTE: some of the ioctls return positive numbers as a
456 * byte count indicating success, such as
457 * readlink_by_handle. So we don't "sign flip"
458 * like most other routines. This means true
459 * errors need to be returned as a negative value.
460 */
461 return error;
462}
463
464#ifdef HAVE_VMOP_MPROTECT
465STATIC int
466linvfs_mprotect(
467 struct vm_area_struct *vma,
468 unsigned int newflags)
469{
470 vnode_t *vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
471 int error = 0;
472
473 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
474 if ((vma->vm_flags & VM_MAYSHARE) &&
475 (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) {
476 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
477
478 error = XFS_SEND_MMAP(mp, vma, VM_WRITE);
479 }
480 }
481 return error;
482}
483#endif /* HAVE_VMOP_MPROTECT */
484
485#ifdef HAVE_FOP_OPEN_EXEC
486/* If the user is attempting to execute a file that is offline then
 487 * we have to trigger a DMAPI READ event before the file is marked as busy;
 488 * otherwise the invisible I/O will not be able to write to the file to bring
489 * it back online.
490 */
491STATIC int
492linvfs_open_exec(
493 struct inode *inode)
494{
495 vnode_t *vp = LINVFS_GET_VP(inode);
496 xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp);
497 int error = 0;
498 bhv_desc_t *bdp;
499 xfs_inode_t *ip;
500
501 if (vp->v_vfsp->vfs_flag & VFS_DMI) {
502 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
503 if (!bdp) {
504 error = -EINVAL;
505 goto open_exec_out;
506 }
507 ip = XFS_BHVTOI(bdp);
508 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) {
509 error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp,
510 0, 0, 0, NULL);
511 }
512 }
513open_exec_out:
514 return error;
515}
516#endif /* HAVE_FOP_OPEN_EXEC */
517
518struct file_operations linvfs_file_operations = {
519 .llseek = generic_file_llseek,
520 .read = do_sync_read,
521 .write = do_sync_write,
522 .readv = linvfs_readv,
523 .writev = linvfs_writev,
524 .aio_read = linvfs_aio_read,
525 .aio_write = linvfs_aio_write,
526 .sendfile = linvfs_sendfile,
527 .unlocked_ioctl = linvfs_ioctl,
528#ifdef CONFIG_COMPAT
529 .compat_ioctl = xfs_compat_ioctl,
530#endif
531 .mmap = linvfs_file_mmap,
532 .open = linvfs_open,
533 .release = linvfs_release,
534 .fsync = linvfs_fsync,
535#ifdef HAVE_FOP_OPEN_EXEC
536 .open_exec = linvfs_open_exec,
537#endif
538};
539
540struct file_operations linvfs_invis_file_operations = {
541 .llseek = generic_file_llseek,
542 .read = do_sync_read,
543 .write = do_sync_write,
544 .readv = linvfs_readv_invis,
545 .writev = linvfs_writev_invis,
546 .aio_read = linvfs_aio_read_invis,
547 .aio_write = linvfs_aio_write_invis,
548 .sendfile = linvfs_sendfile,
549 .unlocked_ioctl = linvfs_ioctl_invis,
550#ifdef CONFIG_COMPAT
551 .compat_ioctl = xfs_compat_invis_ioctl,
552#endif
553 .mmap = linvfs_file_mmap,
554 .open = linvfs_open,
555 .release = linvfs_release,
556 .fsync = linvfs_fsync,
557};
558
559
560struct file_operations linvfs_dir_operations = {
561 .read = generic_read_dir,
562 .readdir = linvfs_readdir,
563 .unlocked_ioctl = linvfs_ioctl,
564 .fsync = linvfs_fsync,
565};
566
567static struct vm_operations_struct linvfs_file_vm_ops = {
568 .nopage = filemap_nopage,
569 .populate = filemap_populate,
570#ifdef HAVE_VMOP_MPROTECT
571 .mprotect = linvfs_mprotect,
572#endif
573};
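/*
 * Summary of the three operation tables above: linvfs_file_operations
 * serves ordinary opens, linvfs_invis_file_operations is installed by
 * xfs_open_by_handle() so that DMAPI "invisible" I/O bypasses event
 * generation (note the IO_INVIS flag in its ioctl path), and
 * linvfs_dir_operations covers directories.
 */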
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
new file mode 100644
index 000000000000..05ebd30ec96f
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35/*
36 * Stub for no-op vnode operations that always succeed (return zero status).
37 */
38int
39fs_noerr(void)
40{
41 return 0;
42}
43
44/*
45 * Operation unsupported under this file system.
46 */
47int
48fs_nosys(void)
49{
50 return ENOSYS;
51}
52
53/*
54 * Stub for inactive, strategy, and read/write lock/unlock. Does nothing.
55 */
56/* ARGSUSED */
57void
58fs_noval(void)
59{
60}
61
62/*
63 * vnode pcache layer for vnode_tosspages.
64 * 'last' parameter unused but left in for IRIX compatibility
65 */
66void
67fs_tosspages(
68 bhv_desc_t *bdp,
69 xfs_off_t first,
70 xfs_off_t last,
71 int fiopt)
72{
73 vnode_t *vp = BHV_TO_VNODE(bdp);
74 struct inode *ip = LINVFS_GET_IP(vp);
75
76 if (VN_CACHED(vp))
77 truncate_inode_pages(ip->i_mapping, first);
78}
79
80
81/*
82 * vnode pcache layer for vnode_flushinval_pages.
83 * 'last' parameter unused but left in for IRIX compatibility
84 */
85void
86fs_flushinval_pages(
87 bhv_desc_t *bdp,
88 xfs_off_t first,
89 xfs_off_t last,
90 int fiopt)
91{
92 vnode_t *vp = BHV_TO_VNODE(bdp);
93 struct inode *ip = LINVFS_GET_IP(vp);
94
95 if (VN_CACHED(vp)) {
96 filemap_fdatawrite(ip->i_mapping);
97 filemap_fdatawait(ip->i_mapping);
98
99 truncate_inode_pages(ip->i_mapping, first);
100 }
101}
102
103/*
104 * vnode pcache layer for vnode_flush_pages.
105 * 'last' parameter unused but left in for IRIX compatibility
106 */
107int
108fs_flush_pages(
109 bhv_desc_t *bdp,
110 xfs_off_t first,
111 xfs_off_t last,
112 uint64_t flags,
113 int fiopt)
114{
115 vnode_t *vp = BHV_TO_VNODE(bdp);
116 struct inode *ip = LINVFS_GET_IP(vp);
117
118 if (VN_CACHED(vp)) {
119 filemap_fdatawrite(ip->i_mapping);
120 filemap_fdatawait(ip->i_mapping);
121 }
122
123 return 0;
124}
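/*
 * A minimal, hypothetical caller sketch illustrating the error convention
 * used by the stubs above: they return positive errnos, which the
 * Linux-facing entry points negate before returning to the VFS:
 *
 *	int example_entry(void)
 *	{
 *		int error = fs_nosys();		(positive ENOSYS)
 *		return -error;			(VFS expects -ENOSYS)
 *	}
 */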
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h
new file mode 100644
index 000000000000..2db9ddbd4567
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.h
@@ -0,0 +1,48 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_FS_SUBR_H__
33#define __XFS_FS_SUBR_H__
34
35/*
36 * Utilities shared among file system implementations.
37 */
38
39struct cred;
40
41extern int fs_noerr(void);
42extern int fs_nosys(void);
43extern void fs_noval(void);
44extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
45extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
46extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
47
48#endif /* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
new file mode 100644
index 000000000000..a6da5b4fd240
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -0,0 +1,74 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains globals needed by XFS that were normally defined
35 * somewhere else in IRIX.
36 */
37
38#include "xfs.h"
39#include "xfs_cred.h"
40#include "xfs_sysctl.h"
41
42/*
43 * System memory size - used to scale certain data structures in XFS.
44 */
45unsigned long xfs_physmem;
46
47/*
48 * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
49 * since other XFS code uses these values.  Times are measured in centisecs
50 * (i.e. 100ths of a second).
51 */
52xfs_param_t xfs_params = {
53 /* MIN DFLT MAX */
54 .restrict_chown = { 0, 1, 1 },
55 .sgid_inherit = { 0, 0, 1 },
56 .symlink_mode = { 0, 0, 1 },
57 .panic_mask = { 0, 0, 127 },
58 .error_level = { 0, 3, 11 },
59 .syncd_timer = { 1*100, 30*100, 7200*100},
60 .stats_clear = { 0, 0, 1 },
61 .inherit_sync = { 0, 1, 1 },
62 .inherit_nodump = { 0, 1, 1 },
63 .inherit_noatim = { 0, 1, 1 },
64 .xfs_buf_timer = { 100/2, 1*100, 30*100 },
65 .xfs_buf_age = { 1*100, 15*100, 7200*100},
66 .inherit_nosym = { 0, 0, 1 },
67 .rotorstep = { 1, 1, 255 },
68};
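/*
 * Example reading of the table above: .syncd_timer ranges from 1*100 to
 * 7200*100 centiseconds (1 second to 2 hours) and defaults to 30*100
 * centiseconds, i.e. 30 seconds.
 */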
69
70/*
71 * Global system credential structure.
72 */
73cred_t sys_cred_val, *sys_cred = &sys_cred_val;
74
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
new file mode 100644
index 000000000000..e81e2f38a853
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_GLOBALS_H__
33#define __XFS_GLOBALS_H__
34
35/*
36 * This file declares globals needed by XFS that were normally defined
37 * somewhere else in IRIX.
38 */
39
40extern uint64_t xfs_panic_mask; /* set to cause more panics */
41extern unsigned long xfs_physmem;
42extern struct cred *sys_cred;
43
44#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
new file mode 100644
index 000000000000..69809eef8a54
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -0,0 +1,1336 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_fs.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66#include "xfs_utils.h"
67#include "xfs_dfrag.h"
68#include "xfs_fsops.h"
69
70#include <linux/dcache.h>
71#include <linux/mount.h>
72#include <linux/namei.h>
73#include <linux/pagemap.h>
74
75/*
76 * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
77 * a file or fs handle.
78 *
79 * XFS_IOC_PATH_TO_FSHANDLE
80 * returns fs handle for a mount point or path within that mount point
81 * XFS_IOC_FD_TO_HANDLE
82 * returns full handle for a FD opened in user space
83 * XFS_IOC_PATH_TO_HANDLE
84 * returns full handle for a path
85 */
86STATIC int
87xfs_find_handle(
88 unsigned int cmd,
89 void __user *arg)
90{
91 int hsize;
92 xfs_handle_t handle;
93 xfs_fsop_handlereq_t hreq;
94 struct inode *inode;
95 struct vnode *vp;
96
97 if (copy_from_user(&hreq, arg, sizeof(hreq)))
98 return -XFS_ERROR(EFAULT);
99
100 memset((char *)&handle, 0, sizeof(handle));
101
102 switch (cmd) {
103 case XFS_IOC_PATH_TO_FSHANDLE:
104 case XFS_IOC_PATH_TO_HANDLE: {
105 struct nameidata nd;
106 int error;
107
108 error = user_path_walk_link((const char __user *)hreq.path, &nd);
109 if (error)
110 return error;
111
112 ASSERT(nd.dentry);
113 ASSERT(nd.dentry->d_inode);
114 inode = igrab(nd.dentry->d_inode);
115 path_release(&nd);
116 break;
117 }
118
119 case XFS_IOC_FD_TO_HANDLE: {
120 struct file *file;
121
122 file = fget(hreq.fd);
123 if (!file)
124 return -EBADF;
125
126 ASSERT(file->f_dentry);
127 ASSERT(file->f_dentry->d_inode);
128 inode = igrab(file->f_dentry->d_inode);
129 fput(file);
130 break;
131 }
132
133 default:
134 ASSERT(0);
135 return -XFS_ERROR(EINVAL);
136 }
137
138 if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
139 /* we're not in XFS anymore, Toto */
140 iput(inode);
141 return -XFS_ERROR(EINVAL);
142 }
143
144 /* we need the vnode */
145 vp = LINVFS_GET_VP(inode);
146 if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
147 iput(inode);
148 return -XFS_ERROR(EBADF);
149 }
150
151 /* now we can grab the fsid */
152 memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
153 hsize = sizeof(xfs_fsid_t);
154
155 if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
156 xfs_inode_t *ip;
157 bhv_desc_t *bhv;
158 int lock_mode;
159
160 /* need to get access to the xfs_inode to read the generation */
161 bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
162 ASSERT(bhv);
163 ip = XFS_BHVTOI(bhv);
164 ASSERT(ip);
165 lock_mode = xfs_ilock_map_shared(ip);
166
167 /* fill in fid section of handle from inode */
168 handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
169 sizeof(handle.ha_fid.xfs_fid_len);
170 handle.ha_fid.xfs_fid_pad = 0;
171 handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
172 handle.ha_fid.xfs_fid_ino = ip->i_ino;
173
174 xfs_iunlock_map_shared(ip, lock_mode);
175
176 hsize = XFS_HSIZE(handle);
177 }
178
179 /* now copy our handle into the user buffer & write out the size */
180 if (copy_to_user(hreq.ohandle, &handle, hsize) ||
181 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
182 iput(inode);
183 return -XFS_ERROR(EFAULT);
184 }
185
186 iput(inode);
187 return 0;
188}
189
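/*
 * A minimal userspace sketch of the handle calls above, assuming the
 * xfs_fsop_handlereq_t, xfs_handle_t and XFS_IOC_* definitions from
 * xfs_fs.h; "xfs_fd" is a hypothetical open descriptor on the XFS
 * filesystem and error handling is elided:
 *
 *	xfs_fsop_handlereq_t hreq = { 0 };
 *	xfs_handle_t handle;
 *	__u32 hlen = sizeof(handle);
 *
 *	hreq.path     = "/mnt/xfs/some/file";
 *	hreq.ohandle  = &handle;
 *	hreq.ohandlen = &hlen;
 *	if (ioctl(xfs_fd, XFS_IOC_PATH_TO_HANDLE, &hreq) < 0)
 *		perror("XFS_IOC_PATH_TO_HANDLE");
 */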
190
191/*
192 * Convert userspace handle data into vnode (and inode).
193 * We [ab]use the fact that all the fsop_handlereq ioctl calls
194 * have a data structure argument whose first component is always
195 * a xfs_fsop_handlereq_t, so we can cast to and from this type.
196 * This allows us to optimise the copy_from_user calls and gives
197 * a handy, shared routine.
198 *
199 * If no error, caller must always VN_RELE the returned vp.
200 */
201STATIC int
202xfs_vget_fsop_handlereq(
203 xfs_mount_t *mp,
204 struct inode *parinode, /* parent inode pointer */
205 xfs_fsop_handlereq_t *hreq,
206 vnode_t **vp,
207 struct inode **inode)
208{
209 void __user *hanp;
210 size_t hlen;
211 xfs_fid_t *xfid;
212 xfs_handle_t *handlep;
213 xfs_handle_t handle;
214 xfs_inode_t *ip;
215 struct inode *inodep;
216 vnode_t *vpp;
217 xfs_ino_t ino;
218 __u32 igen;
219 int error;
220
221 /*
222 * Only allow handle opens under a directory.
223 */
224 if (!S_ISDIR(parinode->i_mode))
225 return XFS_ERROR(ENOTDIR);
226
227 hanp = hreq->ihandle;
228 hlen = hreq->ihandlen;
229 handlep = &handle;
230
231 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
232 return XFS_ERROR(EINVAL);
233 if (copy_from_user(handlep, hanp, hlen))
234 return XFS_ERROR(EFAULT);
235 if (hlen < sizeof(*handlep))
236 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
237 if (hlen > sizeof(handlep->ha_fsid)) {
238 if (handlep->ha_fid.xfs_fid_len !=
239 (hlen - sizeof(handlep->ha_fsid)
240 - sizeof(handlep->ha_fid.xfs_fid_len))
241 || handlep->ha_fid.xfs_fid_pad)
242 return XFS_ERROR(EINVAL);
243 }
244
245 /*
246 * Crack the handle, obtain the inode # & generation #
247 */
248 xfid = (struct xfs_fid *)&handlep->ha_fid;
249 if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
250 ino = xfid->xfs_fid_ino;
251 igen = xfid->xfs_fid_gen;
252 } else {
253 return XFS_ERROR(EINVAL);
254 }
255
256 /*
257 * Get the XFS inode, building a vnode to go with it.
258 */
259 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
260 if (error)
261 return error;
262 if (ip == NULL)
263 return XFS_ERROR(EIO);
264 if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
265 xfs_iput_new(ip, XFS_ILOCK_SHARED);
266 return XFS_ERROR(ENOENT);
267 }
268
269 vpp = XFS_ITOV(ip);
270 inodep = LINVFS_GET_IP(vpp);
271 xfs_iunlock(ip, XFS_ILOCK_SHARED);
272
273 *vp = vpp;
274 *inode = inodep;
275 return 0;
276}
277
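/*
 * The layout assumption above, illustrated with one of the xfs_fs.h
 * request structures (paraphrased; the generic hreq must come first):
 *
 *	typedef struct xfs_fsop_attrlist_handlereq {
 *		struct xfs_fsop_handlereq hreq;		(first member)
 *		struct xfs_attrlist_cursor pos;
 *		__u32 flags;
 *		__u32 buflen;
 *		void __user *buffer;
 *	} xfs_fsop_attrlist_handlereq_t;
 */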
278STATIC int
279xfs_open_by_handle(
280 xfs_mount_t *mp,
281 void __user *arg,
282 struct file *parfilp,
283 struct inode *parinode)
284{
285 int error;
286 int new_fd;
287 int permflag;
288 struct file *filp;
289 struct inode *inode;
290 struct dentry *dentry;
291 vnode_t *vp;
292 xfs_fsop_handlereq_t hreq;
293
294 if (!capable(CAP_SYS_ADMIN))
295 return -XFS_ERROR(EPERM);
296 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
297 return -XFS_ERROR(EFAULT);
298
299 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
300 if (error)
301 return -error;
302
303 /* Restrict xfs_open_by_handle to directories & regular files. */
304 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
305 iput(inode);
306 return -XFS_ERROR(EINVAL);
307 }
308
309#if BITS_PER_LONG != 32
310 hreq.oflags |= O_LARGEFILE;
311#endif
312 /* Put open permission in namei format. */
313 permflag = hreq.oflags;
314 if ((permflag+1) & O_ACCMODE)
315 permflag++;
316 if (permflag & O_TRUNC)
317 permflag |= 2;
318
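	/*
	 * The "+1" trick above maps the open(2) access-mode encoding onto
	 * FMODE-style bits: O_RDONLY (0) -> 1 (read), O_WRONLY (1) -> 2
	 * (write), O_RDWR (2) -> 3 (read|write); O_TRUNC additionally
	 * implies write permission (the "|= 2").
	 */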
319 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
320 (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
321 iput(inode);
322 return -XFS_ERROR(EPERM);
323 }
324
325 if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
326 iput(inode);
327 return -XFS_ERROR(EACCES);
328 }
329
330 /* Can't write directories. */
331 if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
332 iput(inode);
333 return -XFS_ERROR(EISDIR);
334 }
335
336 if ((new_fd = get_unused_fd()) < 0) {
337 iput(inode);
338 return new_fd;
339 }
340
341 dentry = d_alloc_anon(inode);
342 if (dentry == NULL) {
343 iput(inode);
344 put_unused_fd(new_fd);
345 return -XFS_ERROR(ENOMEM);
346 }
347
348 /* Ensure umount returns EBUSY on umounts while this file is open. */
349 mntget(parfilp->f_vfsmnt);
350
351 /* Create file pointer. */
352 filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
353 if (IS_ERR(filp)) {
354 put_unused_fd(new_fd);
355 return -XFS_ERROR(-PTR_ERR(filp));
356 }
357	if (S_ISREG(inode->i_mode))
358 filp->f_op = &linvfs_invis_file_operations;
359
360 fd_install(new_fd, filp);
361 return new_fd;
362}
363
364STATIC int
365xfs_readlink_by_handle(
366 xfs_mount_t *mp,
367 void __user *arg,
368 struct file *parfilp,
369 struct inode *parinode)
370{
371 int error;
372 struct iovec aiov;
373 struct uio auio;
374 struct inode *inode;
375 xfs_fsop_handlereq_t hreq;
376 vnode_t *vp;
377 __u32 olen;
378
379 if (!capable(CAP_SYS_ADMIN))
380 return -XFS_ERROR(EPERM);
381 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
382 return -XFS_ERROR(EFAULT);
383
384 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode);
385 if (error)
386 return -error;
387
388 /* Restrict this handle operation to symlinks only. */
389 if (vp->v_type != VLNK) {
390 VN_RELE(vp);
391 return -XFS_ERROR(EINVAL);
392 }
393
394 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
395 VN_RELE(vp);
396 return -XFS_ERROR(EFAULT);
397 }
398 aiov.iov_len = olen;
399 aiov.iov_base = hreq.ohandle;
400
401 auio.uio_iov = &aiov;
402 auio.uio_iovcnt = 1;
403 auio.uio_offset = 0;
404 auio.uio_segflg = UIO_USERSPACE;
405 auio.uio_resid = olen;
406
407 VOP_READLINK(vp, &auio, IO_INVIS, NULL, error);
408
409 VN_RELE(vp);
410 return (olen - auio.uio_resid);
411}
412
413STATIC int
414xfs_fssetdm_by_handle(
415 xfs_mount_t *mp,
416 void __user *arg,
417 struct file *parfilp,
418 struct inode *parinode)
419{
420 int error;
421 struct fsdmidata fsd;
422 xfs_fsop_setdm_handlereq_t dmhreq;
423 struct inode *inode;
424 bhv_desc_t *bdp;
425 vnode_t *vp;
426
427 if (!capable(CAP_MKNOD))
428 return -XFS_ERROR(EPERM);
429 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
430 return -XFS_ERROR(EFAULT);
431
432 error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode);
433 if (error)
434 return -error;
435
436 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
437 VN_RELE(vp);
438 return -XFS_ERROR(EPERM);
439 }
440
441 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
442 VN_RELE(vp);
443 return -XFS_ERROR(EFAULT);
444 }
445
446 bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
447 error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
448
449 VN_RELE(vp);
450 if (error)
451 return -error;
452 return 0;
453}
454
455STATIC int
456xfs_attrlist_by_handle(
457 xfs_mount_t *mp,
458 void __user *arg,
459 struct file *parfilp,
460 struct inode *parinode)
461{
462 int error;
463 attrlist_cursor_kern_t *cursor;
464 xfs_fsop_attrlist_handlereq_t al_hreq;
465 struct inode *inode;
466 vnode_t *vp;
467 char *kbuf;
468
469 if (!capable(CAP_SYS_ADMIN))
470 return -XFS_ERROR(EPERM);
471 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
472 return -XFS_ERROR(EFAULT);
473 if (al_hreq.buflen > XATTR_LIST_MAX)
474 return -XFS_ERROR(EINVAL);
475
476 error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq,
477 &vp, &inode);
478 if (error)
479 goto out;
480	error = ENOMEM;
481	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
482	if (!kbuf)
483		goto out_vn_rele;
484
485 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
486 VOP_ATTR_LIST(vp, kbuf, al_hreq.buflen, al_hreq.flags,
487 cursor, NULL, error);
488 if (error)
489 goto out_kfree;
490
491 if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
492		error = EFAULT;
493
494 out_kfree:
495 kfree(kbuf);
496 out_vn_rele:
497 VN_RELE(vp);
498 out:
499 return -error;
500}
501
502STATIC int
503xfs_attrmulti_attr_get(
504 struct vnode *vp,
505 char *name,
506 char __user *ubuf,
507 __uint32_t *len,
508 __uint32_t flags)
509{
510 char *kbuf;
511 int error = EFAULT;
512
513 if (*len > XATTR_SIZE_MAX)
514 return EINVAL;
515 kbuf = kmalloc(*len, GFP_KERNEL);
516 if (!kbuf)
517 return ENOMEM;
518
519 VOP_ATTR_GET(vp, name, kbuf, len, flags, NULL, error);
520 if (error)
521 goto out_kfree;
522
523 if (copy_to_user(ubuf, kbuf, *len))
524 error = EFAULT;
525
526 out_kfree:
527 kfree(kbuf);
528 return error;
529}
530
531STATIC int
532xfs_attrmulti_attr_set(
533 struct vnode *vp,
534 char *name,
535 const char __user *ubuf,
536 __uint32_t len,
537 __uint32_t flags)
538{
539 char *kbuf;
540 int error = EFAULT;
541
542 if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
543 return EPERM;
544 if (len > XATTR_SIZE_MAX)
545 return EINVAL;
546
547 kbuf = kmalloc(len, GFP_KERNEL);
548 if (!kbuf)
549 return ENOMEM;
550
551 if (copy_from_user(kbuf, ubuf, len))
552 goto out_kfree;
553
554 VOP_ATTR_SET(vp, name, kbuf, len, flags, NULL, error);
555
556 out_kfree:
557 kfree(kbuf);
558 return error;
559}
560
561STATIC int
562xfs_attrmulti_attr_remove(
563 struct vnode *vp,
564 char *name,
565 __uint32_t flags)
566{
567 int error;
568
569 if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode))
570 return EPERM;
571
572 VOP_ATTR_REMOVE(vp, name, flags, NULL, error);
573 return error;
574}
575
576STATIC int
577xfs_attrmulti_by_handle(
578 xfs_mount_t *mp,
579 void __user *arg,
580 struct file *parfilp,
581 struct inode *parinode)
582{
583 int error;
584 xfs_attr_multiop_t *ops;
585 xfs_fsop_attrmulti_handlereq_t am_hreq;
586 struct inode *inode;
587 vnode_t *vp;
588 unsigned int i, size;
589 char *attr_name;
590
591 if (!capable(CAP_SYS_ADMIN))
592 return -XFS_ERROR(EPERM);
593 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
594 return -XFS_ERROR(EFAULT);
595
596 error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode);
597 if (error)
598 goto out;
599
600 error = E2BIG;
601 size = am_hreq.opcount * sizeof(attr_multiop_t);
602 if (!size || size > 16 * PAGE_SIZE)
603 goto out_vn_rele;
604
605 error = ENOMEM;
606 ops = kmalloc(size, GFP_KERNEL);
607 if (!ops)
608 goto out_vn_rele;
609
610 error = EFAULT;
611 if (copy_from_user(ops, am_hreq.ops, size))
612 goto out_kfree_ops;
613
614 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
615 if (!attr_name)
616 goto out_kfree_ops;
617
618
619 error = 0;
620 for (i = 0; i < am_hreq.opcount; i++) {
621 ops[i].am_error = strncpy_from_user(attr_name,
622 ops[i].am_attrname, MAXNAMELEN);
623 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
624			error = ERANGE;
625 if (ops[i].am_error < 0)
626 break;
627
628 switch (ops[i].am_opcode) {
629 case ATTR_OP_GET:
630 ops[i].am_error = xfs_attrmulti_attr_get(vp,
631 attr_name, ops[i].am_attrvalue,
632 &ops[i].am_length, ops[i].am_flags);
633 break;
634 case ATTR_OP_SET:
635 ops[i].am_error = xfs_attrmulti_attr_set(vp,
636 attr_name, ops[i].am_attrvalue,
637 ops[i].am_length, ops[i].am_flags);
638 break;
639 case ATTR_OP_REMOVE:
640 ops[i].am_error = xfs_attrmulti_attr_remove(vp,
641 attr_name, ops[i].am_flags);
642 break;
643 default:
644 ops[i].am_error = EINVAL;
645 }
646 }
647
648 if (copy_to_user(am_hreq.ops, ops, size))
649 error = XFS_ERROR(EFAULT);
650
651 kfree(attr_name);
652 out_kfree_ops:
653 kfree(ops);
654 out_vn_rele:
655 VN_RELE(vp);
656 out:
657 return -error;
658}
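/*
 * Note the two error channels above: setup failures come back as the
 * negated ioctl return value, while per-operation results are reported
 * through each ops[i].am_error (positive errnos, copied back to
 * userspace even when individual operations fail).
 */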
659
660/* Prototypes for a few of the stack-hungry cases that have
661 * their own functions.  Functions are defined after their use
662 * so gcc doesn't get fancy and inline them with -O3. */
663
664STATIC int
665xfs_ioc_space(
666 bhv_desc_t *bdp,
667 vnode_t *vp,
668 struct file *filp,
669 int flags,
670 unsigned int cmd,
671 void __user *arg);
672
673STATIC int
674xfs_ioc_bulkstat(
675 xfs_mount_t *mp,
676 unsigned int cmd,
677 void __user *arg);
678
679STATIC int
680xfs_ioc_fsgeometry_v1(
681 xfs_mount_t *mp,
682 void __user *arg);
683
684STATIC int
685xfs_ioc_fsgeometry(
686 xfs_mount_t *mp,
687 void __user *arg);
688
689STATIC int
690xfs_ioc_xattr(
691 vnode_t *vp,
692 xfs_inode_t *ip,
693 struct file *filp,
694 unsigned int cmd,
695 void __user *arg);
696
697STATIC int
698xfs_ioc_getbmap(
699 bhv_desc_t *bdp,
700 struct file *filp,
701 int flags,
702 unsigned int cmd,
703 void __user *arg);
704
705STATIC int
706xfs_ioc_getbmapx(
707 bhv_desc_t *bdp,
708 void __user *arg);
709
710int
711xfs_ioctl(
712 bhv_desc_t *bdp,
713 struct inode *inode,
714 struct file *filp,
715 int ioflags,
716 unsigned int cmd,
717 void __user *arg)
718{
719 int error;
720 vnode_t *vp;
721 xfs_inode_t *ip;
722 xfs_mount_t *mp;
723
724 vp = LINVFS_GET_VP(inode);
725
726 vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
727
728 ip = XFS_BHVTOI(bdp);
729 mp = ip->i_mount;
730
731 switch (cmd) {
732
733 case XFS_IOC_ALLOCSP:
734 case XFS_IOC_FREESP:
735 case XFS_IOC_RESVSP:
736 case XFS_IOC_UNRESVSP:
737 case XFS_IOC_ALLOCSP64:
738 case XFS_IOC_FREESP64:
739 case XFS_IOC_RESVSP64:
740 case XFS_IOC_UNRESVSP64:
741 /*
742 * Only allow the sys admin to reserve space unless
743 * unwritten extents are enabled.
744 */
745 if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
746 !capable(CAP_SYS_ADMIN))
747 return -EPERM;
748
749 return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg);
750
751 case XFS_IOC_DIOINFO: {
752 struct dioattr da;
753 xfs_buftarg_t *target =
754 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
755 mp->m_rtdev_targp : mp->m_ddev_targp;
756
757 da.d_mem = da.d_miniosz = 1 << target->pbr_sshift;
758 /* The size dio will do in one go */
759 da.d_maxiosz = 64 * PAGE_CACHE_SIZE;
760
761 if (copy_to_user(arg, &da, sizeof(da)))
762 return -XFS_ERROR(EFAULT);
763 return 0;
764 }
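	/*
	 * A minimal userspace sketch of the DIOINFO query above, assuming
	 * struct dioattr from xfs_fs.h and a hypothetical open XFS fd:
	 *
	 *	struct dioattr da;
	 *
	 *	if (ioctl(fd, XFS_IOC_DIOINFO, &da) == 0)
	 *		printf("O_DIRECT align %u min %u max %u\n",
	 *		       da.d_mem, da.d_miniosz, da.d_maxiosz);
	 */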
765
766 case XFS_IOC_FSBULKSTAT_SINGLE:
767 case XFS_IOC_FSBULKSTAT:
768 case XFS_IOC_FSINUMBERS:
769 return xfs_ioc_bulkstat(mp, cmd, arg);
770
771 case XFS_IOC_FSGEOMETRY_V1:
772 return xfs_ioc_fsgeometry_v1(mp, arg);
773
774 case XFS_IOC_FSGEOMETRY:
775 return xfs_ioc_fsgeometry(mp, arg);
776
777 case XFS_IOC_GETVERSION:
778 case XFS_IOC_GETXFLAGS:
779 case XFS_IOC_SETXFLAGS:
780 case XFS_IOC_FSGETXATTR:
781 case XFS_IOC_FSSETXATTR:
782 case XFS_IOC_FSGETXATTRA:
783 return xfs_ioc_xattr(vp, ip, filp, cmd, arg);
784
785 case XFS_IOC_FSSETDM: {
786 struct fsdmidata dmi;
787
788 if (copy_from_user(&dmi, arg, sizeof(dmi)))
789 return -XFS_ERROR(EFAULT);
790
791 error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
792 NULL);
793 return -error;
794 }
795
796 case XFS_IOC_GETBMAP:
797 case XFS_IOC_GETBMAPA:
798 return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg);
799
800 case XFS_IOC_GETBMAPX:
801 return xfs_ioc_getbmapx(bdp, arg);
802
803 case XFS_IOC_FD_TO_HANDLE:
804 case XFS_IOC_PATH_TO_HANDLE:
805 case XFS_IOC_PATH_TO_FSHANDLE:
806 return xfs_find_handle(cmd, arg);
807
808 case XFS_IOC_OPEN_BY_HANDLE:
809 return xfs_open_by_handle(mp, arg, filp, inode);
810
811 case XFS_IOC_FSSETDM_BY_HANDLE:
812 return xfs_fssetdm_by_handle(mp, arg, filp, inode);
813
814 case XFS_IOC_READLINK_BY_HANDLE:
815 return xfs_readlink_by_handle(mp, arg, filp, inode);
816
817 case XFS_IOC_ATTRLIST_BY_HANDLE:
818 return xfs_attrlist_by_handle(mp, arg, filp, inode);
819
820 case XFS_IOC_ATTRMULTI_BY_HANDLE:
821 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
822
823 case XFS_IOC_SWAPEXT: {
824 error = xfs_swapext((struct xfs_swapext __user *)arg);
825 return -error;
826 }
827
828 case XFS_IOC_FSCOUNTS: {
829 xfs_fsop_counts_t out;
830
831 error = xfs_fs_counts(mp, &out);
832 if (error)
833 return -error;
834
835 if (copy_to_user(arg, &out, sizeof(out)))
836 return -XFS_ERROR(EFAULT);
837 return 0;
838 }
839
840 case XFS_IOC_SET_RESBLKS: {
841 xfs_fsop_resblks_t inout;
842 __uint64_t in;
843
844 if (!capable(CAP_SYS_ADMIN))
845 return -EPERM;
846
847 if (copy_from_user(&inout, arg, sizeof(inout)))
848 return -XFS_ERROR(EFAULT);
849
850 /* input parameter is passed in resblks field of structure */
851 in = inout.resblks;
852 error = xfs_reserve_blocks(mp, &in, &inout);
853 if (error)
854 return -error;
855
856 if (copy_to_user(arg, &inout, sizeof(inout)))
857 return -XFS_ERROR(EFAULT);
858 return 0;
859 }
860
861 case XFS_IOC_GET_RESBLKS: {
862 xfs_fsop_resblks_t out;
863
864 if (!capable(CAP_SYS_ADMIN))
865 return -EPERM;
866
867 error = xfs_reserve_blocks(mp, NULL, &out);
868 if (error)
869 return -error;
870
871 if (copy_to_user(arg, &out, sizeof(out)))
872 return -XFS_ERROR(EFAULT);
873
874 return 0;
875 }
876
877 case XFS_IOC_FSGROWFSDATA: {
878 xfs_growfs_data_t in;
879
880 if (!capable(CAP_SYS_ADMIN))
881 return -EPERM;
882
883 if (copy_from_user(&in, arg, sizeof(in)))
884 return -XFS_ERROR(EFAULT);
885
886 error = xfs_growfs_data(mp, &in);
887 return -error;
888 }
889
890 case XFS_IOC_FSGROWFSLOG: {
891 xfs_growfs_log_t in;
892
893 if (!capable(CAP_SYS_ADMIN))
894 return -EPERM;
895
896 if (copy_from_user(&in, arg, sizeof(in)))
897 return -XFS_ERROR(EFAULT);
898
899 error = xfs_growfs_log(mp, &in);
900 return -error;
901 }
902
903 case XFS_IOC_FSGROWFSRT: {
904 xfs_growfs_rt_t in;
905
906 if (!capable(CAP_SYS_ADMIN))
907 return -EPERM;
908
909 if (copy_from_user(&in, arg, sizeof(in)))
910 return -XFS_ERROR(EFAULT);
911
912 error = xfs_growfs_rt(mp, &in);
913 return -error;
914 }
915
916 case XFS_IOC_FREEZE:
917 if (!capable(CAP_SYS_ADMIN))
918 return -EPERM;
919
920 if (inode->i_sb->s_frozen == SB_UNFROZEN)
921 freeze_bdev(inode->i_sb->s_bdev);
922 return 0;
923
924 case XFS_IOC_THAW:
925 if (!capable(CAP_SYS_ADMIN))
926 return -EPERM;
927 if (inode->i_sb->s_frozen != SB_UNFROZEN)
928 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
929 return 0;
930
931 case XFS_IOC_GOINGDOWN: {
932 __uint32_t in;
933
934 if (!capable(CAP_SYS_ADMIN))
935 return -EPERM;
936
937 if (get_user(in, (__uint32_t __user *)arg))
938 return -XFS_ERROR(EFAULT);
939
940 error = xfs_fs_goingdown(mp, in);
941 return -error;
942 }
943
944 case XFS_IOC_ERROR_INJECTION: {
945 xfs_error_injection_t in;
946
947 if (!capable(CAP_SYS_ADMIN))
948 return -EPERM;
949
950 if (copy_from_user(&in, arg, sizeof(in)))
951 return -XFS_ERROR(EFAULT);
952
953 error = xfs_errortag_add(in.errtag, mp);
954 return -error;
955 }
956
957 case XFS_IOC_ERROR_CLEARALL:
958 if (!capable(CAP_SYS_ADMIN))
959 return -EPERM;
960
961 error = xfs_errortag_clearall(mp);
962 return -error;
963
964 default:
965 return -ENOTTY;
966 }
967}
968
969STATIC int
970xfs_ioc_space(
971 bhv_desc_t *bdp,
972 vnode_t *vp,
973 struct file *filp,
974 int ioflags,
975 unsigned int cmd,
976 void __user *arg)
977{
978 xfs_flock64_t bf;
979 int attr_flags = 0;
980 int error;
981
982 if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
983 return -XFS_ERROR(EPERM);
984
985	if (!(filp->f_mode & FMODE_WRITE))
986 return -XFS_ERROR(EBADF);
987
988 if (vp->v_type != VREG)
989 return -XFS_ERROR(EINVAL);
990
991 if (copy_from_user(&bf, arg, sizeof(bf)))
992 return -XFS_ERROR(EFAULT);
993
994 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
995 attr_flags |= ATTR_NONBLOCK;
996 if (ioflags & IO_INVIS)
997 attr_flags |= ATTR_DMI;
998
999 error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
1000 NULL, attr_flags);
1001 return -error;
1002}
1003
1004STATIC int
1005xfs_ioc_bulkstat(
1006 xfs_mount_t *mp,
1007 unsigned int cmd,
1008 void __user *arg)
1009{
1010 xfs_fsop_bulkreq_t bulkreq;
1011 int count; /* # of records returned */
1012 xfs_ino_t inlast; /* last inode number */
1013 int done;
1014 int error;
1015
1016	/* done = 1 if there are more stats to get and if bulkstat
1017	 * should be called again (unused here, but used in dmapi) */
1018
1019 if (!capable(CAP_SYS_ADMIN))
1020 return -EPERM;
1021
1022 if (XFS_FORCED_SHUTDOWN(mp))
1023 return -XFS_ERROR(EIO);
1024
1025 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
1026 return -XFS_ERROR(EFAULT);
1027
1028 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
1029 return -XFS_ERROR(EFAULT);
1030
1031 if ((count = bulkreq.icount) <= 0)
1032 return -XFS_ERROR(EINVAL);
1033
1034 if (cmd == XFS_IOC_FSINUMBERS)
1035 error = xfs_inumbers(mp, &inlast, &count,
1036 bulkreq.ubuffer);
1037 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
1038 error = xfs_bulkstat_single(mp, &inlast,
1039 bulkreq.ubuffer, &done);
1040 else { /* XFS_IOC_FSBULKSTAT */
1041 if (count == 1 && inlast != 0) {
1042 inlast++;
1043 error = xfs_bulkstat_single(mp, &inlast,
1044 bulkreq.ubuffer, &done);
1045 } else {
1046 error = xfs_bulkstat(mp, &inlast, &count,
1047 (bulkstat_one_pf)xfs_bulkstat_one, NULL,
1048 sizeof(xfs_bstat_t), bulkreq.ubuffer,
1049 BULKSTAT_FG_QUICK, &done);
1050 }
1051 }
1052
1053 if (error)
1054 return -error;
1055
1056 if (bulkreq.ocount != NULL) {
1057 if (copy_to_user(bulkreq.lastip, &inlast,
1058 sizeof(xfs_ino_t)))
1059 return -XFS_ERROR(EFAULT);
1060
1061 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
1062 return -XFS_ERROR(EFAULT);
1063 }
1064
1065 return 0;
1066}
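/*
 * A minimal userspace sketch of the bulkstat interface above, assuming
 * the xfs_fsop_bulkreq_t and xfs_bstat_t definitions from xfs_fs.h;
 * "fd" is a hypothetical open descriptor on the filesystem:
 *
 *	__u64 lastino = 0;
 *	__s32 count;
 *	xfs_bstat_t buf[64];
 *	xfs_fsop_bulkreq_t req = {
 *		.lastip  = &lastino,
 *		.icount  = 64,
 *		.ubuffer = buf,
 *		.ocount  = &count,
 *	};
 *
 *	while (ioctl(fd, XFS_IOC_FSBULKSTAT, &req) == 0 && count > 0)
 *		process(buf, count);		(hypothetical consumer)
 */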
1067
1068STATIC int
1069xfs_ioc_fsgeometry_v1(
1070 xfs_mount_t *mp,
1071 void __user *arg)
1072{
1073 xfs_fsop_geom_v1_t fsgeo;
1074 int error;
1075
1076 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
1077 if (error)
1078 return -error;
1079
1080 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
1081 return -XFS_ERROR(EFAULT);
1082 return 0;
1083}
1084
1085STATIC int
1086xfs_ioc_fsgeometry(
1087 xfs_mount_t *mp,
1088 void __user *arg)
1089{
1090 xfs_fsop_geom_t fsgeo;
1091 int error;
1092
1093 error = xfs_fs_geometry(mp, &fsgeo, 4);
1094 if (error)
1095 return -error;
1096
1097 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
1098 return -XFS_ERROR(EFAULT);
1099 return 0;
1100}
1101
1102/*
1103 * Linux extended inode flags interface.
1104 */
1105#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */
1106#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */
1107#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */
1108#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */
1109#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */
1110
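/*
 * These values intentionally match the ext2-style EXT2_*_FL bits, so
 * generic tools such as chattr(1) and lsattr(1) work unmodified on XFS
 * via XFS_IOC_GETXFLAGS/XFS_IOC_SETXFLAGS.
 */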
1111STATIC unsigned int
1112xfs_merge_ioc_xflags(
1113 unsigned int flags,
1114 unsigned int start)
1115{
1116 unsigned int xflags = start;
1117
1118 if (flags & LINUX_XFLAG_IMMUTABLE)
1119 xflags |= XFS_XFLAG_IMMUTABLE;
1120 else
1121 xflags &= ~XFS_XFLAG_IMMUTABLE;
1122 if (flags & LINUX_XFLAG_APPEND)
1123 xflags |= XFS_XFLAG_APPEND;
1124 else
1125 xflags &= ~XFS_XFLAG_APPEND;
1126 if (flags & LINUX_XFLAG_SYNC)
1127 xflags |= XFS_XFLAG_SYNC;
1128 else
1129 xflags &= ~XFS_XFLAG_SYNC;
1130 if (flags & LINUX_XFLAG_NOATIME)
1131 xflags |= XFS_XFLAG_NOATIME;
1132 else
1133 xflags &= ~XFS_XFLAG_NOATIME;
1134 if (flags & LINUX_XFLAG_NODUMP)
1135 xflags |= XFS_XFLAG_NODUMP;
1136 else
1137 xflags &= ~XFS_XFLAG_NODUMP;
1138
1139 return xflags;
1140}
1141
1142STATIC unsigned int
1143xfs_di2lxflags(
1144 __uint16_t di_flags)
1145{
1146 unsigned int flags = 0;
1147
1148 if (di_flags & XFS_DIFLAG_IMMUTABLE)
1149 flags |= LINUX_XFLAG_IMMUTABLE;
1150 if (di_flags & XFS_DIFLAG_APPEND)
1151 flags |= LINUX_XFLAG_APPEND;
1152 if (di_flags & XFS_DIFLAG_SYNC)
1153 flags |= LINUX_XFLAG_SYNC;
1154 if (di_flags & XFS_DIFLAG_NOATIME)
1155 flags |= LINUX_XFLAG_NOATIME;
1156 if (di_flags & XFS_DIFLAG_NODUMP)
1157 flags |= LINUX_XFLAG_NODUMP;
1158 return flags;
1159}
1160
1161STATIC int
1162xfs_ioc_xattr(
1163 vnode_t *vp,
1164 xfs_inode_t *ip,
1165 struct file *filp,
1166 unsigned int cmd,
1167 void __user *arg)
1168{
1169 struct fsxattr fa;
1170 vattr_t va;
1171 int error;
1172 int attr_flags;
1173 unsigned int flags;
1174
1175 switch (cmd) {
1176 case XFS_IOC_FSGETXATTR: {
1177 va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS;
1178 VOP_GETATTR(vp, &va, 0, NULL, error);
1179 if (error)
1180 return -error;
1181
1182 fa.fsx_xflags = va.va_xflags;
1183 fa.fsx_extsize = va.va_extsize;
1184 fa.fsx_nextents = va.va_nextents;
1185
1186 if (copy_to_user(arg, &fa, sizeof(fa)))
1187 return -XFS_ERROR(EFAULT);
1188 return 0;
1189 }
1190
1191 case XFS_IOC_FSSETXATTR: {
1192 if (copy_from_user(&fa, arg, sizeof(fa)))
1193 return -XFS_ERROR(EFAULT);
1194
1195 attr_flags = 0;
1196 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1197 attr_flags |= ATTR_NONBLOCK;
1198
1199 va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE;
1200 va.va_xflags = fa.fsx_xflags;
1201 va.va_extsize = fa.fsx_extsize;
1202
1203 VOP_SETATTR(vp, &va, attr_flags, NULL, error);
1204 if (!error)
1205 vn_revalidate(vp); /* update Linux inode flags */
1206 return -error;
1207 }
1208
1209 case XFS_IOC_FSGETXATTRA: {
1210 va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_ANEXTENTS;
1211 VOP_GETATTR(vp, &va, 0, NULL, error);
1212 if (error)
1213 return -error;
1214
1215 fa.fsx_xflags = va.va_xflags;
1216 fa.fsx_extsize = va.va_extsize;
1217 fa.fsx_nextents = va.va_anextents;
1218
1219 if (copy_to_user(arg, &fa, sizeof(fa)))
1220 return -XFS_ERROR(EFAULT);
1221 return 0;
1222 }
1223
1224 case XFS_IOC_GETXFLAGS: {
1225 flags = xfs_di2lxflags(ip->i_d.di_flags);
1226 if (copy_to_user(arg, &flags, sizeof(flags)))
1227 return -XFS_ERROR(EFAULT);
1228 return 0;
1229 }
1230
1231 case XFS_IOC_SETXFLAGS: {
1232 if (copy_from_user(&flags, arg, sizeof(flags)))
1233 return -XFS_ERROR(EFAULT);
1234
1235		if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND |
1236			      LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP |
1237 LINUX_XFLAG_SYNC))
1238 return -XFS_ERROR(EOPNOTSUPP);
1239
1240 attr_flags = 0;
1241 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1242 attr_flags |= ATTR_NONBLOCK;
1243
1244 va.va_mask = XFS_AT_XFLAGS;
1245 va.va_xflags = xfs_merge_ioc_xflags(flags,
1246 xfs_ip2xflags(ip));
1247
1248 VOP_SETATTR(vp, &va, attr_flags, NULL, error);
1249 if (!error)
1250 vn_revalidate(vp); /* update Linux inode flags */
1251 return -error;
1252 }
1253
1254 case XFS_IOC_GETVERSION: {
1255 flags = LINVFS_GET_IP(vp)->i_generation;
1256 if (copy_to_user(arg, &flags, sizeof(flags)))
1257 return -XFS_ERROR(EFAULT);
1258 return 0;
1259 }
1260
1261 default:
1262 return -ENOTTY;
1263 }
1264}
1265
1266STATIC int
1267xfs_ioc_getbmap(
1268 bhv_desc_t *bdp,
1269 struct file *filp,
1270 int ioflags,
1271 unsigned int cmd,
1272 void __user *arg)
1273{
1274 struct getbmap bm;
1275 int iflags;
1276 int error;
1277
1278 if (copy_from_user(&bm, arg, sizeof(bm)))
1279 return -XFS_ERROR(EFAULT);
1280
1281 if (bm.bmv_count < 2)
1282 return -XFS_ERROR(EINVAL);
1283
1284 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1285 if (ioflags & IO_INVIS)
1286 iflags |= BMV_IF_NO_DMAPI_READ;
1287
1288 error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags);
1289 if (error)
1290 return -error;
1291
1292 if (copy_to_user(arg, &bm, sizeof(bm)))
1293 return -XFS_ERROR(EFAULT);
1294 return 0;
1295}
1296
1297STATIC int
1298xfs_ioc_getbmapx(
1299 bhv_desc_t *bdp,
1300 void __user *arg)
1301{
1302 struct getbmapx bmx;
1303 struct getbmap bm;
1304 int iflags;
1305 int error;
1306
1307 if (copy_from_user(&bmx, arg, sizeof(bmx)))
1308 return -XFS_ERROR(EFAULT);
1309
1310 if (bmx.bmv_count < 2)
1311 return -XFS_ERROR(EINVAL);
1312
1313 /*
1314 * Map input getbmapx structure to a getbmap
1315 * structure for xfs_getbmap.
1316 */
1317 GETBMAP_CONVERT(bmx, bm);
1318
1319 iflags = bmx.bmv_iflags;
1320
1321 if (iflags & (~BMV_IF_VALID))
1322 return -XFS_ERROR(EINVAL);
1323
1324 iflags |= BMV_IF_EXTENDED;
1325
1326 error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags);
1327 if (error)
1328 return -error;
1329
1330 GETBMAP_CONVERT(bm, bmx);
1331
1332 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1333 return -XFS_ERROR(EFAULT);
1334
1335 return 0;
1336}
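/*
 * A minimal userspace sketch of the extent-map query handled above,
 * assuming struct getbmap from xfs_fs.h and a hypothetical open fd;
 * slot 0 is the input/output header, extents land in the slots after it:
 *
 *	struct getbmap bmv[16];
 *
 *	memset(bmv, 0, sizeof(bmv));
 *	bmv[0].bmv_length = -1;		(map through end of file)
 *	bmv[0].bmv_count  = 16;		(header slot + up to 15 extents)
 *	if (ioctl(fd, XFS_IOC_GETBMAP, bmv) == 0)
 *		process(&bmv[1], bmv[0].bmv_entries);	(hypothetical)
 */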
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
new file mode 100644
index 000000000000..7a12c83184f5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -0,0 +1,163 @@
1/*
2 * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <linux/config.h>
34#include <linux/compat.h>
35#include <linux/init.h>
36#include <linux/ioctl.h>
37#include <linux/ioctl32.h>
38#include <linux/syscalls.h>
39#include <linux/types.h>
40#include <linux/fs.h>
41#include <asm/uaccess.h>
42
43#include "xfs.h"
44#include "xfs_types.h"
45#include "xfs_fs.h"
46#include "xfs_vfs.h"
47#include "xfs_vnode.h"
48#include "xfs_dfrag.h"
49
50#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
51#define BROKEN_X86_ALIGNMENT
52#else
53
54typedef struct xfs_fsop_bulkreq32 {
55 compat_uptr_t lastip; /* last inode # pointer */
56 __s32 icount; /* count of entries in buffer */
57 compat_uptr_t ubuffer; /* user buffer for inode desc. */
58 __s32 ocount; /* output count pointer */
59} xfs_fsop_bulkreq32_t;
60
61static unsigned long
62xfs_ioctl32_bulkstat(unsigned long arg)
63{
64 xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg;
65 xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p));
66 u32 addr;
67
68 if (get_user(addr, &p32->lastip) ||
69 put_user(compat_ptr(addr), &p->lastip) ||
70 copy_in_user(&p->icount, &p32->icount, sizeof(s32)) ||
71 get_user(addr, &p32->ubuffer) ||
72 put_user(compat_ptr(addr), &p->ubuffer) ||
73 get_user(addr, &p32->ocount) ||
74 put_user(compat_ptr(addr), &p->ocount))
75 return -EFAULT;
76
77 return (unsigned long)p;
78}
79#endif
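/*
 * The function above follows the standard compat-ioctl thunk pattern:
 * build a native xfs_fsop_bulkreq_t in user-accessible space via
 * compat_alloc_user_space(), widen each 32-bit compat_uptr_t with
 * compat_ptr(), and hand the native ioctl path the new user address.
 */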
80
81static long
82__xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
83{
84 int error;
85 struct inode *inode = f->f_dentry->d_inode;
86 vnode_t *vp = LINVFS_GET_VP(inode);
87
88 switch (cmd) {
89 case XFS_IOC_DIOINFO:
90 case XFS_IOC_FSGEOMETRY_V1:
91 case XFS_IOC_FSGEOMETRY:
92 case XFS_IOC_GETVERSION:
93 case XFS_IOC_GETXFLAGS:
94 case XFS_IOC_SETXFLAGS:
95 case XFS_IOC_FSGETXATTR:
96 case XFS_IOC_FSSETXATTR:
97 case XFS_IOC_FSGETXATTRA:
98 case XFS_IOC_FSSETDM:
99 case XFS_IOC_GETBMAP:
100 case XFS_IOC_GETBMAPA:
101 case XFS_IOC_GETBMAPX:
102/* not handled
103 case XFS_IOC_FD_TO_HANDLE:
104 case XFS_IOC_PATH_TO_HANDLE:
106 case XFS_IOC_PATH_TO_FSHANDLE:
107 case XFS_IOC_OPEN_BY_HANDLE:
108 case XFS_IOC_FSSETDM_BY_HANDLE:
109 case XFS_IOC_READLINK_BY_HANDLE:
110 case XFS_IOC_ATTRLIST_BY_HANDLE:
111 case XFS_IOC_ATTRMULTI_BY_HANDLE:
112*/
113 case XFS_IOC_FSCOUNTS:
114 case XFS_IOC_SET_RESBLKS:
115 case XFS_IOC_GET_RESBLKS:
116 case XFS_IOC_FSGROWFSDATA:
117 case XFS_IOC_FSGROWFSLOG:
118 case XFS_IOC_FSGROWFSRT:
119 case XFS_IOC_FREEZE:
120 case XFS_IOC_THAW:
121 case XFS_IOC_GOINGDOWN:
122 case XFS_IOC_ERROR_INJECTION:
123 case XFS_IOC_ERROR_CLEARALL:
124 break;
125
126#ifndef BROKEN_X86_ALIGNMENT
127 /* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */
128 case XFS_IOC_ALLOCSP:
129 case XFS_IOC_FREESP:
130 case XFS_IOC_RESVSP:
131 case XFS_IOC_UNRESVSP:
132 case XFS_IOC_ALLOCSP64:
133 case XFS_IOC_FREESP64:
134 case XFS_IOC_RESVSP64:
135 case XFS_IOC_UNRESVSP64:
136 case XFS_IOC_SWAPEXT:
137 break;
138
139 case XFS_IOC_FSBULKSTAT_SINGLE:
140 case XFS_IOC_FSBULKSTAT:
141 case XFS_IOC_FSINUMBERS:
142 arg = xfs_ioctl32_bulkstat(arg);
143 break;
144#endif
145 default:
146 return -ENOIOCTLCMD;
147 }
148
149 VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error);
150 VMODIFY(vp);
151
152 return error;
153}
154
155long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg)
156{
157 return __xfs_compat_ioctl(0, f, cmd, arg);
158}
159
160long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg)
161{
162 return __xfs_compat_ioctl(IO_INVIS, f, cmd, arg);
163}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
new file mode 100644
index 000000000000..779f69a48116
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (c) 2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33long xfs_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg);
34long xfs_compat_invis_ioctl(struct file *f, unsigned cmd, unsigned long arg);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
new file mode 100644
index 000000000000..407e99359391
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -0,0 +1,680 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_utils.h"
68
69#include <linux/xattr.h>
70#include <linux/namei.h>
71
72
73/*
74 * Pull the link count and size up from the xfs inode to the linux inode
75 */
76STATIC void
77validate_fields(
78 struct inode *ip)
79{
80 vnode_t *vp = LINVFS_GET_VP(ip);
81 vattr_t va;
82 int error;
83
84 va.va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS;
85 VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
86 if (likely(!error)) {
87 ip->i_nlink = va.va_nlink;
88 ip->i_blocks = va.va_nblocks;
89
90 /* we're under i_sem so i_size can't change under us */
91 if (i_size_read(ip) != va.va_size)
92 i_size_write(ip, va.va_size);
93 }
94}
95
96/*
97 * Determine whether a process has a valid fs_struct (kernel daemons
98 * like knfsd don't have an fs_struct).
99 *
100 * XXX(hch): nfsd is broken, better fix it instead.
101 */
102STATIC inline int
103has_fs_struct(struct task_struct *task)
104{
105 return (task->fs != init_task.fs);
106}
107
108STATIC int
109linvfs_mknod(
110 struct inode *dir,
111 struct dentry *dentry,
112 int mode,
113 dev_t rdev)
114{
115 struct inode *ip;
116 vattr_t va;
117 vnode_t *vp = NULL, *dvp = LINVFS_GET_VP(dir);
118 xfs_acl_t *default_acl = NULL;
119 attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
120 int error;
121
122 /*
123 * Irix uses Missed'em'V split, but doesn't want to see
124 * the upper 5 bits of (14bit) major.
125 */
126 if (!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)
127 return -EINVAL;
128
129 if (test_default_acl && test_default_acl(dvp)) {
130 if (!_ACL_ALLOC(default_acl))
131 return -ENOMEM;
132 if (!_ACL_GET_DEFAULT(dvp, default_acl)) {
133 _ACL_FREE(default_acl);
134 default_acl = NULL;
135 }
136 }
137
138 if (IS_POSIXACL(dir) && !default_acl && has_fs_struct(current))
139 mode &= ~current->fs->umask;
140
141 memset(&va, 0, sizeof(va));
142 va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
143 va.va_type = IFTOVT(mode);
144 va.va_mode = mode;
145
146 switch (mode & S_IFMT) {
147 case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
148 va.va_rdev = sysv_encode_dev(rdev);
149 va.va_mask |= XFS_AT_RDEV;
150 /*FALLTHROUGH*/
151 case S_IFREG:
152 VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
153 break;
154 case S_IFDIR:
155 VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
156 break;
157 default:
158 error = EINVAL;
159 break;
160 }
161
162 if (default_acl) {
163 if (!error) {
164 error = _ACL_INHERIT(vp, &va, default_acl);
165 if (!error) {
166 VMODIFY(vp);
167 } else {
168 struct dentry teardown = {};
169 int err2;
170
171 /* Oh, the horror.
172 * If we can't add the ACL we must back out.
173 * ENOSPC can hit here, among other things.
174 */
175 teardown.d_inode = ip = LINVFS_GET_IP(vp);
176 teardown.d_name = dentry->d_name;
177
178 vn_mark_bad(vp);
179
180 if (S_ISDIR(mode))
181 VOP_RMDIR(dvp, &teardown, NULL, err2);
182 else
183 VOP_REMOVE(dvp, &teardown, NULL, err2);
184 VN_RELE(vp);
185 }
186 }
187 _ACL_FREE(default_acl);
188 }
189
190 if (!error) {
191 ASSERT(vp);
192 ip = LINVFS_GET_IP(vp);
193
194 if (S_ISCHR(mode) || S_ISBLK(mode))
195 ip->i_rdev = rdev;
196 else if (S_ISDIR(mode))
197 validate_fields(ip);
198 d_instantiate(dentry, ip);
199 validate_fields(dir);
200 }
201 return -error;
202}
203
204STATIC int
205linvfs_create(
206 struct inode *dir,
207 struct dentry *dentry,
208 int mode,
209 struct nameidata *nd)
210{
211 return linvfs_mknod(dir, dentry, mode, 0);
212}
213
214STATIC int
215linvfs_mkdir(
216 struct inode *dir,
217 struct dentry *dentry,
218 int mode)
219{
220 return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
221}
222
223STATIC struct dentry *
224linvfs_lookup(
225 struct inode *dir,
226 struct dentry *dentry,
227 struct nameidata *nd)
228{
229 struct vnode *vp = LINVFS_GET_VP(dir), *cvp;
230 int error;
231
232 if (dentry->d_name.len >= MAXNAMELEN)
233 return ERR_PTR(-ENAMETOOLONG);
234
235 VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
236 if (error) {
237 if (unlikely(error != ENOENT))
238 return ERR_PTR(-error);
239 d_add(dentry, NULL);
240 return NULL;
241 }
242
243 return d_splice_alias(LINVFS_GET_IP(cvp), dentry);
244}
245
246STATIC int
247linvfs_link(
248 struct dentry *old_dentry,
249 struct inode *dir,
250 struct dentry *dentry)
251{
252 struct inode *ip; /* inode of guy being linked to */
253 vnode_t *tdvp; /* target directory for new name/link */
254 vnode_t *vp; /* vp of name being linked */
255 int error;
256
257 ip = old_dentry->d_inode; /* inode being linked to */
258 if (S_ISDIR(ip->i_mode))
259 return -EPERM;
260
261 tdvp = LINVFS_GET_VP(dir);
262 vp = LINVFS_GET_VP(ip);
263
264 VOP_LINK(tdvp, vp, dentry, NULL, error);
265 if (!error) {
266 VMODIFY(tdvp);
267 VN_HOLD(vp);
268 validate_fields(ip);
269 d_instantiate(dentry, ip);
270 }
271 return -error;
272}
273
274STATIC int
275linvfs_unlink(
276 struct inode *dir,
277 struct dentry *dentry)
278{
279 struct inode *inode;
280 vnode_t *dvp; /* directory containing name to remove */
281 int error;
282
283 inode = dentry->d_inode;
284 dvp = LINVFS_GET_VP(dir);
285
286 VOP_REMOVE(dvp, dentry, NULL, error);
287 if (!error) {
288 validate_fields(dir); /* For size only */
289 validate_fields(inode);
290 }
291
292 return -error;
293}
294
295STATIC int
296linvfs_symlink(
297 struct inode *dir,
298 struct dentry *dentry,
299 const char *symname)
300{
301 struct inode *ip;
302 vattr_t va;
303 vnode_t *dvp; /* directory containing name of symlink */
304 vnode_t *cvp; /* used to lookup symlink to put in dentry */
305 int error;
306
307 dvp = LINVFS_GET_VP(dir);
308 cvp = NULL;
309
310 memset(&va, 0, sizeof(va));
311 va.va_type = VLNK;
312 va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO;
313 va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
314
315 error = 0;
316 VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
317 if (!error && cvp) {
318 ASSERT(cvp->v_type == VLNK);
319 ip = LINVFS_GET_IP(cvp);
320 d_instantiate(dentry, ip);
321 validate_fields(dir);
322 validate_fields(ip); /* size needs update */
323 }
324 return -error;
325}
326
327STATIC int
328linvfs_rmdir(
329 struct inode *dir,
330 struct dentry *dentry)
331{
332 struct inode *inode = dentry->d_inode;
333 vnode_t *dvp = LINVFS_GET_VP(dir);
334 int error;
335
336 VOP_RMDIR(dvp, dentry, NULL, error);
337 if (!error) {
338 validate_fields(inode);
339 validate_fields(dir);
340 }
341 return -error;
342}
343
344STATIC int
345linvfs_rename(
346 struct inode *odir,
347 struct dentry *odentry,
348 struct inode *ndir,
349 struct dentry *ndentry)
350{
351 struct inode *new_inode = ndentry->d_inode;
352 vnode_t *fvp; /* from directory */
353 vnode_t *tvp; /* target directory */
354 int error;
355
356 fvp = LINVFS_GET_VP(odir);
357 tvp = LINVFS_GET_VP(ndir);
358
359 VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
360 if (error)
361 return -error;
362
363 if (new_inode)
364 validate_fields(new_inode);
365
366 validate_fields(odir);
367 if (ndir != odir)
368 validate_fields(ndir);
369 return 0;
370}
371
372/*
 373 * Careful here - this function can get called recursively, so
 374 * we need to keep stack usage to a minimum; the uio is kmalloced
 375 * rather than stack-allocated for this reason...
376 */
377STATIC int
378linvfs_follow_link(
379 struct dentry *dentry,
380 struct nameidata *nd)
381{
382 vnode_t *vp;
383 uio_t *uio;
384 iovec_t iov;
385 int error;
386 char *link;
387
388 ASSERT(dentry);
389 ASSERT(nd);
390
391 link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL);
392 if (!link) {
393 nd_set_link(nd, ERR_PTR(-ENOMEM));
394 return 0;
395 }
396
397 uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
398 if (!uio) {
399 kfree(link);
400 nd_set_link(nd, ERR_PTR(-ENOMEM));
401 return 0;
402 }
403
404 vp = LINVFS_GET_VP(dentry->d_inode);
405
406 iov.iov_base = link;
407 iov.iov_len = MAXNAMELEN;
408
409 uio->uio_iov = &iov;
410 uio->uio_offset = 0;
411 uio->uio_segflg = UIO_SYSSPACE;
412 uio->uio_resid = MAXNAMELEN;
413 uio->uio_iovcnt = 1;
414
415 VOP_READLINK(vp, uio, 0, NULL, error);
416 if (error) {
417 kfree(link);
418 link = ERR_PTR(-error);
419 } else {
420 link[MAXNAMELEN - uio->uio_resid] = '\0';
421 }
422 kfree(uio);
423
424 nd_set_link(nd, link);
425 return 0;
426}
427
428static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd)
429{
430 char *s = nd_get_link(nd);
431 if (!IS_ERR(s))
432 kfree(s);
433}
434
435#ifdef CONFIG_XFS_POSIX_ACL
436STATIC int
437linvfs_permission(
438 struct inode *inode,
439 int mode,
440 struct nameidata *nd)
441{
442 vnode_t *vp = LINVFS_GET_VP(inode);
443 int error;
444
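	/*
	 * The Linux MAY_* mask lives in the low three bits
	 * (MAY_EXEC = 1, MAY_WRITE = 2, MAY_READ = 4); shifting it
	 * left by six moves it into the owner-class rwx position,
	 * e.g. MAY_READ becomes S_IRUSR (0400).
	 */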
445 mode <<= 6; /* convert from linux to vnode access bits */
446 VOP_ACCESS(vp, mode, NULL, error);
447 return -error;
448}
449#else
450#define linvfs_permission NULL
451#endif
452
453STATIC int
454linvfs_getattr(
455 struct vfsmount *mnt,
456 struct dentry *dentry,
457 struct kstat *stat)
458{
459 struct inode *inode = dentry->d_inode;
460 vnode_t *vp = LINVFS_GET_VP(inode);
461 int error = 0;
462
463 if (unlikely(vp->v_flag & VMODIFIED))
464 error = vn_revalidate(vp);
465 if (!error)
466 generic_fillattr(inode, stat);
 467	return error;
468}
469
470STATIC int
471linvfs_setattr(
472 struct dentry *dentry,
473 struct iattr *attr)
474{
475 struct inode *inode = dentry->d_inode;
476 unsigned int ia_valid = attr->ia_valid;
477 vnode_t *vp = LINVFS_GET_VP(inode);
478 vattr_t vattr;
479 int flags = 0;
480 int error;
481
482 memset(&vattr, 0, sizeof(vattr_t));
483 if (ia_valid & ATTR_UID) {
484 vattr.va_mask |= XFS_AT_UID;
485 vattr.va_uid = attr->ia_uid;
486 }
487 if (ia_valid & ATTR_GID) {
488 vattr.va_mask |= XFS_AT_GID;
489 vattr.va_gid = attr->ia_gid;
490 }
491 if (ia_valid & ATTR_SIZE) {
492 vattr.va_mask |= XFS_AT_SIZE;
493 vattr.va_size = attr->ia_size;
494 }
495 if (ia_valid & ATTR_ATIME) {
496 vattr.va_mask |= XFS_AT_ATIME;
497 vattr.va_atime = attr->ia_atime;
498 }
499 if (ia_valid & ATTR_MTIME) {
500 vattr.va_mask |= XFS_AT_MTIME;
501 vattr.va_mtime = attr->ia_mtime;
502 }
503 if (ia_valid & ATTR_CTIME) {
504 vattr.va_mask |= XFS_AT_CTIME;
505 vattr.va_ctime = attr->ia_ctime;
506 }
507 if (ia_valid & ATTR_MODE) {
508 vattr.va_mask |= XFS_AT_MODE;
509 vattr.va_mode = attr->ia_mode;
510 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
511 inode->i_mode &= ~S_ISGID;
512 }
513
514 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
515 flags |= ATTR_UTIME;
516#ifdef ATTR_NO_BLOCK
517 if ((ia_valid & ATTR_NO_BLOCK))
518 flags |= ATTR_NONBLOCK;
519#endif
520
521 VOP_SETATTR(vp, &vattr, flags, NULL, error);
522 if (error)
523 return -error;
524 vn_revalidate(vp);
525 return error;
526}
527
528STATIC void
529linvfs_truncate(
530 struct inode *inode)
531{
532 block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
533}
534
535STATIC int
536linvfs_setxattr(
537 struct dentry *dentry,
538 const char *name,
539 const void *data,
540 size_t size,
541 int flags)
542{
543 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
544 char *attr = (char *)name;
545 attrnames_t *namesp;
546 int xflags = 0;
547 int error;
548
549 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
550 if (!namesp)
551 return -EOPNOTSUPP;
552 attr += namesp->attr_namelen;
553 error = namesp->attr_capable(vp, NULL);
554 if (error)
555 return error;
556
557 /* Convert Linux syscall to XFS internal ATTR flags */
558 if (flags & XATTR_CREATE)
559 xflags |= ATTR_CREATE;
560 if (flags & XATTR_REPLACE)
561 xflags |= ATTR_REPLACE;
562 xflags |= namesp->attr_flag;
563 return namesp->attr_set(vp, attr, (void *)data, size, xflags);
564}
565
566STATIC ssize_t
567linvfs_getxattr(
568 struct dentry *dentry,
569 const char *name,
570 void *data,
571 size_t size)
572{
573 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
574 char *attr = (char *)name;
575 attrnames_t *namesp;
576 int xflags = 0;
577 ssize_t error;
578
579 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
580 if (!namesp)
581 return -EOPNOTSUPP;
582 attr += namesp->attr_namelen;
583 error = namesp->attr_capable(vp, NULL);
584 if (error)
585 return error;
586
 587	/* A zero size means the caller only wants the value's length */
588 if (!size) {
589 xflags |= ATTR_KERNOVAL;
590 data = NULL;
591 }
592 xflags |= namesp->attr_flag;
593 return namesp->attr_get(vp, attr, (void *)data, size, xflags);
594}
595
596STATIC ssize_t
597linvfs_listxattr(
598 struct dentry *dentry,
599 char *data,
600 size_t size)
601{
602 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
603 int error, xflags = ATTR_KERNAMELS;
604 ssize_t result;
605
606 if (!size)
607 xflags |= ATTR_KERNOVAL;
608 xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
609
610 error = attr_generic_list(vp, data, size, xflags, &result);
611 if (error < 0)
612 return error;
613 return result;
614}
615
616STATIC int
617linvfs_removexattr(
618 struct dentry *dentry,
619 const char *name)
620{
621 vnode_t *vp = LINVFS_GET_VP(dentry->d_inode);
622 char *attr = (char *)name;
623 attrnames_t *namesp;
624 int xflags = 0;
625 int error;
626
627 namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
628 if (!namesp)
629 return -EOPNOTSUPP;
630 attr += namesp->attr_namelen;
631 error = namesp->attr_capable(vp, NULL);
632 if (error)
633 return error;
634 xflags |= namesp->attr_flag;
635 return namesp->attr_remove(vp, attr, xflags);
636}
637
638
639struct inode_operations linvfs_file_inode_operations = {
640 .permission = linvfs_permission,
641 .truncate = linvfs_truncate,
642 .getattr = linvfs_getattr,
643 .setattr = linvfs_setattr,
644 .setxattr = linvfs_setxattr,
645 .getxattr = linvfs_getxattr,
646 .listxattr = linvfs_listxattr,
647 .removexattr = linvfs_removexattr,
648};
649
650struct inode_operations linvfs_dir_inode_operations = {
651 .create = linvfs_create,
652 .lookup = linvfs_lookup,
653 .link = linvfs_link,
654 .unlink = linvfs_unlink,
655 .symlink = linvfs_symlink,
656 .mkdir = linvfs_mkdir,
657 .rmdir = linvfs_rmdir,
658 .mknod = linvfs_mknod,
659 .rename = linvfs_rename,
660 .permission = linvfs_permission,
661 .getattr = linvfs_getattr,
662 .setattr = linvfs_setattr,
663 .setxattr = linvfs_setxattr,
664 .getxattr = linvfs_getxattr,
665 .listxattr = linvfs_listxattr,
666 .removexattr = linvfs_removexattr,
667};
668
669struct inode_operations linvfs_symlink_inode_operations = {
670 .readlink = generic_readlink,
671 .follow_link = linvfs_follow_link,
672 .put_link = linvfs_put_link,
673 .permission = linvfs_permission,
674 .getattr = linvfs_getattr,
675 .setattr = linvfs_setattr,
676 .setxattr = linvfs_setxattr,
677 .getxattr = linvfs_getxattr,
678 .listxattr = linvfs_listxattr,
679 .removexattr = linvfs_removexattr,
680};
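A recurring pattern in the entry points above is worth calling out: the VOP_* behaviour macros hand back a positive XFS errno through their final argument, and each Linux-facing wrapper negates it on return. A minimal sketch of that convention, using a hypothetical stand-in for the behaviour call (not a real VOP_* macro):

/* Sketch only -- xfs_vop_frob() is a made-up stand-in for a
 * VOP_* behaviour call; it returns a positive XFS errno or 0. */
extern int xfs_vop_frob(struct inode *inode);

STATIC int
linvfs_example_op(
	struct inode	*inode)
{
	int		error;

	error = xfs_vop_frob(inode);	/* positive errno, 0 on success */
	return -error;			/* Linux callers expect negative errnos */
}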
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
new file mode 100644
index 000000000000..6a69a62c36b0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IOPS_H__
33#define __XFS_IOPS_H__
34
35extern struct inode_operations linvfs_file_inode_operations;
36extern struct inode_operations linvfs_dir_inode_operations;
37extern struct inode_operations linvfs_symlink_inode_operations;
38
39extern struct file_operations linvfs_file_operations;
40extern struct file_operations linvfs_invis_file_operations;
41extern struct file_operations linvfs_dir_operations;
42
43extern struct address_space_operations linvfs_aops;
44
45extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
46extern void linvfs_unwritten_done(struct buffer_head *, int);
47
48extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *,
49 int, unsigned int, void __user *);
50
51#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
new file mode 100644
index 000000000000..71bb41019a12
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -0,0 +1,374 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LINUX__
33#define __XFS_LINUX__
34
35#include <linux/types.h>
36#include <linux/config.h>
37
38/*
39 * Some types are conditional depending on the target system.
40 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
41 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
42 * as requiring XFS_BIG_BLKNOS to be set.
43 */
44#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
45# define XFS_BIG_BLKNOS 1
46# if BITS_PER_LONG == 64
47# define XFS_BIG_INUMS 1
48# else
49# define XFS_BIG_INUMS 0
50# endif
51#else
52# define XFS_BIG_BLKNOS 0
53# define XFS_BIG_INUMS 0
54#endif
55
56#include <xfs_types.h>
57#include <xfs_arch.h>
58
59#include <kmem.h>
60#include <mrlock.h>
61#include <spin.h>
62#include <sv.h>
63#include <mutex.h>
64#include <sema.h>
65#include <time.h>
66
67#include <support/qsort.h>
68#include <support/ktrace.h>
69#include <support/debug.h>
70#include <support/move.h>
71#include <support/uuid.h>
72
73#include <linux/mm.h>
74#include <linux/kernel.h>
75#include <linux/blkdev.h>
76#include <linux/slab.h>
77#include <linux/module.h>
78#include <linux/file.h>
79#include <linux/swap.h>
80#include <linux/errno.h>
81#include <linux/sched.h>
82#include <linux/bitops.h>
83#include <linux/major.h>
84#include <linux/pagemap.h>
85#include <linux/vfs.h>
86#include <linux/seq_file.h>
87#include <linux/init.h>
88#include <linux/list.h>
89#include <linux/proc_fs.h>
90#include <linux/version.h>
91#include <linux/sort.h>
92
93#include <asm/page.h>
94#include <asm/div64.h>
95#include <asm/param.h>
96#include <asm/uaccess.h>
97#include <asm/byteorder.h>
98#include <asm/unaligned.h>
99
100#include <xfs_behavior.h>
101#include <xfs_vfs.h>
102#include <xfs_cred.h>
103#include <xfs_vnode.h>
104#include <xfs_stats.h>
105#include <xfs_sysctl.h>
106#include <xfs_iops.h>
107#include <xfs_super.h>
108#include <xfs_globals.h>
109#include <xfs_fs_subr.h>
110#include <xfs_lrw.h>
111#include <xfs_buf.h>
112
113/*
114 * Feature macros (disable/enable)
115 */
116#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
117#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
118
119/*
120 * State flag for unwritten extent buffers.
121 *
122 * We need to be able to distinguish between these and delayed
123 * allocate buffers within XFS. The generic IO path code does
124 * not need to distinguish - we use the BH_Delay flag for both
125 * delalloc and these ondisk-uninitialised buffers.
126 */
127BUFFER_FNS(PrivateStart, unwritten);
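/*
 * BUFFER_FNS(PrivateStart, unwritten) generates the buffer_unwritten(),
 * set_buffer_unwritten() and clear_buffer_unwritten() accessors on top
 * of the BH_PrivateStart state bit.
 */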
128static inline void set_buffer_unwritten_io(struct buffer_head *bh)
129{
130 bh->b_end_io = linvfs_unwritten_done;
131}
132
133#define restricted_chown xfs_params.restrict_chown.val
134#define irix_sgid_inherit xfs_params.sgid_inherit.val
135#define irix_symlink_mode xfs_params.symlink_mode.val
136#define xfs_panic_mask xfs_params.panic_mask.val
137#define xfs_error_level xfs_params.error_level.val
138#define xfs_syncd_centisecs xfs_params.syncd_timer.val
139#define xfs_stats_clear xfs_params.stats_clear.val
140#define xfs_inherit_sync xfs_params.inherit_sync.val
141#define xfs_inherit_nodump xfs_params.inherit_nodump.val
142#define xfs_inherit_noatime xfs_params.inherit_noatim.val
143#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
144#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
145#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
146#define xfs_rotorstep xfs_params.rotorstep.val
147
148#ifndef __smp_processor_id
149#define __smp_processor_id() smp_processor_id()
150#endif
151#define current_cpu() __smp_processor_id()
152#define current_pid() (current->pid)
153#define current_fsuid(cred) (current->fsuid)
154#define current_fsgid(cred) (current->fsgid)
155
156#define NBPP PAGE_SIZE
157#define DPPSHFT (PAGE_SHIFT - 9)
158#define NDPP (1 << (PAGE_SHIFT - 9))
159#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT)
160#define dtopt(DD) ((DD) >> DPPSHFT)
161#define dpoff(DD) ((DD) & (NDPP-1))
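/* e.g. with 4k pages: NDPP == 8, dtop(9) == 2, dtopt(9) == 1, dpoff(9) == 1 */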
162
163#define NBBY 8 /* number of bits per byte */
164#define NBPC PAGE_SIZE /* Number of bytes per click */
165#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */
166
167/*
168 * Size of block device i/o is parameterized here.
169 * Currently the system supports page-sized i/o.
170 */
171#define BLKDEV_IOSHIFT BPCSHIFT
172#define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT)
173/* number of BB's per block device block */
174#define BLKDEV_BB BTOBB(BLKDEV_IOSIZE)
175
176/* bytes to clicks */
177#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
178#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT)
179#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
180#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT)
181#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
182#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT)
183
184/* off_t bytes to clicks */
185#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
186#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT)
187
188/* clicks to off_t bytes */
189#define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT)
190
191/* clicks to bytes */
192#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT)
194#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT)
195#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT)
196
199
200#ifndef CELL_CAPABLE
201#define FSC_NOTIFY_NAME_CHANGED(vp)
202#endif
203
204#ifndef ENOATTR
205#define ENOATTR ENODATA /* Attribute not found */
206#endif
207
208/* Note: EWRONGFS never visible outside the kernel */
209#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
210
211/*
212 * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
213 * return codes out of its known range in errno.
214 * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
215 * conflict with any code we use already or any code a driver may use)
216 * XXX Some options (currently we do #2):
217 * 1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
218 * 2/ 990 ["Unknown error 990"]
219 * 3/ EUCLEAN ["Structure needs cleaning"]
220 * 4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
221 */
222#define EFSCORRUPTED 990 /* Filesystem is corrupted */
223
224#define SYNCHRONIZE() barrier()
225#define __return_address __builtin_return_address(0)
226
227/*
228 * IRIX (BSD) quotactl makes use of separate commands for user/group,
229 * whereas on Linux the syscall encodes this information into the cmd
230 * field (see the QCMD macro in quota.h). These macros help keep the
231 * code portable - they are not visible from the syscall interface.
232 */
233#define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */
234#define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */
235
236/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
237/* we may well need to fine-tune this if it ever becomes an issue. */
238#define DQUOT_MAX_HEURISTIC 1024 /* NR_DQUOTS */
239#define ndquot DQUOT_MAX_HEURISTIC
240
241/* IRIX uses the current size of the name cache to guess a good value */
242/* - this isn't the same but is a good enough starting point for now. */
243#define DQUOT_HASH_HEURISTIC files_stat.nr_files
244
245/* IRIX inodes maintain the project ID also, zero this field on Linux */
246#define DEFAULT_PROJID 0
247#define dfltprid DEFAULT_PROJID
248
249#define MAXPATHLEN 1024
250
251#define MIN(a,b) (min(a,b))
252#define MAX(a,b) (max(a,b))
253#define howmany(x, y) (((x)+((y)-1))/(y))
254#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
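/* e.g. howmany(10, 4) == 3 and roundup(10, 4) == 12 */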
255
256#define xfs_stack_trace() dump_stack()
257
258#define xfs_itruncate_data(ip, off) \
259 (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
260
261
262/* Move the kernel do_div definition off to one side */
263
264#if defined __i386__
265/* For ia32 we need to pull some tricks to get past various versions
266 * of the compiler which do not like us using do_div in the middle
267 * of large functions.
268 */
269static inline __u32 xfs_do_div(void *a, __u32 b, int n)
270{
271 __u32 mod;
272
273 switch (n) {
274 case 4:
275 mod = *(__u32 *)a % b;
276 *(__u32 *)a = *(__u32 *)a / b;
277 return mod;
278 case 8:
279 {
280 unsigned long __upper, __low, __high, __mod;
281 __u64 c = *(__u64 *)a;
282 __upper = __high = c >> 32;
283 __low = c;
284 if (__high) {
285 __upper = __high % (b);
286 __high = __high / (b);
287 }
288 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
289 asm("":"=A" (c):"a" (__low),"d" (__high));
290 *(__u64 *)a = c;
291 return __mod;
292 }
293 }
294
295 /* NOTREACHED */
296 return 0;
297}
298
299/* Side effect free 64 bit mod operation */
300static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
301{
302 switch (n) {
303 case 4:
304 return *(__u32 *)a % b;
305 case 8:
306 {
307 unsigned long __upper, __low, __high, __mod;
308 __u64 c = *(__u64 *)a;
309 __upper = __high = c >> 32;
310 __low = c;
311 if (__high) {
312 __upper = __high % (b);
313 __high = __high / (b);
314 }
315 asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
316 asm("":"=A" (c):"a" (__low),"d" (__high));
317 return __mod;
318 }
319 }
320
321 /* NOTREACHED */
322 return 0;
323}
324#else
325static inline __u32 xfs_do_div(void *a, __u32 b, int n)
326{
327 __u32 mod;
328
329 switch (n) {
330 case 4:
331 mod = *(__u32 *)a % b;
332 *(__u32 *)a = *(__u32 *)a / b;
333 return mod;
334 case 8:
335 mod = do_div(*(__u64 *)a, b);
336 return mod;
337 }
338
339 /* NOTREACHED */
340 return 0;
341}
342
343/* Side effect free 64 bit mod operation */
344static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
345{
346 switch (n) {
347 case 4:
348 return *(__u32 *)a % b;
349 case 8:
350 {
351 __u64 c = *(__u64 *)a;
352 return do_div(c, b);
353 }
354 }
355
356 /* NOTREACHED */
357 return 0;
358}
359#endif
360
361#undef do_div
362#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a))
363#define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a))
364
365static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
366{
367 x += y - 1;
368 do_div(x, y);
369 return(x * y);
370}
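/*
 * Illustrative usage (not from the original source): do_div() divides
 * its dividend in place and returns the remainder, while do_mod() is
 * side-effect free.  With a and off declared __uint64_t:
 *
 *	a = 4100;   do_div(a, 512);	now a == 8, remainder 4 returned
 *	off = 4100; do_mod(off, 512);	returns 4, off is still 4100
 *	roundup_64(4100, 512) == 4608
 */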
371
372#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
373
374#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
new file mode 100644
index 000000000000..ff145fd0d1a4
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -0,0 +1,1082 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32/*
33 * fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
34 *
35 */
36
37#include "xfs.h"
38
39#include "xfs_fs.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dir2.h"
47#include "xfs_alloc.h"
48#include "xfs_dmapi.h"
49#include "xfs_quota.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_btree.h"
55#include "xfs_ialloc.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode.h"
61#include "xfs_bmap.h"
62#include "xfs_bit.h"
63#include "xfs_rtalloc.h"
64#include "xfs_error.h"
65#include "xfs_itable.h"
66#include "xfs_rw.h"
67#include "xfs_acl.h"
68#include "xfs_cap.h"
69#include "xfs_mac.h"
70#include "xfs_attr.h"
71#include "xfs_inode_item.h"
72#include "xfs_buf_item.h"
73#include "xfs_utils.h"
74#include "xfs_iomap.h"
75
76#include <linux/capability.h>
77#include <linux/writeback.h>
78
79
80#if defined(XFS_RW_TRACE)
81void
82xfs_rw_enter_trace(
83 int tag,
84 xfs_iocore_t *io,
85 void *data,
86 size_t segs,
87 loff_t offset,
88 int ioflags)
89{
90 xfs_inode_t *ip = XFS_IO_INODE(io);
91
92 if (ip->i_rwtrace == NULL)
93 return;
94 ktrace_enter(ip->i_rwtrace,
95 (void *)(unsigned long)tag,
96 (void *)ip,
97 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
98 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
99 (void *)data,
100 (void *)((unsigned long)segs),
101 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
102 (void *)((unsigned long)(offset & 0xffffffff)),
103 (void *)((unsigned long)ioflags),
104 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
105 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
106 (void *)NULL,
107 (void *)NULL,
108 (void *)NULL,
109 (void *)NULL,
110 (void *)NULL);
111}
112
113void
114xfs_inval_cached_trace(
115 xfs_iocore_t *io,
116 xfs_off_t offset,
117 xfs_off_t len,
118 xfs_off_t first,
119 xfs_off_t last)
120{
121 xfs_inode_t *ip = XFS_IO_INODE(io);
122
123 if (ip->i_rwtrace == NULL)
124 return;
125 ktrace_enter(ip->i_rwtrace,
126 (void *)(__psint_t)XFS_INVAL_CACHED,
127 (void *)ip,
128 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
129 (void *)((unsigned long)(offset & 0xffffffff)),
130 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
131 (void *)((unsigned long)(len & 0xffffffff)),
132 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
133 (void *)((unsigned long)(first & 0xffffffff)),
134 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
135 (void *)((unsigned long)(last & 0xffffffff)),
136 (void *)NULL,
137 (void *)NULL,
138 (void *)NULL,
139 (void *)NULL,
140 (void *)NULL,
141 (void *)NULL);
142}
143#endif
144
145/*
146 * xfs_iozero
147 *
 148 * xfs_iozero clears the specified range of the supplied buffer,
149 * and marks all the affected blocks as valid and modified. If
150 * an affected block is not allocated, it will be allocated. If
151 * an affected block is not completely overwritten, and is not
152 * valid before the operation, it will be read from disk before
153 * being partially zeroed.
154 */
155STATIC int
156xfs_iozero(
157 struct inode *ip, /* inode */
158 loff_t pos, /* offset in file */
159 size_t count, /* size of data to zero */
160 loff_t end_size) /* max file size to set */
161{
162 unsigned bytes;
163 struct page *page;
164 struct address_space *mapping;
165 char *kaddr;
166 int status;
167
168 mapping = ip->i_mapping;
169 do {
170 unsigned long index, offset;
171
172 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
173 index = pos >> PAGE_CACHE_SHIFT;
174 bytes = PAGE_CACHE_SIZE - offset;
175 if (bytes > count)
176 bytes = count;
177
178 status = -ENOMEM;
179 page = grab_cache_page(mapping, index);
180 if (!page)
181 break;
182
183 kaddr = kmap(page);
184 status = mapping->a_ops->prepare_write(NULL, page, offset,
185 offset + bytes);
186 if (status) {
187 goto unlock;
188 }
189
190 memset((void *) (kaddr + offset), 0, bytes);
191 flush_dcache_page(page);
192 status = mapping->a_ops->commit_write(NULL, page, offset,
193 offset + bytes);
194 if (!status) {
195 pos += bytes;
196 count -= bytes;
197 if (pos > i_size_read(ip))
198 i_size_write(ip, pos < end_size ? pos : end_size);
199 }
200
201unlock:
202 kunmap(page);
203 unlock_page(page);
204 page_cache_release(page);
205 if (status)
206 break;
207 } while (count);
208
209 return (-status);
210}
211
212/*
213 * xfs_inval_cached_pages
214 *
215 * This routine is responsible for keeping direct I/O and buffered I/O
216 * somewhat coherent. From here we make sure that we're at least
217 * temporarily holding the inode I/O lock exclusively and then call
218 * the page cache to flush and invalidate any cached pages. If there
219 * are no cached pages this routine will be very quick.
220 */
221void
222xfs_inval_cached_pages(
223 vnode_t *vp,
224 xfs_iocore_t *io,
225 xfs_off_t offset,
226 int write,
227 int relock)
228{
229 if (VN_CACHED(vp)) {
230 xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
231 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
232 }
233
234}
235
236ssize_t /* bytes read, or (-) error */
237xfs_read(
238 bhv_desc_t *bdp,
239 struct kiocb *iocb,
240 const struct iovec *iovp,
241 unsigned int segs,
242 loff_t *offset,
243 int ioflags,
244 cred_t *credp)
245{
246 struct file *file = iocb->ki_filp;
247 struct inode *inode = file->f_mapping->host;
248 size_t size = 0;
249 ssize_t ret;
250 xfs_fsize_t n;
251 xfs_inode_t *ip;
252 xfs_mount_t *mp;
253 vnode_t *vp;
254 unsigned long seg;
255
256 ip = XFS_BHVTOI(bdp);
257 vp = BHV_TO_VNODE(bdp);
258 mp = ip->i_mount;
259
260 XFS_STATS_INC(xs_read_calls);
261
262 /* START copy & waste from filemap.c */
263 for (seg = 0; seg < segs; seg++) {
264 const struct iovec *iv = &iovp[seg];
265
266 /*
267 * If any segment has a negative length, or the cumulative
268 * length ever wraps negative then return -EINVAL.
269 */
270 size += iv->iov_len;
271 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 272			return -XFS_ERROR(EINVAL);
273 }
274 /* END copy & waste from filemap.c */
275
276 if (unlikely(ioflags & IO_ISDIRECT)) {
277 xfs_buftarg_t *target =
278 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
279 mp->m_rtdev_targp : mp->m_ddev_targp;
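		/*
		 * pbr_smask is the sector-size mask (e.g. 511 for
		 * 512-byte sectors): direct I/O requires both the
		 * offset and the total length to be sector aligned.
		 */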
280 if ((*offset & target->pbr_smask) ||
281 (size & target->pbr_smask)) {
282 if (*offset == ip->i_d.di_size) {
283 return (0);
284 }
285 return -XFS_ERROR(EINVAL);
286 }
287 }
288
289 n = XFS_MAXIOFFSET(mp) - *offset;
290 if ((n <= 0) || (size == 0))
291 return 0;
292
293 if (n < size)
294 size = n;
295
296 if (XFS_FORCED_SHUTDOWN(mp)) {
297 return -EIO;
298 }
299
300 if (unlikely(ioflags & IO_ISDIRECT))
301 down(&inode->i_sem);
302 xfs_ilock(ip, XFS_IOLOCK_SHARED);
303
304 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
305 !(ioflags & IO_INVIS)) {
306 vrwlock_t locktype = VRWLOCK_READ;
307
308 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ,
309 BHV_TO_VNODE(bdp), *offset, size,
310 FILP_DELAY_FLAG(file), &locktype);
311 if (ret) {
312 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
313 goto unlock_isem;
314 }
315 }
316
317 xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
318 (void *)iovp, segs, *offset, ioflags);
319 ret = __generic_file_aio_read(iocb, iovp, segs, offset);
320 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
321 ret = wait_on_sync_kiocb(iocb);
322 if (ret > 0)
323 XFS_STATS_ADD(xs_read_bytes, ret);
324
325 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
326
327 if (likely(!(ioflags & IO_INVIS)))
328 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
329
330unlock_isem:
331 if (unlikely(ioflags & IO_ISDIRECT))
332 up(&inode->i_sem);
333 return ret;
334}
335
336ssize_t
337xfs_sendfile(
338 bhv_desc_t *bdp,
339 struct file *filp,
340 loff_t *offset,
341 int ioflags,
342 size_t count,
343 read_actor_t actor,
344 void *target,
345 cred_t *credp)
346{
347 ssize_t ret;
348 xfs_fsize_t n;
349 xfs_inode_t *ip;
350 xfs_mount_t *mp;
351 vnode_t *vp;
352
353 ip = XFS_BHVTOI(bdp);
354 vp = BHV_TO_VNODE(bdp);
355 mp = ip->i_mount;
356
357 XFS_STATS_INC(xs_read_calls);
358
359 n = XFS_MAXIOFFSET(mp) - *offset;
360 if ((n <= 0) || (count == 0))
361 return 0;
362
363 if (n < count)
364 count = n;
365
366 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
367 return -EIO;
368
369 xfs_ilock(ip, XFS_IOLOCK_SHARED);
370
371 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
372 (!(ioflags & IO_INVIS))) {
373 vrwlock_t locktype = VRWLOCK_READ;
374 int error;
375
376 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
377 FILP_DELAY_FLAG(filp), &locktype);
378 if (error) {
379 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
380 return -error;
381 }
382 }
383 xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
384 (void *)(unsigned long)target, count, *offset, ioflags);
385 ret = generic_file_sendfile(filp, offset, count, actor, target);
386
387 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
388
389 if (ret > 0)
390 XFS_STATS_ADD(xs_read_bytes, ret);
391
392 if (likely(!(ioflags & IO_INVIS)))
393 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
394
395 return ret;
396}
397
398/*
399 * This routine is called to handle zeroing any space in the last
400 * block of the file that is beyond the EOF. We do this since the
401 * size is being increased without writing anything to that block
402 * and we don't want anyone to read the garbage on the disk.
403 */
404STATIC int /* error (positive) */
405xfs_zero_last_block(
406 struct inode *ip,
407 xfs_iocore_t *io,
408 xfs_off_t offset,
409 xfs_fsize_t isize,
410 xfs_fsize_t end_size)
411{
412 xfs_fileoff_t last_fsb;
413 xfs_mount_t *mp;
414 int nimaps;
415 int zero_offset;
416 int zero_len;
417 int isize_fsb_offset;
418 int error = 0;
419 xfs_bmbt_irec_t imap;
420 loff_t loff;
421 size_t lsize;
422
423 ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
424 ASSERT(offset > isize);
425
426 mp = io->io_mount;
427
428 isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
429 if (isize_fsb_offset == 0) {
430 /*
431 * There are no extra bytes in the last block on disk to
432 * zero, so return.
433 */
434 return 0;
435 }
436
437 last_fsb = XFS_B_TO_FSBT(mp, isize);
438 nimaps = 1;
439 error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
440 &nimaps, NULL);
441 if (error) {
442 return error;
443 }
444 ASSERT(nimaps > 0);
445 /*
446 * If the block underlying isize is just a hole, then there
447 * is nothing to zero.
448 */
449 if (imap.br_startblock == HOLESTARTBLOCK) {
450 return 0;
451 }
452 /*
453 * Zero the part of the last block beyond the EOF, and write it
454 * out sync. We need to drop the ilock while we do this so we
455 * don't deadlock when the buffer cache calls back to us.
456 */
457 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
458 loff = XFS_FSB_TO_B(mp, last_fsb);
459 lsize = XFS_FSB_TO_B(mp, 1);
460
461 zero_offset = isize_fsb_offset;
462 zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
463
464 error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
465
466 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
467 ASSERT(error >= 0);
468 return error;
469}
470
471/*
472 * Zero any on disk space between the current EOF and the new,
473 * larger EOF. This handles the normal case of zeroing the remainder
474 * of the last block in the file and the unusual case of zeroing blocks
475 * out beyond the size of the file. This second case only happens
476 * with fixed size extents and when the system crashes before the inode
477 * size was updated but after blocks were allocated. If fill is set,
478 * then any holes in the range are filled and zeroed. If not, the holes
479 * are left alone as holes.
480 */
481
482int /* error (positive) */
483xfs_zero_eof(
484 vnode_t *vp,
485 xfs_iocore_t *io,
486 xfs_off_t offset, /* starting I/O offset */
487 xfs_fsize_t isize, /* current inode size */
488 xfs_fsize_t end_size) /* terminal inode size */
489{
490 struct inode *ip = LINVFS_GET_IP(vp);
491 xfs_fileoff_t start_zero_fsb;
492 xfs_fileoff_t end_zero_fsb;
493 xfs_fileoff_t prev_zero_fsb;
494 xfs_fileoff_t zero_count_fsb;
495 xfs_fileoff_t last_fsb;
496 xfs_extlen_t buf_len_fsb;
497 xfs_extlen_t prev_zero_count;
498 xfs_mount_t *mp;
499 int nimaps;
500 int error = 0;
501 xfs_bmbt_irec_t imap;
502 loff_t loff;
503 size_t lsize;
504
505 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
506 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
507
508 mp = io->io_mount;
509
510 /*
511 * First handle zeroing the block on which isize resides.
512 * We only zero a part of that block so it is handled specially.
513 */
514 error = xfs_zero_last_block(ip, io, offset, isize, end_size);
515 if (error) {
516 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
517 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
518 return error;
519 }
520
521 /*
522 * Calculate the range between the new size and the old
523 * where blocks needing to be zeroed may exist. To get the
524 * block where the last byte in the file currently resides,
525 * we need to subtract one from the size and truncate back
526 * to a block boundary. We subtract 1 in case the size is
527 * exactly on a block boundary.
528 */
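	/*
	 * e.g. with 4k blocks and isize == 8192 the last byte is at
	 * offset 8191, so last_fsb == 1 and start_zero_fsb == 2.
	 */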
529 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
530 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
531 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
532 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
533 if (last_fsb == end_zero_fsb) {
534 /*
535 * The size was only incremented on its last block.
536 * We took care of that above, so just return.
537 */
538 return 0;
539 }
540
541 ASSERT(start_zero_fsb <= end_zero_fsb);
542 prev_zero_fsb = NULLFILEOFF;
543 prev_zero_count = 0;
544 while (start_zero_fsb <= end_zero_fsb) {
545 nimaps = 1;
546 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
547 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
548 0, NULL, 0, &imap, &nimaps, NULL);
549 if (error) {
550 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
551 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
552 return error;
553 }
554 ASSERT(nimaps > 0);
555
556 if (imap.br_state == XFS_EXT_UNWRITTEN ||
557 imap.br_startblock == HOLESTARTBLOCK) {
558 /*
559 * This loop handles initializing pages that were
560 * partially initialized by the code below this
561 * loop. It basically zeroes the part of the page
562 * that sits on a hole and sets the page as P_HOLE
563 * and calls remapf if it is a mapped file.
564 */
565 prev_zero_fsb = NULLFILEOFF;
566 prev_zero_count = 0;
567 start_zero_fsb = imap.br_startoff +
568 imap.br_blockcount;
569 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
570 continue;
571 }
572
573 /*
574 * There are blocks in the range requested.
575 * Zero them a single write at a time. We actually
576 * don't zero the entire range returned if it is
577 * too big and simply loop around to get the rest.
578 * That is not the most efficient thing to do, but it
579 * is simple and this path should not be exercised often.
580 */
581 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
582 mp->m_writeio_blocks << 8);
583 /*
584 * Drop the inode lock while we're doing the I/O.
585 * We'll still have the iolock to protect us.
586 */
587 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
588
589 loff = XFS_FSB_TO_B(mp, start_zero_fsb);
590 lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
591
592 error = xfs_iozero(ip, loff, lsize, end_size);
593
594 if (error) {
595 goto out_lock;
596 }
597
598 prev_zero_fsb = start_zero_fsb;
599 prev_zero_count = buf_len_fsb;
600 start_zero_fsb = imap.br_startoff + buf_len_fsb;
601 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
602
603 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
604 }
605
606 return 0;
607
608out_lock:
609
610 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
611 ASSERT(error >= 0);
612 return error;
613}
614
615ssize_t /* bytes written, or (-) error */
616xfs_write(
617 bhv_desc_t *bdp,
618 struct kiocb *iocb,
619 const struct iovec *iovp,
620 unsigned int nsegs,
621 loff_t *offset,
622 int ioflags,
623 cred_t *credp)
624{
625 struct file *file = iocb->ki_filp;
626 struct address_space *mapping = file->f_mapping;
627 struct inode *inode = mapping->host;
628 unsigned long segs = nsegs;
629 xfs_inode_t *xip;
630 xfs_mount_t *mp;
631 ssize_t ret = 0, error = 0;
632 xfs_fsize_t isize, new_size;
633 xfs_iocore_t *io;
634 vnode_t *vp;
635 unsigned long seg;
636 int iolock;
637 int eventsent = 0;
638 vrwlock_t locktype;
639 size_t ocount = 0, count;
640 loff_t pos;
641 int need_isem = 1, need_flush = 0;
642
643 XFS_STATS_INC(xs_write_calls);
644
645 vp = BHV_TO_VNODE(bdp);
646 xip = XFS_BHVTOI(bdp);
647
648 for (seg = 0; seg < segs; seg++) {
649 const struct iovec *iv = &iovp[seg];
650
651 /*
652 * If any segment has a negative length, or the cumulative
653 * length ever wraps negative then return -EINVAL.
654 */
655 ocount += iv->iov_len;
656 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
657 return -EINVAL;
658 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
659 continue;
660 if (seg == 0)
661 return -EFAULT;
662 segs = seg;
663 ocount -= iv->iov_len; /* This segment is no good */
664 break;
665 }
666
667 count = ocount;
668 pos = *offset;
669
670 if (count == 0)
671 return 0;
672
673 io = &xip->i_iocore;
674 mp = io->io_mount;
675
676 if (XFS_FORCED_SHUTDOWN(mp))
677 return -EIO;
678
679 fs_check_frozen(vp->v_vfsp, SB_FREEZE_WRITE);
680
681 if (ioflags & IO_ISDIRECT) {
682 xfs_buftarg_t *target =
683 (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
684 mp->m_rtdev_targp : mp->m_ddev_targp;
685
686 if ((pos & target->pbr_smask) || (count & target->pbr_smask))
 687		return -XFS_ERROR(EINVAL);
688
689 if (!VN_CACHED(vp) && pos < i_size_read(inode))
690 need_isem = 0;
691
692 if (VN_CACHED(vp))
693 need_flush = 1;
694 }
695
696relock:
697 if (need_isem) {
698 iolock = XFS_IOLOCK_EXCL;
699 locktype = VRWLOCK_WRITE;
700
701 down(&inode->i_sem);
702 } else {
703 iolock = XFS_IOLOCK_SHARED;
704 locktype = VRWLOCK_WRITE_DIRECT;
705 }
706
707 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
708
709 isize = i_size_read(inode);
710
711 if (file->f_flags & O_APPEND)
712 *offset = isize;
713
714start:
715 error = -generic_write_checks(file, &pos, &count,
716 S_ISBLK(inode->i_mode));
717 if (error) {
718 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
719 goto out_unlock_isem;
720 }
721
722 new_size = pos + count;
723 if (new_size > isize)
724 io->io_new_size = new_size;
725
726 if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
727 !(ioflags & IO_INVIS) && !eventsent)) {
728 loff_t savedsize = pos;
729 int dmflags = FILP_DELAY_FLAG(file);
730
731 if (need_isem)
732 dmflags |= DM_FLAGS_ISEM;
733
734 xfs_iunlock(xip, XFS_ILOCK_EXCL);
735 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
736 pos, count,
737 dmflags, &locktype);
738 if (error) {
739 xfs_iunlock(xip, iolock);
740 goto out_unlock_isem;
741 }
742 xfs_ilock(xip, XFS_ILOCK_EXCL);
743 eventsent = 1;
744
745 /*
 746	 * The iolock was dropped and reacquired in XFS_SEND_DATA
747 * so we have to recheck the size when appending.
748 * We will only "goto start;" once, since having sent the
749 * event prevents another call to XFS_SEND_DATA, which is
750 * what allows the size to change in the first place.
751 */
752 if ((file->f_flags & O_APPEND) && savedsize != isize) {
753 pos = isize = xip->i_d.di_size;
754 goto start;
755 }
756 }
757
758 /*
759 * On Linux, generic_file_write updates the times even if
760 * no data is copied in so long as the write had a size.
761 *
 762	 * We must update the XFS times here, or a later revalidate would copy the stale Linux times back over them.
763 */
764 if (!(ioflags & IO_INVIS)) {
765 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
766 inode_update_time(inode, 1);
767 }
768
769 /*
770 * If the offset is beyond the size of the file, we have a couple
771 * of things to do. First, if there is already space allocated
772 * we need to either create holes or zero the disk or ...
773 *
774 * If there is a page where the previous size lands, we need
775 * to zero it out up to the new size.
776 */
777
778 if (pos > isize) {
779 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos,
780 isize, pos + count);
781 if (error) {
782 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
783 goto out_unlock_isem;
784 }
785 }
786 xfs_iunlock(xip, XFS_ILOCK_EXCL);
787
788 /*
789 * If we're writing the file then make sure to clear the
790 * setuid and setgid bits if the process is not being run
791 * by root. This keeps people from modifying setuid and
792 * setgid binaries.
793 */
794
795 if (((xip->i_d.di_mode & S_ISUID) ||
796 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
797 (S_ISGID | S_IXGRP))) &&
798 !capable(CAP_FSETID)) {
799 error = xfs_write_clear_setuid(xip);
800 if (likely(!error))
801 error = -remove_suid(file->f_dentry);
802 if (unlikely(error)) {
803 xfs_iunlock(xip, iolock);
804 goto out_unlock_isem;
805 }
806 }
807
808retry:
809 /* We can write back this queue in page reclaim */
810 current->backing_dev_info = mapping->backing_dev_info;
811
812 if ((ioflags & IO_ISDIRECT)) {
813 if (need_flush) {
814 xfs_inval_cached_trace(io, pos, -1,
815 ctooff(offtoct(pos)), -1);
816 VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(pos)),
817 -1, FI_REMAPF_LOCKED);
818 }
819
820 if (need_isem) {
821 /* demote the lock now the cached pages are gone */
822 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
823 up(&inode->i_sem);
824
825 iolock = XFS_IOLOCK_SHARED;
826 locktype = VRWLOCK_WRITE_DIRECT;
827 need_isem = 0;
828 }
829
830 xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs,
831 *offset, ioflags);
832 ret = generic_file_direct_write(iocb, iovp,
833 &segs, pos, offset, count, ocount);
834
835 /*
836 * direct-io write to a hole: fall through to buffered I/O
837 * for completing the rest of the request.
838 */
839 if (ret >= 0 && ret != count) {
840 XFS_STATS_ADD(xs_write_bytes, ret);
841
842 pos += ret;
843 count -= ret;
844
845 need_isem = 1;
846 ioflags &= ~IO_ISDIRECT;
847 xfs_iunlock(xip, iolock);
848 goto relock;
849 }
850 } else {
851 xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs,
852 *offset, ioflags);
853 ret = generic_file_buffered_write(iocb, iovp, segs,
854 pos, offset, count, ret);
855 }
856
857 current->backing_dev_info = NULL;
858
859 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
860 ret = wait_on_sync_kiocb(iocb);
861
862 if ((ret == -ENOSPC) &&
863 DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
864 !(ioflags & IO_INVIS)) {
865
866 xfs_rwunlock(bdp, locktype);
867 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
868 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
869 0, 0, 0); /* Delay flag intentionally unused */
870 if (error)
871 goto out_unlock_isem;
872 xfs_rwlock(bdp, locktype);
873 pos = xip->i_d.di_size;
874 ret = 0;
875 goto retry;
876 }
877
878 if (*offset > xip->i_d.di_size) {
879 xfs_ilock(xip, XFS_ILOCK_EXCL);
880 if (*offset > xip->i_d.di_size) {
881 xip->i_d.di_size = *offset;
882 i_size_write(inode, *offset);
883 xip->i_update_core = 1;
884 xip->i_update_size = 1;
885 }
886 xfs_iunlock(xip, XFS_ILOCK_EXCL);
887 }
888
889 error = -ret;
890 if (ret <= 0)
891 goto out_unlock_internal;
892
893 XFS_STATS_ADD(xs_write_bytes, ret);
894
895 /* Handle various SYNC-type writes */
896 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
897 /*
898 * If we're treating this as O_DSYNC and we have not updated the
899 * size, force the log.
900 */
901 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
902 !(xip->i_update_size)) {
903 xfs_inode_log_item_t *iip = xip->i_itemp;
904
905 /*
906 * If an allocation transaction occurred
907 * without extending the size, then we have to force
908 * the log up the proper point to ensure that the
909 * allocation is permanent. We can't count on
910 * the fact that buffered writes lock out direct I/O
911 * writes - the direct I/O write could have extended
912 * the size nontransactionally, then finished before
 913	 * we started.  xfs_write() would then think that the file
914 * didn't grow but the update isn't safe unless the
915 * size change is logged.
916 *
917 * Force the log if we've committed a transaction
918 * against the inode or if someone else has and
919 * the commit record hasn't gone to disk (e.g.
920 * the inode is pinned). This guarantees that
921 * all changes affecting the inode are permanent
922 * when we return.
923 */
924 if (iip && iip->ili_last_lsn) {
925 xfs_log_force(mp, iip->ili_last_lsn,
926 XFS_LOG_FORCE | XFS_LOG_SYNC);
927 } else if (xfs_ipincount(xip) > 0) {
928 xfs_log_force(mp, (xfs_lsn_t)0,
929 XFS_LOG_FORCE | XFS_LOG_SYNC);
930 }
931
932 } else {
933 xfs_trans_t *tp;
934
935 /*
936 * O_SYNC or O_DSYNC _with_ a size update are handled
937 * the same way.
938 *
939 * If the write was synchronous then we need to make
940 * sure that the inode modification time is permanent.
941 * We'll have updated the timestamp above, so here
942 * we use a synchronous transaction to log the inode.
943 * It's not fast, but it's necessary.
944 *
945 * If this a dsync write and the size got changed
946 * non-transactionally, then we need to ensure that
947 * the size change gets logged in a synchronous
948 * transaction.
949 */
950
951 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
952 if ((error = xfs_trans_reserve(tp, 0,
953 XFS_SWRITE_LOG_RES(mp),
954 0, 0, 0))) {
955 /* Transaction reserve failed */
956 xfs_trans_cancel(tp, 0);
957 } else {
958 /* Transaction reserve successful */
959 xfs_ilock(xip, XFS_ILOCK_EXCL);
960 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
961 xfs_trans_ihold(tp, xip);
962 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
963 xfs_trans_set_sync(tp);
964 error = xfs_trans_commit(tp, 0, NULL);
965 xfs_iunlock(xip, XFS_ILOCK_EXCL);
966 }
967 if (error)
968 goto out_unlock_internal;
969 }
970
971 xfs_rwunlock(bdp, locktype);
972 if (need_isem)
973 up(&inode->i_sem);
974
975 error = sync_page_range(inode, mapping, pos, ret);
976 if (!error)
977 error = ret;
978 return error;
979 }
980
981 out_unlock_internal:
982 xfs_rwunlock(bdp, locktype);
983 out_unlock_isem:
984 if (need_isem)
985 up(&inode->i_sem);
986 return -error;
987}
988
989/*
990 * All xfs metadata buffers except log state machine buffers
991 * get this attached as their b_bdstrat callback function.
992 * This is so that we can catch a buffer
993 * after prematurely unpinning it to forcibly shutdown the filesystem.
994 */
995int
996xfs_bdstrat_cb(struct xfs_buf *bp)
997{
998 xfs_mount_t *mp;
999
1000 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
1001 if (!XFS_FORCED_SHUTDOWN(mp)) {
1002 pagebuf_iorequest(bp);
1003 return 0;
1004 } else {
1005 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
1006 /*
1007 * Metadata write that didn't get logged but
1008 * written delayed anyway. These aren't associated
1009 * with a transaction, and can be ignored.
1010 */
1011 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
1012 (XFS_BUF_ISREAD(bp)) == 0)
1013 return (xfs_bioerror_relse(bp));
1014 else
1015 return (xfs_bioerror(bp));
1016 }
1017}
1018
1019
1020int
1021xfs_bmap(bhv_desc_t *bdp,
1022 xfs_off_t offset,
1023 ssize_t count,
1024 int flags,
1025 xfs_iomap_t *iomapp,
1026 int *niomaps)
1027{
1028 xfs_inode_t *ip = XFS_BHVTOI(bdp);
1029 xfs_iocore_t *io = &ip->i_iocore;
1030
1031 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
1032 ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
1033 ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
1034
1035 return xfs_iomap(io, offset, count, flags, iomapp, niomaps);
1036}
1037
1038/*
1039 * Wrapper around bdstrat so that we can stop data
1040 * from going to disk in case we are shutting down the filesystem.
 1041 * Typically user data goes through this path; one of the exceptions
1042 * is the superblock.
1043 */
1044int
1045xfsbdstrat(
1046 struct xfs_mount *mp,
1047 struct xfs_buf *bp)
1048{
1049 ASSERT(mp);
1050 if (!XFS_FORCED_SHUTDOWN(mp)) {
1051 /* Grio redirection would go here
1052 * if (XFS_BUF_IS_GRIO(bp)) {
1053 */
1054
1055 pagebuf_iorequest(bp);
1056 return 0;
1057 }
1058
1059 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
1060 return (xfs_bioerror_relse(bp));
1061}
1062
1063/*
1064 * If the underlying (data/log/rt) device is readonly, there are some
1065 * operations that cannot proceed.
1066 */
1067int
1068xfs_dev_is_read_only(
1069 xfs_mount_t *mp,
1070 char *message)
1071{
1072 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
1073 xfs_readonly_buftarg(mp->m_logdev_targp) ||
1074 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1075 cmn_err(CE_NOTE,
1076 "XFS: %s required on read-only device.", message);
1077 cmn_err(CE_NOTE,
1078 "XFS: write access unavailable, cannot proceed.");
1079 return EROFS;
1080 }
1081 return 0;
1082}
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
new file mode 100644
index 000000000000..d723e35254a0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LRW_H__
33#define __XFS_LRW_H__
34
35struct vnode;
36struct bhv_desc;
37struct xfs_mount;
38struct xfs_iocore;
39struct xfs_inode;
40struct xfs_bmbt_irec;
41struct xfs_buf;
42struct xfs_iomap;
43
44#if defined(XFS_RW_TRACE)
45/*
46 * Defines for the trace mechanisms in xfs_lrw.c.
47 */
48#define XFS_RW_KTRACE_SIZE 128
49
50#define XFS_READ_ENTER 1
51#define XFS_WRITE_ENTER 2
52#define XFS_IOMAP_READ_ENTER 3
53#define XFS_IOMAP_WRITE_ENTER 4
54#define XFS_IOMAP_READ_MAP 5
55#define XFS_IOMAP_WRITE_MAP 6
56#define XFS_IOMAP_WRITE_NOSPACE 7
57#define XFS_ITRUNC_START 8
58#define XFS_ITRUNC_FINISH1 9
59#define XFS_ITRUNC_FINISH2 10
60#define XFS_CTRUNC1 11
61#define XFS_CTRUNC2 12
62#define XFS_CTRUNC3 13
63#define XFS_CTRUNC4 14
64#define XFS_CTRUNC5 15
65#define XFS_CTRUNC6 16
66#define XFS_BUNMAPI 17
67#define XFS_INVAL_CACHED 18
68#define XFS_DIORD_ENTER 19
69#define XFS_DIOWR_ENTER 20
70#define XFS_SENDFILE_ENTER 21
71#define XFS_WRITEPAGE_ENTER 22
72#define XFS_RELEASEPAGE_ENTER 23
73#define XFS_IOMAP_ALLOC_ENTER 24
74#define XFS_IOMAP_ALLOC_MAP 25
75#define XFS_IOMAP_UNWRITTEN 26
76extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
77 void *, size_t, loff_t, int);
78extern void xfs_inval_cached_trace(struct xfs_iocore *,
79 xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t);
80#else
81#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags)
82#define xfs_inval_cached_trace(io, offset, len, first, last)
83#endif
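
When XFS_RW_TRACE is off, the trace entry points compile away to empty macros, so call sites need no #ifdefs of their own. A tiny self-contained sketch of the same idiom (my_trace is a made-up name; note that an empty macro also silently discards any side effects in its arguments):

#include <stdio.h>

/* Toggle at build time, e.g. cc -DMY_TRACE ... */
#ifdef MY_TRACE
# define my_trace(tag, val)	printf("trace %s: %d\n", (tag), (val))
#else
# define my_trace(tag, val)	/* compiles away to nothing */
#endif

int main(void)
{
	my_trace("enter", 42);	/* no #ifdef needed at the call site */
	return 0;
}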
84
85/*
86 * Maximum count of bmaps used by read and write paths.
87 */
88#define XFS_MAX_RW_NBMAPS 4
89
90extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int,
91 struct xfs_iomap *, int *);
92extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
93extern int xfs_bdstrat_cb(struct xfs_buf *);
94
95extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t,
96 xfs_fsize_t, xfs_fsize_t);
97extern void xfs_inval_cached_pages(struct vnode *, struct xfs_iocore *,
98 xfs_off_t, int, int);
99extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *,
100 const struct iovec *, unsigned int,
101 loff_t *, int, struct cred *);
102extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
103 const struct iovec *, unsigned int,
104 loff_t *, int, struct cred *);
105extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
106 loff_t *, int, size_t, read_actor_t,
107 void *, struct cred *);
108
109extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
110
111#define XFS_FSB_TO_DB_IO(io,fsb) \
112 (((io)->io_flags & XFS_IOCORE_RT) ? \
113 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
114 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
115
116#endif /* __XFS_LRW_H__ */
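
XFS_FSB_TO_DB_IO picks one of two filesystem-block-to-disk-address conversions: realtime files map block-for-block into the rt device, while ordinary files go through the data-device mapping, which also encodes allocation-group placement. A userspace sketch of the selection; the conversion arithmetic (the factor of 8, the 1024 offset) is invented purely for illustration, since the real values come from the mount geometry:

#include <stdio.h>

/* Hypothetical stand-ins for the two conversions the macro selects
 * between; real XFS derives both from the mount structure. */
#define IOCORE_RT		0x1
#define FSB_TO_BB(fsb)		((fsb) * 8)		/* rt: fs block -> 512B basic blocks */
#define FSB_TO_DADDR(fsb)	((fsb) * 8 + 1024)	/* data dev: adds AG placement */

struct mock_iocore { int flags; };

#define FSB_TO_DB_IO(io, fsb) \
	(((io)->flags & IOCORE_RT) ? FSB_TO_BB(fsb) : FSB_TO_DADDR(fsb))

int main(void)
{
	struct mock_iocore rt = { IOCORE_RT }, data = { 0 };

	printf("rt file,   fsb 10 -> daddr %d\n", FSB_TO_DB_IO(&rt, 10));
	printf("data file, fsb 10 -> daddr %d\n", FSB_TO_DB_IO(&data, 10));
	return 0;
}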
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
new file mode 100644
index 000000000000..aaf5ddba47f3
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -0,0 +1,132 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include <linux/proc_fs.h>
35
36DEFINE_PER_CPU(struct xfsstats, xfsstats);
37
38STATIC int
39xfs_read_xfsstats(
40 char *buffer,
41 char **start,
42 off_t offset,
43 int count,
44 int *eof,
45 void *data)
46{
47 int c, i, j, len, val;
48 __uint64_t xs_xstrat_bytes = 0;
49 __uint64_t xs_write_bytes = 0;
50 __uint64_t xs_read_bytes = 0;
51
52 static struct xstats_entry {
53 char *desc;
54 int endpoint;
55 } xstats[] = {
56 { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
57 { "abt", XFSSTAT_END_ALLOC_BTREE },
58 { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
59 { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
60 { "dir", XFSSTAT_END_DIRECTORY_OPS },
61 { "trans", XFSSTAT_END_TRANSACTIONS },
62 { "ig", XFSSTAT_END_INODE_OPS },
63 { "log", XFSSTAT_END_LOG_OPS },
64 { "push_ail", XFSSTAT_END_TAIL_PUSHING },
65 { "xstrat", XFSSTAT_END_WRITE_CONVERT },
66 { "rw", XFSSTAT_END_READ_WRITE_OPS },
67 { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
68 { "icluster", XFSSTAT_END_INODE_CLUSTER },
69 { "vnodes", XFSSTAT_END_VNODE_OPS },
70 { "buf", XFSSTAT_END_BUF },
71 };
72
73 /* Loop over all stats groups */
 74	for (i = j = len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
 75		len += sprintf(buffer + len, "%s", xstats[i].desc);
76 /* inner loop does each group */
77 while (j < xstats[i].endpoint) {
78 val = 0;
79 /* sum over all cpus */
80 for (c = 0; c < NR_CPUS; c++) {
81 if (!cpu_possible(c)) continue;
82 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
83 }
84 len += sprintf(buffer + len, " %u", val);
85 j++;
86 }
87 buffer[len++] = '\n';
88 }
89 /* extra precision counters */
90 for (i = 0; i < NR_CPUS; i++) {
91 if (!cpu_possible(i)) continue;
92 xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
93 xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
94 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
95 }
96
97 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
98 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
99 len += sprintf(buffer + len, "debug %u\n",
100#if defined(DEBUG)
101 1);
102#else
103 0);
104#endif
105
106 if (offset >= len) {
107 *start = buffer;
108 *eof = 1;
109 return 0;
110 }
111 *start = buffer + offset;
112 if ((len -= offset) > count)
113 return count;
114 *eof = 1;
115
116 return len;
117}
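
The reader above treats each per-CPU struct xfsstats as one flat array of __u32 counters: the cumulative XFSSTAT_END_* values say where each named group stops, the index j carries across groups, and each counter is summed over all possible CPUs. A self-contained sketch of the same walk with two mock "CPUs" and two groups:

#include <stdio.h>
#include <stdint.h>

#define END_ALLOC	2	/* counters 0..1 */
#define END_BTREE	5	/* counters 2..4: cumulative, like XFSSTAT_END_* */
#define NCPUS		2

static const struct { const char *desc; int endpoint; } groups[] = {
	{ "alloc", END_ALLOC },
	{ "btree", END_BTREE },
};

/* Stand-in for per_cpu(xfsstats, c) viewed as a __u32 array. */
static uint32_t percpu[NCPUS][END_BTREE] = {
	{ 1, 2, 3, 4, 5 },
	{ 10, 20, 30, 40, 50 },
};

int main(void)
{
	int i, j = 0, c;

	for (i = 0; i < (int)(sizeof(groups)/sizeof(groups[0])); i++) {
		printf("%s", groups[i].desc);
		while (j < groups[i].endpoint) {	/* j carries across groups */
			uint32_t val = 0;
			for (c = 0; c < NCPUS; c++)
				val += percpu[c][j];
			printf(" %u", val);
			j++;
		}
		printf("\n");
	}
	return 0;
}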
118
119void
120xfs_init_procfs(void)
121{
122 if (!proc_mkdir("fs/xfs", NULL))
123 return;
124 create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
125}
126
127void
128xfs_cleanup_procfs(void)
129{
130 remove_proc_entry("fs/xfs/stat", NULL);
131 remove_proc_entry("fs/xfs", NULL);
132}
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
new file mode 100644
index 000000000000..3f756a6c3eb0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -0,0 +1,166 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_STATS_H__
33#define __XFS_STATS_H__
34
35
36#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
37
38#include <linux/percpu.h>
39
40/*
41 * XFS global statistics
42 */
43struct xfsstats {
44# define XFSSTAT_END_EXTENT_ALLOC 4
45 __uint32_t xs_allocx;
46 __uint32_t xs_allocb;
47 __uint32_t xs_freex;
48 __uint32_t xs_freeb;
49# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
50 __uint32_t xs_abt_lookup;
51 __uint32_t xs_abt_compare;
52 __uint32_t xs_abt_insrec;
53 __uint32_t xs_abt_delrec;
54# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
55 __uint32_t xs_blk_mapr;
56 __uint32_t xs_blk_mapw;
57 __uint32_t xs_blk_unmap;
58 __uint32_t xs_add_exlist;
59 __uint32_t xs_del_exlist;
60 __uint32_t xs_look_exlist;
61 __uint32_t xs_cmp_exlist;
62# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
63 __uint32_t xs_bmbt_lookup;
64 __uint32_t xs_bmbt_compare;
65 __uint32_t xs_bmbt_insrec;
66 __uint32_t xs_bmbt_delrec;
67# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
68 __uint32_t xs_dir_lookup;
69 __uint32_t xs_dir_create;
70 __uint32_t xs_dir_remove;
71 __uint32_t xs_dir_getdents;
72# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
73 __uint32_t xs_trans_sync;
74 __uint32_t xs_trans_async;
75 __uint32_t xs_trans_empty;
76# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
77 __uint32_t xs_ig_attempts;
78 __uint32_t xs_ig_found;
79 __uint32_t xs_ig_frecycle;
80 __uint32_t xs_ig_missed;
81 __uint32_t xs_ig_dup;
82 __uint32_t xs_ig_reclaims;
83 __uint32_t xs_ig_attrchg;
84# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
85 __uint32_t xs_log_writes;
86 __uint32_t xs_log_blocks;
87 __uint32_t xs_log_noiclogs;
88 __uint32_t xs_log_force;
89 __uint32_t xs_log_force_sleep;
90# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
91 __uint32_t xs_try_logspace;
92 __uint32_t xs_sleep_logspace;
93 __uint32_t xs_push_ail;
94 __uint32_t xs_push_ail_success;
95 __uint32_t xs_push_ail_pushbuf;
96 __uint32_t xs_push_ail_pinned;
97 __uint32_t xs_push_ail_locked;
98 __uint32_t xs_push_ail_flushing;
99 __uint32_t xs_push_ail_restarts;
100 __uint32_t xs_push_ail_flush;
101# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
102 __uint32_t xs_xstrat_quick;
103 __uint32_t xs_xstrat_split;
104# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
105 __uint32_t xs_write_calls;
106 __uint32_t xs_read_calls;
107# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
108 __uint32_t xs_attr_get;
109 __uint32_t xs_attr_set;
110 __uint32_t xs_attr_remove;
111 __uint32_t xs_attr_list;
112# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
113 __uint32_t xs_iflush_count;
114 __uint32_t xs_icluster_flushcnt;
115 __uint32_t xs_icluster_flushinode;
116# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
117 __uint32_t vn_active; /* # vnodes not on free lists */
118 __uint32_t vn_alloc; /* # times vn_alloc called */
119 __uint32_t vn_get; /* # times vn_get called */
120 __uint32_t vn_hold; /* # times vn_hold called */
121 __uint32_t vn_rele; /* # times vn_rele called */
122 __uint32_t vn_reclaim; /* # times vn_reclaim called */
123 __uint32_t vn_remove; /* # times vn_remove called */
124 __uint32_t vn_free; /* # times vn_free called */
125#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
126 __uint32_t pb_get;
127 __uint32_t pb_create;
128 __uint32_t pb_get_locked;
129 __uint32_t pb_get_locked_waited;
130 __uint32_t pb_busy_locked;
131 __uint32_t pb_miss_locked;
132 __uint32_t pb_page_retries;
133 __uint32_t pb_page_found;
134 __uint32_t pb_get_read;
135/* Extra precision counters */
136 __uint64_t xs_xstrat_bytes;
137 __uint64_t xs_write_bytes;
138 __uint64_t xs_read_bytes;
139};
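
The layout contract here is that each XFSSTAT_END_* equals the previous endpoint plus the number of __uint32_t fields in its group, so the final XFSSTAT_END_BUF must equal the count of 32-bit counters preceding the 64-bit "extra precision" tail. A compile-time check one could write for a condensed mock of that layout (this assertion is not part of the original header):

#include <stddef.h>
#include <stdint.h>

/* Condensed mock: two 32-bit groups followed by a 64-bit tail,
 * mirroring the layout rule of struct xfsstats. */
struct mock_stats {
#define END_A	2
	uint32_t a0, a1;
#define END_B	(END_A + 4)
	uint32_t b0, b1, b2, b3;
	uint64_t extra;		/* like xs_xstrat_bytes & friends */
};

/* The reader indexes the struct as a flat uint32_t array up to the
 * last END_* value, so that value must match the 64-bit tail offset
 * (negative array size = compile error if the rule is violated). */
typedef char layout_ok[(offsetof(struct mock_stats, extra) ==
			END_B * sizeof(uint32_t)) ? 1 : -1];

int main(void) { return 0; }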
140
141DECLARE_PER_CPU(struct xfsstats, xfsstats);
142
143/*
144 * We don't disable preempt, not too worried about poking the
145 * wrong CPU's stat for now (also aggregated before reporting).
146 */
147#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
148#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
149#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
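
Call sites use these like plain counters and, as the comment notes, rely on the proc reader's cross-CPU summation to absorb any preemption races. A userspace mock of the same shape, with current_cpu() faked and a two-slot array standing in for the per-cpu data:

#include <stdio.h>

/* Userspace mock of the per-cpu counter macros. */
struct mock_stats { unsigned write_calls; unsigned long long write_bytes; };
static struct mock_stats stats[2];
#define current_cpu()		0	/* faked; really the executing CPU */
#define STATS_INC(v)		(stats[current_cpu()].v++)
#define STATS_ADD(v, inc)	(stats[current_cpu()].v += (inc))

int main(void)
{
	STATS_INC(write_calls);		/* e.g. one write call ... */
	STATS_ADD(write_bytes, 4096);	/* ... that moved 4096 bytes */
	printf("calls=%u bytes=%llu\n",
	       stats[0].write_calls, stats[0].write_bytes);
	return 0;
}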
150
151extern void xfs_init_procfs(void);
152extern void xfs_cleanup_procfs(void);
153
154
155#else /* !CONFIG_PROC_FS */
156
157# define XFS_STATS_INC(count)
158# define XFS_STATS_DEC(count)
159# define XFS_STATS_ADD(count, inc)
160
161static __inline void xfs_init_procfs(void) { };
162static __inline void xfs_cleanup_procfs(void) { };
163
164#endif /* !CONFIG_PROC_FS */
165
166#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
new file mode 100644
index 000000000000..53dc658cafa6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -0,0 +1,912 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_utils.h"
68#include "xfs_version.h"
69#include "xfs_ioctl32.h"
70
71#include <linux/namei.h>
72#include <linux/init.h>
73#include <linux/mount.h>
74#include <linux/writeback.h>
75
76STATIC struct quotactl_ops linvfs_qops;
77STATIC struct super_operations linvfs_sops;
78STATIC kmem_zone_t *linvfs_inode_zone;
79
80STATIC struct xfs_mount_args *
81xfs_args_allocate(
82 struct super_block *sb)
83{
84 struct xfs_mount_args *args;
85
86 args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
87 args->logbufs = args->logbufsize = -1;
88 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
89
90 /* Copy the already-parsed mount(2) flags we're interested in */
91 if (sb->s_flags & MS_NOATIME)
92 args->flags |= XFSMNT_NOATIME;
93 if (sb->s_flags & MS_DIRSYNC)
94 args->flags |= XFSMNT_DIRSYNC;
95 if (sb->s_flags & MS_SYNCHRONOUS)
96 args->flags |= XFSMNT_WSYNC;
97
98 /* Default to 32 bit inodes on Linux all the time */
99 args->flags |= XFSMNT_32BITINODES;
100
101 return args;
102}
103
104__uint64_t
105xfs_max_file_offset(
106 unsigned int blockshift)
107{
108 unsigned int pagefactor = 1;
109 unsigned int bitshift = BITS_PER_LONG - 1;
110
111 /* Figure out maximum filesize, on Linux this can depend on
112 * the filesystem blocksize (on 32 bit platforms).
113 * __block_prepare_write does this in an [unsigned] long...
114 * page->index << (PAGE_CACHE_SHIFT - bbits)
115 * So, for page sized blocks (4K on 32 bit platforms),
116	 * this wraps at around 8TB (hence MAX_LFS_FILESIZE which is
117 * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
118 * but for smaller blocksizes it is less (bbits = log2 bsize).
119 * Note1: get_block_t takes a long (implicit cast from above)
120 * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
121 * can optionally convert the [unsigned] long from above into
122 * an [unsigned] long long.
123 */
124
125#if BITS_PER_LONG == 32
126# if defined(CONFIG_LBD)
127 ASSERT(sizeof(sector_t) == 8);
128 pagefactor = PAGE_CACHE_SIZE;
129 bitshift = BITS_PER_LONG;
130# else
131 pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
132# endif
133#endif
134
135 return (((__uint64_t)pagefactor) << bitshift) - 1;
136}
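
Working the arithmetic for the common 32-bit case without CONFIG_LBD: with 4K pages and 4K blocks, pagefactor = 4096 >> (12 - 12) = 4096 and bitshift = 31, so the limit is (4096 << 31) - 1 = 2^43 - 1, the 8TB figure from the comment; with 1K blocks pagefactor drops to 1024 and the limit to 2^41 - 1 (2TB). A standalone check of those two cases:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12		/* 4K pages, as on i386 */
#define PAGE_SIZE	(1u << PAGE_SHIFT)

/* Same computation as above, specialized to 32-bit without CONFIG_LBD. */
static uint64_t max_file_offset(unsigned blockshift)
{
	unsigned pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);

	return ((uint64_t)pagefactor << 31) - 1;
}

int main(void)
{
	printf("4K blocks: %llu (2^43 - 1, ~8TB)\n",
	       (unsigned long long)max_file_offset(12));
	printf("1K blocks: %llu (2^41 - 1, ~2TB)\n",
	       (unsigned long long)max_file_offset(10));
	return 0;
}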
137
138STATIC __inline__ void
139xfs_set_inodeops(
140 struct inode *inode)
141{
142 vnode_t *vp = LINVFS_GET_VP(inode);
143
144 if (vp->v_type == VNON) {
145 vn_mark_bad(vp);
146 } else if (S_ISREG(inode->i_mode)) {
147 inode->i_op = &linvfs_file_inode_operations;
148 inode->i_fop = &linvfs_file_operations;
149 inode->i_mapping->a_ops = &linvfs_aops;
150 } else if (S_ISDIR(inode->i_mode)) {
151 inode->i_op = &linvfs_dir_inode_operations;
152 inode->i_fop = &linvfs_dir_operations;
153 } else if (S_ISLNK(inode->i_mode)) {
154 inode->i_op = &linvfs_symlink_inode_operations;
155 if (inode->i_blocks)
156 inode->i_mapping->a_ops = &linvfs_aops;
157 } else {
158 inode->i_op = &linvfs_file_inode_operations;
159 init_special_inode(inode, inode->i_mode, inode->i_rdev);
160 }
161}
162
163STATIC __inline__ void
164xfs_revalidate_inode(
165 xfs_mount_t *mp,
166 vnode_t *vp,
167 xfs_inode_t *ip)
168{
169 struct inode *inode = LINVFS_GET_IP(vp);
170
171 inode->i_mode = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type);
172 inode->i_nlink = ip->i_d.di_nlink;
173 inode->i_uid = ip->i_d.di_uid;
174 inode->i_gid = ip->i_d.di_gid;
175 if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
176 inode->i_rdev = 0;
177 } else {
178 xfs_dev_t dev = ip->i_df.if_u2.if_rdev;
179 inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
180 }
181 inode->i_blksize = PAGE_CACHE_SIZE;
182 inode->i_generation = ip->i_d.di_gen;
183 i_size_write(inode, ip->i_d.di_size);
184 inode->i_blocks =
185 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
186 inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
187 inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
188 inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
189 inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
190 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
191 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
192 if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
193 inode->i_flags |= S_IMMUTABLE;
194 else
195 inode->i_flags &= ~S_IMMUTABLE;
196 if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
197 inode->i_flags |= S_APPEND;
198 else
199 inode->i_flags &= ~S_APPEND;
200 if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
201 inode->i_flags |= S_SYNC;
202 else
203 inode->i_flags &= ~S_SYNC;
204 if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
205 inode->i_flags |= S_NOATIME;
206 else
207 inode->i_flags &= ~S_NOATIME;
208 vp->v_flag &= ~VMODIFIED;
209}
210
211void
212xfs_initialize_vnode(
213 bhv_desc_t *bdp,
214 vnode_t *vp,
215 bhv_desc_t *inode_bhv,
216 int unlock)
217{
218 xfs_inode_t *ip = XFS_BHVTOI(inode_bhv);
219 struct inode *inode = LINVFS_GET_IP(vp);
220
221 if (!inode_bhv->bd_vobj) {
222 vp->v_vfsp = bhvtovfs(bdp);
223 bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops);
224 bhv_insert(VN_BHV_HEAD(vp), inode_bhv);
225 }
226
227 /*
228 * We need to set the ops vectors, and unlock the inode, but if
229 * we have been called during the new inode create process, it is
230 * too early to fill in the Linux inode. We will get called a
231 * second time once the inode is properly set up, and then we can
232 * finish our work.
233 */
234 if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
235 vp->v_type = IFTOVT(ip->i_d.di_mode);
236 xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
237 xfs_set_inodeops(inode);
238
239 ip->i_flags &= ~XFS_INEW;
240 barrier();
241
242 unlock_new_inode(inode);
243 }
244}
245
246int
247xfs_blkdev_get(
248 xfs_mount_t *mp,
249 const char *name,
250 struct block_device **bdevp)
251{
252 int error = 0;
253
254 *bdevp = open_bdev_excl(name, 0, mp);
255 if (IS_ERR(*bdevp)) {
256 error = PTR_ERR(*bdevp);
257 printk("XFS: Invalid device [%s], error=%d\n", name, error);
258 }
259
260 return -error;
261}
262
263void
264xfs_blkdev_put(
265 struct block_device *bdev)
266{
267 if (bdev)
268 close_bdev_excl(bdev);
269}
270
271
272STATIC struct inode *
273linvfs_alloc_inode(
274 struct super_block *sb)
275{
276 vnode_t *vp;
277
278 vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone,
279 kmem_flags_convert(KM_SLEEP));
280 if (!vp)
281 return NULL;
282 return LINVFS_GET_IP(vp);
283}
284
285STATIC void
286linvfs_destroy_inode(
287 struct inode *inode)
288{
289 kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode));
290}
291
292STATIC void
293init_once(
294 void *data,
295 kmem_cache_t *cachep,
296 unsigned long flags)
297{
298 vnode_t *vp = (vnode_t *)data;
299
300 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
301 SLAB_CTOR_CONSTRUCTOR)
302 inode_init_once(LINVFS_GET_IP(vp));
303}
304
305STATIC int
306init_inodecache( void )
307{
308 linvfs_inode_zone = kmem_cache_create("linvfs_icache",
309 sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
310 init_once, NULL);
311 if (linvfs_inode_zone == NULL)
312 return -ENOMEM;
313 return 0;
314}
315
316STATIC void
317destroy_inodecache( void )
318{
319 if (kmem_cache_destroy(linvfs_inode_zone))
320 printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__);
321}
322
323/*
324 * Attempt to flush the inode; this will actually fail
325 * if the inode is pinned, but we dirty the inode again
326 * at the point when it is unpinned after a log write,
327 * since this is when the inode itself becomes flushable.
328 */
329STATIC int
330linvfs_write_inode(
331 struct inode *inode,
332 int sync)
333{
334 vnode_t *vp = LINVFS_GET_VP(inode);
335 int error = 0, flags = FLUSH_INODE;
336
337 if (vp) {
338 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
339 if (sync)
340 flags |= FLUSH_SYNC;
341 VOP_IFLUSH(vp, flags, error);
342 if (error == EAGAIN) {
343 if (sync)
344 VOP_IFLUSH(vp, flags | FLUSH_LOG, error);
345 else
346 error = 0;
347 }
348 }
349
350 return -error;
351}
352
353STATIC void
354linvfs_clear_inode(
355 struct inode *inode)
356{
357 vnode_t *vp = LINVFS_GET_VP(inode);
358
359 if (vp) {
360 vn_rele(vp);
361 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
362 /*
363 * Do all our cleanup, and remove this vnode.
364 */
365 vn_remove(vp);
366 }
367}
368
369
370/*
371 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
372 * Doing this has two advantages:
373 * - It saves on stack space, which is tight in certain situations
374 * - It can be used (with care) as a mechanism to avoid deadlocks.
375 * Flushing while allocating in a full filesystem requires both.
376 */
377STATIC void
378xfs_syncd_queue_work(
379 struct vfs *vfs,
380 void *data,
381 void (*syncer)(vfs_t *, void *))
382{
383 vfs_sync_work_t *work;
384
385 work = kmem_alloc(sizeof(struct vfs_sync_work), KM_SLEEP);
386 INIT_LIST_HEAD(&work->w_list);
387 work->w_syncer = syncer;
388 work->w_data = data;
389 work->w_vfs = vfs;
390 spin_lock(&vfs->vfs_sync_lock);
391 list_add_tail(&work->w_list, &vfs->vfs_sync_list);
392 spin_unlock(&vfs->vfs_sync_lock);
393 wake_up_process(vfs->vfs_sync_task);
394}
395
396/*
397 * Flush delayed allocate data, attempting to free up reserved space
398 * from existing allocations. At this point a new allocation attempt
399 * has failed with ENOSPC and we are in the process of scratching our
400 * heads, looking about for more room...
401 */
402STATIC void
403xfs_flush_inode_work(
404 vfs_t *vfs,
405 void *inode)
406{
407 filemap_flush(((struct inode *)inode)->i_mapping);
408 iput((struct inode *)inode);
409}
410
411void
412xfs_flush_inode(
413 xfs_inode_t *ip)
414{
415 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
416 struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
417
418 igrab(inode);
419 xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work);
420 delay(HZ/2);
421}
422
423/*
424 * This is the "bigger hammer" version of xfs_flush_inode_work...
425 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
426 */
427STATIC void
428xfs_flush_device_work(
429 vfs_t *vfs,
430 void *inode)
431{
432 sync_blockdev(vfs->vfs_super->s_bdev);
433 iput((struct inode *)inode);
434}
435
436void
437xfs_flush_device(
438 xfs_inode_t *ip)
439{
440 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
441 struct vfs *vfs = XFS_MTOVFS(ip->i_mount);
442
443 igrab(inode);
444 xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work);
445 delay(HZ/2);
446 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
447}
448
449#define SYNCD_FLAGS (SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR)
450STATIC void
451vfs_sync_worker(
452 vfs_t *vfsp,
453 void *unused)
454{
455 int error;
456
457 if (!(vfsp->vfs_flag & VFS_RDONLY))
458 VFS_SYNC(vfsp, SYNCD_FLAGS, NULL, error);
459 vfsp->vfs_sync_seq++;
460 wmb();
461 wake_up(&vfsp->vfs_wait_single_sync_task);
462}
463
464STATIC int
465xfssyncd(
466 void *arg)
467{
468 long timeleft;
469 vfs_t *vfsp = (vfs_t *) arg;
470 struct list_head tmp;
471 struct vfs_sync_work *work, *n;
472
473 daemonize("xfssyncd");
474
475 vfsp->vfs_sync_work.w_vfs = vfsp;
476 vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
477 vfsp->vfs_sync_task = current;
478 wmb();
479 wake_up(&vfsp->vfs_wait_sync_task);
480
481 INIT_LIST_HEAD(&tmp);
482 timeleft = (xfs_syncd_centisecs * HZ) / 100;
483 for (;;) {
484 set_current_state(TASK_INTERRUPTIBLE);
485 timeleft = schedule_timeout(timeleft);
486 /* swsusp */
487 try_to_freeze(PF_FREEZE);
488 if (vfsp->vfs_flag & VFS_UMOUNT)
489 break;
490
491 spin_lock(&vfsp->vfs_sync_lock);
492 /*
493 * We can get woken by laptop mode, to do a sync -
494 * that's the (only!) case where the list would be
495 * empty with time remaining.
496 */
497 if (!timeleft || list_empty(&vfsp->vfs_sync_list)) {
498 if (!timeleft)
499 timeleft = (xfs_syncd_centisecs * HZ) / 100;
500 INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list);
501 list_add_tail(&vfsp->vfs_sync_work.w_list,
502 &vfsp->vfs_sync_list);
503 }
504 list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list)
505 list_move(&work->w_list, &tmp);
506 spin_unlock(&vfsp->vfs_sync_lock);
507
508 list_for_each_entry_safe(work, n, &tmp, w_list) {
509 (*work->w_syncer)(vfsp, work->w_data);
510 list_del(&work->w_list);
511 if (work == &vfsp->vfs_sync_work)
512 continue;
513 kmem_free(work, sizeof(struct vfs_sync_work));
514 }
515 }
516
517 vfsp->vfs_sync_task = NULL;
518 wmb();
519 wake_up(&vfsp->vfs_wait_sync_task);
520
521 return 0;
522}
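
The queue/drain shape above is a deliberate two-step: producers append under vfs_sync_lock and wake the thread, and the thread splices the whole list onto a private one before dropping the lock, so no work callback ever runs with the lock held. A minimal single-threaded sketch of the detach-then-run pattern (list handling simplified to LIFO; the real code uses list_move and preserves FIFO order):

#include <stdio.h>
#include <stdlib.h>

struct work {
	struct work *next;
	void (*fn)(void *);
	void *data;
};

static struct work *pending;	/* guarded by a spinlock in the real code */

/* Producer side: what xfs_syncd_queue_work does (minus locking/wakeup). */
static void queue_work(void (*fn)(void *), void *data)
{
	struct work *w = malloc(sizeof(*w));	/* unchecked: sketch only */
	w->fn = fn; w->data = data;
	w->next = pending; pending = w;
}

static void say(void *msg) { puts(msg); }

int main(void)
{
	queue_work(say, "flush inode");
	queue_work(say, "flush device");

	/* Consumer side: detach the whole list first ("splice to tmp"),
	 * so new work could be queued while these items run. */
	struct work *todo = pending;
	pending = NULL;
	while (todo) {
		struct work *w = todo;
		todo = w->next;
		w->fn(w->data);
		free(w);
	}
	return 0;
}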
523
524STATIC int
525linvfs_start_syncd(
526 vfs_t *vfsp)
527{
528 int pid;
529
530 pid = kernel_thread(xfssyncd, (void *) vfsp,
531 CLONE_VM | CLONE_FS | CLONE_FILES);
532 if (pid < 0)
533 return -pid;
534 wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task);
535 return 0;
536}
537
538STATIC void
539linvfs_stop_syncd(
540 vfs_t *vfsp)
541{
542 vfsp->vfs_flag |= VFS_UMOUNT;
543 wmb();
544
545 wake_up_process(vfsp->vfs_sync_task);
546 wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task);
547}
548
549STATIC void
550linvfs_put_super(
551 struct super_block *sb)
552{
553 vfs_t *vfsp = LINVFS_GET_VFS(sb);
554 int error;
555
556 linvfs_stop_syncd(vfsp);
557 VFS_SYNC(vfsp, SYNC_ATTR|SYNC_DELWRI, NULL, error);
558 if (!error)
559 VFS_UNMOUNT(vfsp, 0, NULL, error);
560 if (error) {
561 printk("XFS unmount got error %d\n", error);
562 printk("%s: vfsp/0x%p left dangling!\n", __FUNCTION__, vfsp);
563 return;
564 }
565
566 vfs_deallocate(vfsp);
567}
568
569STATIC void
570linvfs_write_super(
571 struct super_block *sb)
572{
573 vfs_t *vfsp = LINVFS_GET_VFS(sb);
574 int error;
575
576 if (sb->s_flags & MS_RDONLY) {
577 sb->s_dirt = 0; /* paranoia */
578 return;
579 }
580 /* Push the log and superblock a little */
581 VFS_SYNC(vfsp, SYNC_FSDATA, NULL, error);
582 sb->s_dirt = 0;
583}
584
585STATIC int
586linvfs_sync_super(
587 struct super_block *sb,
588 int wait)
589{
590 vfs_t *vfsp = LINVFS_GET_VFS(sb);
591 int error;
592 int flags = SYNC_FSDATA;
593
594 if (wait)
595 flags |= SYNC_WAIT;
596
597 VFS_SYNC(vfsp, flags, NULL, error);
598 sb->s_dirt = 0;
599
600 if (unlikely(laptop_mode)) {
601 int prev_sync_seq = vfsp->vfs_sync_seq;
602
603 /*
604 * The disk must be active because we're syncing.
605 * We schedule xfssyncd now (now that the disk is
606 * active) instead of later (when it might not be).
607 */
608 wake_up_process(vfsp->vfs_sync_task);
609 /*
610 * We have to wait for the sync iteration to complete.
611 * If we don't, the disk activity caused by the sync
612 * will come after the sync is completed, and that
613 * triggers another sync from laptop mode.
614 */
615 wait_event(vfsp->vfs_wait_single_sync_task,
616 vfsp->vfs_sync_seq != prev_sync_seq);
617 }
618
619 return -error;
620}
621
622STATIC int
623linvfs_statfs(
624 struct super_block *sb,
625 struct kstatfs *statp)
626{
627 vfs_t *vfsp = LINVFS_GET_VFS(sb);
628 int error;
629
630 VFS_STATVFS(vfsp, statp, NULL, error);
631 return -error;
632}
633
634STATIC int
635linvfs_remount(
636 struct super_block *sb,
637 int *flags,
638 char *options)
639{
640 vfs_t *vfsp = LINVFS_GET_VFS(sb);
641 struct xfs_mount_args *args = xfs_args_allocate(sb);
642 int error;
643
644 VFS_PARSEARGS(vfsp, options, args, 1, error);
645 if (!error)
646 VFS_MNTUPDATE(vfsp, flags, args, error);
647 kmem_free(args, sizeof(*args));
648 return -error;
649}
650
651STATIC void
652linvfs_freeze_fs(
653 struct super_block *sb)
654{
655 VFS_FREEZE(LINVFS_GET_VFS(sb));
656}
657
658STATIC int
659linvfs_show_options(
660 struct seq_file *m,
661 struct vfsmount *mnt)
662{
663 struct vfs *vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
664 int error;
665
666 VFS_SHOWARGS(vfsp, m, error);
667 return error;
668}
669
670STATIC int
671linvfs_getxstate(
672 struct super_block *sb,
673 struct fs_quota_stat *fqs)
674{
675 struct vfs *vfsp = LINVFS_GET_VFS(sb);
676 int error;
677
678 VFS_QUOTACTL(vfsp, Q_XGETQSTAT, 0, (caddr_t)fqs, error);
679 return -error;
680}
681
682STATIC int
683linvfs_setxstate(
684 struct super_block *sb,
685 unsigned int flags,
686 int op)
687{
688 struct vfs *vfsp = LINVFS_GET_VFS(sb);
689 int error;
690
691 VFS_QUOTACTL(vfsp, op, 0, (caddr_t)&flags, error);
692 return -error;
693}
694
695STATIC int
696linvfs_getxquota(
697 struct super_block *sb,
698 int type,
699 qid_t id,
700 struct fs_disk_quota *fdq)
701{
702 struct vfs *vfsp = LINVFS_GET_VFS(sb);
703 int error, getmode;
704
705 getmode = (type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETQUOTA;
706 VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error);
707 return -error;
708}
709
710STATIC int
711linvfs_setxquota(
712 struct super_block *sb,
713 int type,
714 qid_t id,
715 struct fs_disk_quota *fdq)
716{
717 struct vfs *vfsp = LINVFS_GET_VFS(sb);
718 int error, setmode;
719
720 setmode = (type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETQLIM;
721 VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error);
722 return -error;
723}
724
725STATIC int
726linvfs_fill_super(
727 struct super_block *sb,
728 void *data,
729 int silent)
730{
731 vnode_t *rootvp;
732 struct vfs *vfsp = vfs_allocate();
733 struct xfs_mount_args *args = xfs_args_allocate(sb);
734 struct kstatfs statvfs;
735 int error, error2;
736
737 vfsp->vfs_super = sb;
738 LINVFS_SET_VFS(sb, vfsp);
739 if (sb->s_flags & MS_RDONLY)
740 vfsp->vfs_flag |= VFS_RDONLY;
741 bhv_insert_all_vfsops(vfsp);
742
743 VFS_PARSEARGS(vfsp, (char *)data, args, 0, error);
744 if (error) {
745 bhv_remove_all_vfsops(vfsp, 1);
746 goto fail_vfsop;
747 }
748
749 sb_min_blocksize(sb, BBSIZE);
750#ifdef CONFIG_XFS_EXPORT
751 sb->s_export_op = &linvfs_export_ops;
752#endif
753 sb->s_qcop = &linvfs_qops;
754 sb->s_op = &linvfs_sops;
755
756 VFS_MOUNT(vfsp, args, NULL, error);
757 if (error) {
758 bhv_remove_all_vfsops(vfsp, 1);
759 goto fail_vfsop;
760 }
761
762 VFS_STATVFS(vfsp, &statvfs, NULL, error);
763 if (error)
764 goto fail_unmount;
765
766 sb->s_dirt = 1;
767 sb->s_magic = statvfs.f_type;
768 sb->s_blocksize = statvfs.f_bsize;
769 sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
770 sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
771 sb->s_time_gran = 1;
772 set_posix_acl_flag(sb);
773
774 VFS_ROOT(vfsp, &rootvp, error);
775 if (error)
776 goto fail_unmount;
777
778 sb->s_root = d_alloc_root(LINVFS_GET_IP(rootvp));
779 if (!sb->s_root) {
780 error = ENOMEM;
781 goto fail_vnrele;
782 }
783 if (is_bad_inode(sb->s_root->d_inode)) {
784 error = EINVAL;
785 goto fail_vnrele;
786 }
787 if ((error = linvfs_start_syncd(vfsp)))
788 goto fail_vnrele;
789 vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address);
790
791 kmem_free(args, sizeof(*args));
792 return 0;
793
794fail_vnrele:
795 if (sb->s_root) {
796 dput(sb->s_root);
797 sb->s_root = NULL;
798 } else {
799 VN_RELE(rootvp);
800 }
801
802fail_unmount:
803 VFS_UNMOUNT(vfsp, 0, NULL, error2);
804
805fail_vfsop:
806 vfs_deallocate(vfsp);
807 kmem_free(args, sizeof(*args));
808 return -error;
809}
810
811STATIC struct super_block *
812linvfs_get_sb(
813 struct file_system_type *fs_type,
814 int flags,
815 const char *dev_name,
816 void *data)
817{
818 return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
819}
820
821STATIC struct super_operations linvfs_sops = {
822 .alloc_inode = linvfs_alloc_inode,
823 .destroy_inode = linvfs_destroy_inode,
824 .write_inode = linvfs_write_inode,
825 .clear_inode = linvfs_clear_inode,
826 .put_super = linvfs_put_super,
827 .write_super = linvfs_write_super,
828 .sync_fs = linvfs_sync_super,
829 .write_super_lockfs = linvfs_freeze_fs,
830 .statfs = linvfs_statfs,
831 .remount_fs = linvfs_remount,
832 .show_options = linvfs_show_options,
833};
834
835STATIC struct quotactl_ops linvfs_qops = {
836 .get_xstate = linvfs_getxstate,
837 .set_xstate = linvfs_setxstate,
838 .get_xquota = linvfs_getxquota,
839 .set_xquota = linvfs_setxquota,
840};
841
842STATIC struct file_system_type xfs_fs_type = {
843 .owner = THIS_MODULE,
844 .name = "xfs",
845 .get_sb = linvfs_get_sb,
846 .kill_sb = kill_block_super,
847 .fs_flags = FS_REQUIRES_DEV,
848};
849
850
851STATIC int __init
852init_xfs_fs( void )
853{
854 int error;
855 struct sysinfo si;
856 static char message[] __initdata = KERN_INFO \
857 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
858
859 printk(message);
860
861 si_meminfo(&si);
862 xfs_physmem = si.totalram;
863
864 ktrace_init(64);
865
866 error = init_inodecache();
867 if (error < 0)
868 goto undo_inodecache;
869
870 error = pagebuf_init();
871 if (error < 0)
872 goto undo_pagebuf;
873
874 vn_init();
875 xfs_init();
876 uuid_init();
877 vfs_initquota();
878
879 error = register_filesystem(&xfs_fs_type);
880 if (error)
881 goto undo_register;
882 XFS_DM_INIT(&xfs_fs_type);
883 return 0;
884
885undo_register:
886 pagebuf_terminate();
887
888undo_pagebuf:
889 destroy_inodecache();
890
891undo_inodecache:
892 return error;
893}
894
895STATIC void __exit
896exit_xfs_fs( void )
897{
898 vfs_exitquota();
899 XFS_DM_EXIT(&xfs_fs_type);
900 unregister_filesystem(&xfs_fs_type);
901 xfs_cleanup();
902 pagebuf_terminate();
903 destroy_inodecache();
904 ktrace_uninit();
905}
906
907module_init(init_xfs_fs);
908module_exit(exit_xfs_fs);
909
910MODULE_AUTHOR("Silicon Graphics, Inc.");
911MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
912MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
new file mode 100644
index 000000000000..ec7e0035c731
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -0,0 +1,138 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPER_H__
33#define __XFS_SUPER_H__
34
35#ifdef CONFIG_XFS_DMAPI
36# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
37# define vfs_initdmapi() dmapi_init()
38# define vfs_exitdmapi() dmapi_uninit()
39#else
40# define vfs_insertdmapi(vfs) do { } while (0)
41# define vfs_initdmapi() do { } while (0)
42# define vfs_exitdmapi() do { } while (0)
43#endif
44
45#ifdef CONFIG_XFS_QUOTA
46# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
47extern void xfs_qm_init(void);
48extern void xfs_qm_exit(void);
49# define vfs_initquota() xfs_qm_init()
50# define vfs_exitquota() xfs_qm_exit()
51#else
52# define vfs_insertquota(vfs) do { } while (0)
53# define vfs_initquota() do { } while (0)
54# define vfs_exitquota() do { } while (0)
55#endif
56
57#ifdef CONFIG_XFS_POSIX_ACL
58# define XFS_ACL_STRING "ACLs, "
59# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL)
60#else
61# define XFS_ACL_STRING
62# define set_posix_acl_flag(sb) do { } while (0)
63#endif
64
65#ifdef CONFIG_XFS_SECURITY
66# define XFS_SECURITY_STRING "security attributes, "
67# define ENOSECURITY 0
68#else
69# define XFS_SECURITY_STRING
70# define ENOSECURITY EOPNOTSUPP
71#endif
72
73#ifdef CONFIG_XFS_RT
74# define XFS_REALTIME_STRING "realtime, "
75#else
76# define XFS_REALTIME_STRING
77#endif
78
79#if XFS_BIG_BLKNOS
80# if XFS_BIG_INUMS
81# define XFS_BIGFS_STRING "large block/inode numbers, "
82# else
83# define XFS_BIGFS_STRING "large block numbers, "
84# endif
85#else
86# define XFS_BIGFS_STRING
87#endif
88
89#ifdef CONFIG_XFS_TRACE
90# define XFS_TRACE_STRING "tracing, "
91#else
92# define XFS_TRACE_STRING
93#endif
94
95#ifdef CONFIG_XFS_DMAPI
96# define XFS_DMAPI_STRING "dmapi support, "
97#else
98# define XFS_DMAPI_STRING
99#endif
100
101#ifdef DEBUG
102# define XFS_DBG_STRING "debug"
103#else
104# define XFS_DBG_STRING "no debug"
105#endif
106
107#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
108 XFS_SECURITY_STRING \
109 XFS_REALTIME_STRING \
110 XFS_BIGFS_STRING \
111 XFS_TRACE_STRING \
112 XFS_DMAPI_STRING \
113 XFS_DBG_STRING /* DBG must be last */
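
Each option macro contributes a comma-terminated fragment (or nothing), and C's adjacent-string-literal concatenation glues the survivors into one banner string. For example, with only ACLs and DEBUG configured, XFS_BUILD_OPTIONS reduces to "ACLs, debug". A standalone demonstration of the mechanism:

#include <stdio.h>

#define ACL_STRING	"ACLs, "
#define RT_STRING	/* empty when the feature is compiled out */
#define DBG_STRING	"debug"
#define BUILD_OPTIONS	ACL_STRING RT_STRING DBG_STRING

int main(void)
{
	printf("%s\n", BUILD_OPTIONS);	/* prints: ACLs, debug */
	return 0;
}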
114
115#define LINVFS_GET_VFS(s) \
116 (vfs_t *)((s)->s_fs_info)
117#define LINVFS_SET_VFS(s, vfsp) \
118 ((s)->s_fs_info = vfsp)
119
120struct xfs_inode;
121struct xfs_mount;
122struct xfs_buftarg;
123struct block_device;
124
125extern __uint64_t xfs_max_file_offset(unsigned int);
126
127extern void xfs_initialize_vnode(bhv_desc_t *, vnode_t *, bhv_desc_t *, int);
128
129extern void xfs_flush_inode(struct xfs_inode *);
130extern void xfs_flush_device(struct xfs_inode *);
131
132extern int xfs_blkdev_get(struct xfs_mount *, const char *,
133 struct block_device **);
134extern void xfs_blkdev_put(struct block_device *);
135
136extern struct export_operations linvfs_export_ops;
137
138#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
new file mode 100644
index 000000000000..0dc010356f4d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -0,0 +1,174 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_rw.h"
35#include <linux/sysctl.h>
36#include <linux/proc_fs.h>
37
38
39static struct ctl_table_header *xfs_table_header;
40
41
42#ifdef CONFIG_PROC_FS
43STATIC int
44xfs_stats_clear_proc_handler(
45 ctl_table *ctl,
46 int write,
47 struct file *filp,
48 void __user *buffer,
49 size_t *lenp,
50 loff_t *ppos)
51{
52 int c, ret, *valp = ctl->data;
53 __uint32_t vn_active;
54
55 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos);
56
57 if (!ret && write && *valp) {
 58		printk("XFS: Clearing xfsstats\n");
59 for (c = 0; c < NR_CPUS; c++) {
60 if (!cpu_possible(c)) continue;
61 preempt_disable();
62 /* save vn_active, it's a universal truth! */
63 vn_active = per_cpu(xfsstats, c).vn_active;
64 memset(&per_cpu(xfsstats, c), 0,
65 sizeof(struct xfsstats));
66 per_cpu(xfsstats, c).vn_active = vn_active;
67 preempt_enable();
68 }
69 xfs_stats_clear = 0;
70 }
71
72 return ret;
73}
74#endif /* CONFIG_PROC_FS */
75
76STATIC ctl_table xfs_table[] = {
77 {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val,
78 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
79 &sysctl_intvec, NULL,
80 &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max},
81
82 {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val,
83 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
84 &sysctl_intvec, NULL,
85 &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max},
86
87 {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val,
88 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
89 &sysctl_intvec, NULL,
90 &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max},
91
92 {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val,
93 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
94 &sysctl_intvec, NULL,
95 &xfs_params.panic_mask.min, &xfs_params.panic_mask.max},
96
97 {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val,
98 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
99 &sysctl_intvec, NULL,
100 &xfs_params.error_level.min, &xfs_params.error_level.max},
101
102 {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val,
103 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
104 &sysctl_intvec, NULL,
105 &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max},
106
107 {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val,
108 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
109 &sysctl_intvec, NULL,
110 &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max},
111
112 {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val,
113 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
114 &sysctl_intvec, NULL,
115 &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max},
116
117 {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val,
118 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
119 &sysctl_intvec, NULL,
120 &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max},
121
122 {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val,
123 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
124 &sysctl_intvec, NULL,
125 &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max},
126
127 {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val,
128 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
129 &sysctl_intvec, NULL,
130 &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max},
131
132 {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val,
133 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
134 &sysctl_intvec, NULL,
135 &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max},
136
137 {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val,
138 sizeof(int), 0644, NULL, &proc_dointvec_minmax,
139 &sysctl_intvec, NULL,
140 &xfs_params.rotorstep.min, &xfs_params.rotorstep.max},
141
142 /* please keep this the last entry */
143#ifdef CONFIG_PROC_FS
144 {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val,
145 sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler,
146 &sysctl_intvec, NULL,
147 &xfs_params.stats_clear.min, &xfs_params.stats_clear.max},
148#endif /* CONFIG_PROC_FS */
149
150 {0}
151};
152
153STATIC ctl_table xfs_dir_table[] = {
154 {FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
155 {0}
156};
157
158STATIC ctl_table xfs_root_table[] = {
159 {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table},
160 {0}
161};
162
163void
164xfs_sysctl_register(void)
165{
166 xfs_table_header = register_sysctl_table(xfs_root_table, 1);
167}
168
169void
170xfs_sysctl_unregister(void)
171{
172 if (xfs_table_header)
173 unregister_sysctl_table(xfs_table_header);
174}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
new file mode 100644
index 000000000000..a39a95020a58
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -0,0 +1,114 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#ifndef __XFS_SYSCTL_H__
34#define __XFS_SYSCTL_H__
35
36#include <linux/sysctl.h>
37
38/*
39 * Tunable xfs parameters
40 */
41
42typedef struct xfs_sysctl_val {
43 int min;
44 int val;
45 int max;
46} xfs_sysctl_val_t;
47
48typedef struct xfs_param {
49 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
50 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
51 * not a member of parent dir GID. */
52 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
53 xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
54 xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
55 xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
56 xfs_sysctl_val_t stats_clear; /* Reset all XFS statistics to zero. */
57 xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
58 xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
59 xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
60 xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
61 xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
62 xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
63 xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
64} xfs_param_t;
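
Each tunable is a {min, val, max} triple; the sysctl table in xfs_sysctl.c points proc_dointvec_minmax at the val member with the neighboring min/max as bounds, so out-of-range writes are refused and val never leaves its range. A userspace sketch of that contract (the refusal semantics are paraphrased, not lifted from the kernel handler):

#include <stdio.h>
#include <stdbool.h>

typedef struct { int min, val, max; } sysctl_val;

/* Same contract as wiring proc_dointvec_minmax at &t->val with
 * &t->min / &t->max as bounds: out-of-range writes are refused. */
static bool sysctl_write(sysctl_val *t, int newval)
{
	if (newval < t->min || newval > t->max)
		return false;
	t->val = newval;
	return true;
}

int main(void)
{
	sysctl_val error_level = { 0, 3, 11 };

	printf("write 5:  %s (val=%d)\n",
	       sysctl_write(&error_level, 5) ? "ok" : "refused",
	       error_level.val);
	printf("write 99: %s (val=%d)\n",
	       sysctl_write(&error_level, 99) ? "ok" : "refused",
	       error_level.val);
	return 0;
}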
65
66/*
67 * xfs_error_level:
68 *
69 * How much error reporting will be done when internal problems are
70 * encountered. These problems normally return an EFSCORRUPTED to their
71 * caller, with no other information reported.
72 *
73 * 0 No error reports
74 * 1 Report EFSCORRUPTED errors that will cause a filesystem shutdown
75 * 5 Report all EFSCORRUPTED errors (all of the above errors, plus any
76 * additional errors that are known to not cause shutdowns)
77 *
78 * xfs_panic_mask bit 0x8 turns the error reports into panics
79 */
80
81enum {
82 /* XFS_REFCACHE_SIZE = 1 */
83 /* XFS_REFCACHE_PURGE = 2 */
84 XFS_RESTRICT_CHOWN = 3,
85 XFS_SGID_INHERIT = 4,
86 XFS_SYMLINK_MODE = 5,
87 XFS_PANIC_MASK = 6,
88 XFS_ERRLEVEL = 7,
89 XFS_SYNCD_TIMER = 8,
90 /* XFS_PROBE_DMAPI = 9 */
91 /* XFS_PROBE_IOOPS = 10 */
92 /* XFS_PROBE_QUOTA = 11 */
93 XFS_STATS_CLEAR = 12,
94 XFS_INHERIT_SYNC = 13,
95 XFS_INHERIT_NODUMP = 14,
96 XFS_INHERIT_NOATIME = 15,
97 XFS_BUF_TIMER = 16,
98 XFS_BUF_AGE = 17,
99 /* XFS_IO_BYPASS = 18 */
100 XFS_INHERIT_NOSYM = 19,
101 XFS_ROTORSTEP = 20,
102};
103
104extern xfs_param_t xfs_params;
105
106#ifdef CONFIG_SYSCTL
107extern void xfs_sysctl_register(void);
108extern void xfs_sysctl_unregister(void);
109#else
110# define xfs_sysctl_register() do { } while (0)
111# define xfs_sysctl_unregister() do { } while (0)
112#endif /* CONFIG_SYSCTL */
113
114#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h
new file mode 100644
index 000000000000..96f96394417e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_version.h
@@ -0,0 +1,44 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Dummy file that can contain a timestamp to put into the
35 * XFS init string, to help users keep track of what they're
36 * running
37 */
38
39#ifndef __XFS_VERSION_H__
40#define __XFS_VERSION_H__
41
42#define XFS_VERSION_STRING "SGI XFS"
43
44#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
new file mode 100644
index 000000000000..669c61644959
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -0,0 +1,330 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_macros.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_clnt.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_imap.h"
45#include "xfs_alloc.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_quota.h"
49
50int
51vfs_mount(
52 struct bhv_desc *bdp,
53 struct xfs_mount_args *args,
54 struct cred *cr)
55{
56 struct bhv_desc *next = bdp;
57
58 ASSERT(next);
59 while (! (bhvtovfsops(next))->vfs_mount)
60 next = BHV_NEXT(next);
61 return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr));
62}
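
Every wrapper in this file is the same behavior-chain walk: follow BHV_NEXT past descriptors whose ops table leaves the slot NULL and dispatch at the first one that implements it, which is what lets the quota and DMAPI layers stack above the core vfsops. A self-contained mock of the dispatch (all names invented):

#include <stdio.h>
#include <stddef.h>

struct ops { int (*mount)(void); };
struct bhv { const struct ops *ops; struct bhv *next; };

static int core_mount(void) { puts("core mount"); return 0; }

/* The same walk as vfs_mount() above: skip descriptors whose ops
 * table leaves the slot NULL, dispatch at the first that fills it. */
static int chain_mount(struct bhv *bdp)
{
	while (!bdp->ops->mount)
		bdp = bdp->next;
	return bdp->ops->mount();
}

int main(void)
{
	static const struct ops quota_ops = { NULL };	  /* doesn't implement mount */
	static const struct ops core_ops  = { core_mount };
	struct bhv core  = { &core_ops,  NULL };
	struct bhv quota = { &quota_ops, &core };	  /* stacked on top */

	return chain_mount(&quota);
}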
63
64int
65vfs_parseargs(
66 struct bhv_desc *bdp,
67 char *s,
68 struct xfs_mount_args *args,
69 int f)
70{
71 struct bhv_desc *next = bdp;
72
73 ASSERT(next);
74 while (! (bhvtovfsops(next))->vfs_parseargs)
75 next = BHV_NEXT(next);
76 return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f));
77}
78
79int
80vfs_showargs(
81 struct bhv_desc *bdp,
82 struct seq_file *m)
83{
84 struct bhv_desc *next = bdp;
85
86 ASSERT(next);
87 while (! (bhvtovfsops(next))->vfs_showargs)
88 next = BHV_NEXT(next);
89 return ((*bhvtovfsops(next)->vfs_showargs)(next, m));
90}
91
92int
93vfs_unmount(
94 struct bhv_desc *bdp,
95 int fl,
96 struct cred *cr)
97{
98 struct bhv_desc *next = bdp;
99
100 ASSERT(next);
101 while (! (bhvtovfsops(next))->vfs_unmount)
102 next = BHV_NEXT(next);
103 return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr));
104}
105
106int
107vfs_mntupdate(
108 struct bhv_desc *bdp,
109 int *fl,
110 struct xfs_mount_args *args)
111{
112 struct bhv_desc *next = bdp;
113
114 ASSERT(next);
115 while (! (bhvtovfsops(next))->vfs_mntupdate)
116 next = BHV_NEXT(next);
117 return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args));
118}
119
120int
121vfs_root(
122 struct bhv_desc *bdp,
123 struct vnode **vpp)
124{
125 struct bhv_desc *next = bdp;
126
127 ASSERT(next);
128 while (! (bhvtovfsops(next))->vfs_root)
129 next = BHV_NEXT(next);
130 return ((*bhvtovfsops(next)->vfs_root)(next, vpp));
131}
132
133int
134vfs_statvfs(
135 struct bhv_desc *bdp,
136 xfs_statfs_t *sp,
137 struct vnode *vp)
138{
139 struct bhv_desc *next = bdp;
140
141 ASSERT(next);
142 while (! (bhvtovfsops(next))->vfs_statvfs)
143 next = BHV_NEXT(next);
144 return ((*bhvtovfsops(next)->vfs_statvfs)(next, sp, vp));
145}
146
147int
148vfs_sync(
149 struct bhv_desc *bdp,
150 int fl,
151 struct cred *cr)
152{
153 struct bhv_desc *next = bdp;
154
155 ASSERT(next);
156 while (! (bhvtovfsops(next))->vfs_sync)
157 next = BHV_NEXT(next);
158 return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr));
159}
160
161int
162vfs_vget(
163 struct bhv_desc *bdp,
164 struct vnode **vpp,
165 struct fid *fidp)
166{
167 struct bhv_desc *next = bdp;
168
169 ASSERT(next);
170 while (! (bhvtovfsops(next))->vfs_vget)
171 next = BHV_NEXT(next);
172 return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp));
173}
174
175int
176vfs_dmapiops(
177 struct bhv_desc *bdp,
178 caddr_t addr)
179{
180 struct bhv_desc *next = bdp;
181
182 ASSERT(next);
183 while (! (bhvtovfsops(next))->vfs_dmapiops)
184 next = BHV_NEXT(next);
185 return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr));
186}
187
188int
189vfs_quotactl(
190 struct bhv_desc *bdp,
191 int cmd,
192 int id,
193 caddr_t addr)
194{
195 struct bhv_desc *next = bdp;
196
197 ASSERT(next);
198 while (! (bhvtovfsops(next))->vfs_quotactl)
199 next = BHV_NEXT(next);
200 return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr));
201}
202
203void
204vfs_init_vnode(
205 struct bhv_desc *bdp,
206 struct vnode *vp,
207 struct bhv_desc *bp,
208 int unlock)
209{
210 struct bhv_desc *next = bdp;
211
212 ASSERT(next);
213 while (! (bhvtovfsops(next))->vfs_init_vnode)
214 next = BHV_NEXT(next);
215 ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock));
216}
217
218void
219vfs_force_shutdown(
220 struct bhv_desc *bdp,
221 int fl,
222 char *file,
223 int line)
224{
225 struct bhv_desc *next = bdp;
226
227 ASSERT(next);
228 while (! (bhvtovfsops(next))->vfs_force_shutdown)
229 next = BHV_NEXT(next);
230 ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line));
231}
232
233void
234vfs_freeze(
235 struct bhv_desc *bdp)
236{
237 struct bhv_desc *next = bdp;
238
239 ASSERT(next);
240 while (! (bhvtovfsops(next))->vfs_freeze)
241 next = BHV_NEXT(next);
242 ((*bhvtovfsops(next)->vfs_freeze)(next));
243}
244
245vfs_t *
246vfs_allocate(void)
247{
248 struct vfs *vfsp;
249
250 vfsp = kmem_zalloc(sizeof(vfs_t), KM_SLEEP);
251 bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
252 INIT_LIST_HEAD(&vfsp->vfs_sync_list);
253 spin_lock_init(&vfsp->vfs_sync_lock);
254 init_waitqueue_head(&vfsp->vfs_wait_sync_task);
255 init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
256 return vfsp;
257}
258
259void
260vfs_deallocate(
261 struct vfs *vfsp)
262{
263 bhv_head_destroy(VFS_BHVHEAD(vfsp));
264 kmem_free(vfsp, sizeof(vfs_t));
265}
266
267void
268vfs_insertops(
269 struct vfs *vfsp,
270 struct bhv_vfsops *vfsops)
271{
272 struct bhv_desc *bdp;
273
274 bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP);
275 bhv_desc_init(bdp, NULL, vfsp, vfsops);
276 bhv_insert(&vfsp->vfs_bh, bdp);
277}
278
279void
280vfs_insertbhv(
281 struct vfs *vfsp,
282 struct bhv_desc *bdp,
283 struct vfsops *vfsops,
284 void *mount)
285{
286 bhv_desc_init(bdp, mount, vfsp, vfsops);
287 bhv_insert_initial(&vfsp->vfs_bh, bdp);
288}
289
290void
291bhv_remove_vfsops(
292 struct vfs *vfsp,
293 int pos)
294{
295 struct bhv_desc *bhv;
296
297 bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos);
298 if (!bhv)
299 return;
300 bhv_remove(&vfsp->vfs_bh, bhv);
301 kmem_free(bhv, sizeof(*bhv));
302}
303
304void
305bhv_remove_all_vfsops(
306 struct vfs *vfsp,
307 int freebase)
308{
309 struct xfs_mount *mp;
310
311 bhv_remove_vfsops(vfsp, VFS_POSITION_QM);
312 bhv_remove_vfsops(vfsp, VFS_POSITION_DM);
313 if (!freebase)
314 return;
315 mp = XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfsp), &xfs_vfsops));
316 VFS_REMOVEBHV(vfsp, &mp->m_bhv);
317 xfs_mount_free(mp, 0);
318}
319
320void
321bhv_insert_all_vfsops(
322 struct vfs *vfsp)
323{
324 struct xfs_mount *mp;
325
326 mp = xfs_mount_init();
327 vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
328 vfs_insertdmapi(vfsp);
329 vfs_insertquota(vfsp);
330}
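
Every vfs_* wrapper above follows one dispatch idiom: start at the first behavior descriptor, walk BHV_NEXT until a descriptor whose ops table implements the requested operation is found, then call through it. This is what lets the quota and DMAPI modules stack above the core filesystem and intercept only the operations they provide. A minimal, self-contained sketch of the idiom, using simplified stand-in types (toy_bhv and toy_ops are hypothetical, not the kernel's bhv_desc_t/vfsops_t):

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Hypothetical stand-ins for bhv_desc_t and vfsops_t. */
	struct toy_ops { int (*op_mount)(void); };
	struct toy_bhv {
		struct toy_bhv	*bd_next;	/* next behavior on the chain */
		struct toy_ops	*bd_ops;	/* this behavior's operations */
	};

	/* Same shape as vfs_mount() above: skip behaviors that do not
	 * implement the op, then call the first one that does. */
	static int toy_mount(struct toy_bhv *bdp)
	{
		struct toy_bhv *next = bdp;

		assert(next != NULL);
		while (next->bd_ops->op_mount == NULL)
			next = next->bd_next;
		return next->bd_ops->op_mount();
	}

	static int base_mount(void) { puts("mount handled by base fs"); return 0; }

	int main(void)
	{
		struct toy_ops qm_ops   = { NULL };		/* stacked layer: no mount op */
		struct toy_ops base_ops = { base_mount };	/* base fs implements it */
		struct toy_bhv base_bhv = { NULL, &base_ops };
		struct toy_bhv qm_bhv   = { &base_bhv, &qm_ops };

		return toy_mount(&qm_bhv);	/* falls through to the base */
	}

Note that, as in the wrappers above, the walk carries no NULL check past the head: the bottom behavior on the chain is assumed to implement every operation.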
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
new file mode 100644
index 000000000000..76493991578f
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -0,0 +1,223 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_VFS_H__
33#define __XFS_VFS_H__
34
35#include <linux/vfs.h>
36#include "xfs_fs.h"
37
38struct fid;
39struct vfs;
40struct cred;
41struct vnode;
42struct kstatfs;
43struct seq_file;
44struct super_block;
45struct xfs_mount_args;
46
47typedef struct kstatfs xfs_statfs_t;
48
49typedef struct vfs_sync_work {
50 struct list_head w_list;
51 struct vfs *w_vfs;
52 void *w_data; /* syncer routine argument */
53 void (*w_syncer)(struct vfs *, void *);
54} vfs_sync_work_t;
55
56typedef struct vfs {
57 u_int vfs_flag; /* flags */
58 xfs_fsid_t vfs_fsid; /* file system ID */
59 xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */
60 bhv_head_t vfs_bh; /* head of vfs behavior chain */
61 struct super_block *vfs_super; /* generic superblock pointer */
62 struct task_struct *vfs_sync_task; /* generalised sync thread */
63 vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */
64 struct list_head vfs_sync_list; /* sync thread work item list */
65 spinlock_t vfs_sync_lock; /* work item list lock */
66 int vfs_sync_seq; /* sync thread generation no. */
67 wait_queue_head_t vfs_wait_single_sync_task;
68 wait_queue_head_t vfs_wait_sync_task;
69} vfs_t;
70
71#define vfs_fbhv vfs_bh.bh_first /* 1st on vfs behavior chain */
72
73#define bhvtovfs(bdp) ( (struct vfs *)BHV_VOBJ(bdp) )
74#define bhvtovfsops(bdp) ( (struct vfsops *)BHV_OPS(bdp) )
75#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh )
76#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) )
77
78#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
79#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */
80#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
81
82typedef enum {
83 VFS_BHV_UNKNOWN, /* not specified */
84 VFS_BHV_XFS, /* xfs */
85 VFS_BHV_DM, /* data migration */
86 VFS_BHV_QM, /* quota manager */
87 VFS_BHV_IO, /* IO path */
88 VFS_BHV_END /* housekeeping end-of-range */
89} vfs_bhv_t;
90
91#define VFS_POSITION_XFS (BHV_POSITION_BASE)
92#define VFS_POSITION_DM (VFS_POSITION_BASE+10)
93#define VFS_POSITION_QM (VFS_POSITION_BASE+20)
94#define VFS_POSITION_IO (VFS_POSITION_BASE+30)
95
96#define VFS_RDONLY 0x0001 /* read-only vfs */
97#define VFS_GRPID 0x0002 /* group-ID assigned from directory */
98#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */
99#define VFS_UMOUNT 0x0008 /* unmount in progress */
100#define VFS_END 0x0008 /* max flag */
101
102#define SYNC_ATTR 0x0001 /* sync attributes */
103#define SYNC_CLOSE 0x0002 /* close file system down */
104#define SYNC_DELWRI 0x0004 /* look at delayed writes */
105#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
106#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
107#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
108#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
109#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
110
111typedef int (*vfs_mount_t)(bhv_desc_t *,
112 struct xfs_mount_args *, struct cred *);
113typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *,
114 struct xfs_mount_args *, int);
115typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *);
116typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *);
117typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *,
118 struct xfs_mount_args *);
119typedef int (*vfs_root_t)(bhv_desc_t *, struct vnode **);
120typedef int (*vfs_statvfs_t)(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
121typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *);
122typedef int (*vfs_vget_t)(bhv_desc_t *, struct vnode **, struct fid *);
123typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t);
124typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t);
125typedef void (*vfs_init_vnode_t)(bhv_desc_t *,
126 struct vnode *, bhv_desc_t *, int);
127typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int);
128typedef void (*vfs_freeze_t)(bhv_desc_t *);
129
130typedef struct vfsops {
131 bhv_position_t vf_position; /* behavior chain position */
132 vfs_mount_t vfs_mount; /* mount file system */
133 vfs_parseargs_t vfs_parseargs; /* parse mount options */
134 vfs_showargs_t vfs_showargs; /* unparse mount options */
135 vfs_unmount_t vfs_unmount; /* unmount file system */
136 vfs_mntupdate_t vfs_mntupdate; /* update file system options */
137 vfs_root_t vfs_root; /* get root vnode */
138 vfs_statvfs_t vfs_statvfs; /* file system statistics */
139 vfs_sync_t vfs_sync; /* flush files */
140 vfs_vget_t vfs_vget; /* get vnode from fid */
141 vfs_dmapiops_t vfs_dmapiops; /* data migration */
142 vfs_quotactl_t vfs_quotactl; /* disk quota */
143 vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */
144 vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */
145 vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */
146} vfsops_t;
147
148/*
149 * VFS operations. Operate on vfs structure pointers (start at the bhv head).
150 */
151#define VHEAD(v) ((v)->vfs_fbhv)
152#define VFS_MOUNT(v, ma,cr, rv) ((rv) = vfs_mount(VHEAD(v), ma,cr))
153#define VFS_PARSEARGS(v, o,ma,f, rv) ((rv) = vfs_parseargs(VHEAD(v), o,ma,f))
154#define VFS_SHOWARGS(v, m, rv) ((rv) = vfs_showargs(VHEAD(v), m))
155#define VFS_UNMOUNT(v, f, cr, rv) ((rv) = vfs_unmount(VHEAD(v), f,cr))
156#define VFS_MNTUPDATE(v, fl, args, rv) ((rv) = vfs_mntupdate(VHEAD(v), fl, args))
157#define VFS_ROOT(v, vpp, rv) ((rv) = vfs_root(VHEAD(v), vpp))
158#define VFS_STATVFS(v, sp,vp, rv) ((rv) = vfs_statvfs(VHEAD(v), sp,vp))
159#define VFS_SYNC(v, flag,cr, rv) ((rv) = vfs_sync(VHEAD(v), flag,cr))
160#define VFS_VGET(v, vpp,fidp, rv) ((rv) = vfs_vget(VHEAD(v), vpp,fidp))
161#define VFS_DMAPIOPS(v, p, rv) ((rv) = vfs_dmapiops(VHEAD(v), p))
162#define VFS_QUOTACTL(v, c,id,p, rv) ((rv) = vfs_quotactl(VHEAD(v), c,id,p))
163#define VFS_INIT_VNODE(v, vp,b,ul) ( vfs_init_vnode(VHEAD(v), vp,b,ul) )
164#define VFS_FORCE_SHUTDOWN(v, fl,f,l) ( vfs_force_shutdown(VHEAD(v), fl,f,l) )
165#define VFS_FREEZE(v) ( vfs_freeze(VHEAD(v)) )
166
167/*
168 * PVFS operations. Operate on behavior descriptor pointers.
169 */
170#define PVFS_MOUNT(b, ma,cr, rv) ((rv) = vfs_mount(b, ma,cr))
171#define PVFS_PARSEARGS(b, o,ma,f, rv) ((rv) = vfs_parseargs(b, o,ma,f))
172#define PVFS_SHOWARGS(b, m, rv) ((rv) = vfs_showargs(b, m))
173#define PVFS_UNMOUNT(b, f,cr, rv) ((rv) = vfs_unmount(b, f,cr))
174#define PVFS_MNTUPDATE(b, fl, args, rv) ((rv) = vfs_mntupdate(b, fl, args))
175#define PVFS_ROOT(b, vpp, rv) ((rv) = vfs_root(b, vpp))
176#define PVFS_STATVFS(b, sp,vp, rv) ((rv) = vfs_statvfs(b, sp,vp))
177#define PVFS_SYNC(b, flag,cr, rv) ((rv) = vfs_sync(b, flag,cr))
178#define PVFS_VGET(b, vpp,fidp, rv) ((rv) = vfs_vget(b, vpp,fidp))
179#define PVFS_DMAPIOPS(b, p, rv) ((rv) = vfs_dmapiops(b, p))
180#define PVFS_QUOTACTL(b, c,id,p, rv) ((rv) = vfs_quotactl(b, c,id,p))
181#define PVFS_INIT_VNODE(b, vp,b2,ul) ( vfs_init_vnode(b, vp,b2,ul) )
182#define PVFS_FORCE_SHUTDOWN(b, fl,f,l) ( vfs_force_shutdown(b, fl,f,l) )
183#define PVFS_FREEZE(b) ( vfs_freeze(b) )
184
185extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *);
186extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int);
187extern int vfs_showargs(bhv_desc_t *, struct seq_file *);
188extern int vfs_unmount(bhv_desc_t *, int, struct cred *);
189extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *);
190extern int vfs_root(bhv_desc_t *, struct vnode **);
191extern int vfs_statvfs(bhv_desc_t *, xfs_statfs_t *, struct vnode *);
192extern int vfs_sync(bhv_desc_t *, int, struct cred *);
193extern int vfs_vget(bhv_desc_t *, struct vnode **, struct fid *);
194extern int vfs_dmapiops(bhv_desc_t *, caddr_t);
195extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t);
196extern void vfs_init_vnode(bhv_desc_t *, struct vnode *, bhv_desc_t *, int);
197extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int);
198extern void vfs_freeze(bhv_desc_t *);
199
200typedef struct bhv_vfsops {
201 struct vfsops bhv_common;
202 void * bhv_custom;
203} bhv_vfsops_t;
204
205#define vfs_bhv_lookup(v, id) ( bhv_lookup_range(&(v)->vfs_bh, (id), (id)) )
206#define vfs_bhv_custom(b) ( ((bhv_vfsops_t *)BHV_OPS(b))->bhv_custom )
207#define vfs_bhv_set_custom(b,o) ( (b)->bhv_custom = (void *)(o))
208#define vfs_bhv_clr_custom(b) ( (b)->bhv_custom = NULL )
209
210extern vfs_t *vfs_allocate(void);
211extern void vfs_deallocate(vfs_t *);
212extern void vfs_insertops(vfs_t *, bhv_vfsops_t *);
213extern void vfs_insertbhv(vfs_t *, bhv_desc_t *, vfsops_t *, void *);
214
215extern void bhv_insert_all_vfsops(struct vfs *);
216extern void bhv_remove_all_vfsops(struct vfs *, int);
217extern void bhv_remove_vfsops(struct vfs *, int);
218
219#define fs_frozen(vfsp) ((vfsp)->vfs_super->s_frozen)
220#define fs_check_frozen(vfsp, level) \
221 vfs_check_frozen(vfsp->vfs_super, level);
222
223#endif /* __XFS_VFS_H__ */
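
The VFS_* and PVFS_* macros above return status through a trailing rv argument rather than expanding to an expression, so int-returning and void operations (VFS_FREEZE, VFS_INIT_VNODE) keep the same statement shape at call sites. A self-contained toy of that trailing-rv convention (TOY_SYNC, TOY_FREEZE, and their helpers are made-up names, not kernel macros):

	#include <stdio.h>

	/* Hypothetical macros following the same trailing-rv convention
	 * as VFS_SYNC(v, flag, cr, rv) and VFS_FREEZE(v). */
	#define TOY_SYNC(obj, flags, rv)	((rv) = toy_sync((obj), (flags)))
	#define TOY_FREEZE(obj)			( toy_freeze(obj) )

	static int toy_sync(int obj, int flags)
	{
		printf("sync obj=%d flags=%#x\n", obj, flags);
		return 0;	/* 0 == success, as with the vfs_* calls */
	}

	static void toy_freeze(int obj)
	{
		printf("freeze obj=%d\n", obj);
	}

	int main(void)
	{
		int error;

		TOY_SYNC(42, 0x0008 /* cf. SYNC_WAIT */, error);
		if (error)
			return error;
		TOY_FREEZE(42);		/* void op, same statement shape */
		return 0;
	}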
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
new file mode 100644
index 000000000000..849c61c74f3c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -0,0 +1,455 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35
36uint64_t vn_generation; /* vnode generation number */
37DEFINE_SPINLOCK(vnumber_lock);
38
39/*
40 * Dedicated vnode inactive/reclaim sync semaphores.
41 * Prime number of hash buckets since address is used as the key.
42 */
43#define NVSYNC 37
44#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
45sv_t vsync[NVSYNC];
46
47/*
48 * Translate stat(2) file types to vnode types and vice versa.
49 * Aware of numeric order of S_IFMT and vnode type values.
50 */
51enum vtype iftovt_tab[] = {
52 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
53 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
54};
55
56u_short vttoif_tab[] = {
57 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
58};
59
60
61void
62vn_init(void)
63{
64 register sv_t *svp;
65 register int i;
66
67 for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
68 init_sv(svp, SV_DEFAULT, "vsy", i);
69}
70
71/*
72 * Clean a vnode of filesystem-specific data and prepare it for reuse.
73 */
74STATIC int
75vn_reclaim(
76 struct vnode *vp)
77{
78 int error;
79
80 XFS_STATS_INC(vn_reclaim);
81 vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
82
83 /*
84 * Only make the VOP_RECLAIM call if there are behaviors
85 * to call.
86 */
87 if (vp->v_fbhv) {
88 VOP_RECLAIM(vp, error);
89 if (error)
90 return -error;
91 }
92 ASSERT(vp->v_fbhv == NULL);
93
94 VN_LOCK(vp);
95 vp->v_flag &= (VRECLM|VWAIT);
96 VN_UNLOCK(vp, 0);
97
98 vp->v_type = VNON;
99 vp->v_fbhv = NULL;
100
101#ifdef XFS_VNODE_TRACE
102 ktrace_free(vp->v_trace);
103 vp->v_trace = NULL;
104#endif
105
106 return 0;
107}
108
109STATIC void
110vn_wakeup(
111 struct vnode *vp)
112{
113 VN_LOCK(vp);
114 if (vp->v_flag & VWAIT)
115 sv_broadcast(vptosync(vp));
116 vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
117 VN_UNLOCK(vp, 0);
118}
119
120int
121vn_wait(
122 struct vnode *vp)
123{
124 VN_LOCK(vp);
125 if (vp->v_flag & (VINACT | VRECLM)) {
126 vp->v_flag |= VWAIT;
127 sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
128 return 1;
129 }
130 VN_UNLOCK(vp, 0);
131 return 0;
132}
133
134struct vnode *
135vn_initialize(
136 struct inode *inode)
137{
138 struct vnode *vp = LINVFS_GET_VP(inode);
139
140 XFS_STATS_INC(vn_active);
141 XFS_STATS_INC(vn_alloc);
142
143 vp->v_flag = VMODIFIED;
144 spinlock_init(&vp->v_lock, "v_lock");
145
146 spin_lock(&vnumber_lock);
147 if (!++vn_generation) /* v_number shouldn't be zero */
148 vn_generation++;
149 vp->v_number = vn_generation;
150 spin_unlock(&vnumber_lock);
151
152 ASSERT(VN_CACHED(vp) == 0);
153
154 /* Initialize the first behavior and the behavior chain head. */
155 vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
156
157#ifdef XFS_VNODE_TRACE
158 vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
159 printk("Allocated VNODE_TRACE at 0x%p\n", vp->v_trace);
160#endif /* XFS_VNODE_TRACE */
161
162 vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
163 return vp;
164}
165
166/*
167 * Get a reference on a vnode.
168 */
169vnode_t *
170vn_get(
171 struct vnode *vp,
172 vmap_t *vmap)
173{
174 struct inode *inode;
175
176 XFS_STATS_INC(vn_get);
177 inode = LINVFS_GET_IP(vp);
178 if (inode->i_state & I_FREEING)
179 return NULL;
180
181 inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino);
182 if (!inode) /* Inode not present */
183 return NULL;
184
185 vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
186
187 return vp;
188}
189
190/*
191 * Revalidate the Linux inode from the vattr.
192 * Note: i_size _not_ updated; we must hold the inode
193 * semaphore when doing that - callers responsibility.
194 */
195void
196vn_revalidate_core(
197 struct vnode *vp,
198 vattr_t *vap)
199{
200 struct inode *inode = LINVFS_GET_IP(vp);
201
202 inode->i_mode = VTTOIF(vap->va_type) | vap->va_mode;
203 inode->i_nlink = vap->va_nlink;
204 inode->i_uid = vap->va_uid;
205 inode->i_gid = vap->va_gid;
206 inode->i_blocks = vap->va_nblocks;
207 inode->i_mtime = vap->va_mtime;
208 inode->i_ctime = vap->va_ctime;
209 inode->i_atime = vap->va_atime;
210 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
211 inode->i_flags |= S_IMMUTABLE;
212 else
213 inode->i_flags &= ~S_IMMUTABLE;
214 if (vap->va_xflags & XFS_XFLAG_APPEND)
215 inode->i_flags |= S_APPEND;
216 else
217 inode->i_flags &= ~S_APPEND;
218 if (vap->va_xflags & XFS_XFLAG_SYNC)
219 inode->i_flags |= S_SYNC;
220 else
221 inode->i_flags &= ~S_SYNC;
222 if (vap->va_xflags & XFS_XFLAG_NOATIME)
223 inode->i_flags |= S_NOATIME;
224 else
225 inode->i_flags &= ~S_NOATIME;
226}
227
228/*
229 * Revalidate the Linux inode from the vnode.
230 */
231int
232vn_revalidate(
233 struct vnode *vp)
234{
235 vattr_t va;
236 int error;
237
238 vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
239 ASSERT(vp->v_fbhv != NULL);
240
241 va.va_mask = XFS_AT_STAT|XFS_AT_XFLAGS;
242 VOP_GETATTR(vp, &va, 0, NULL, error);
243 if (!error) {
244 vn_revalidate_core(vp, &va);
245 VUNMODIFY(vp);
246 }
247 return -error;
248}
249
250/*
251 * Purge a vnode from the cache.
252 * At this point the vnode is guaranteed to have no references (vn_count == 0).
253 * The caller has to make sure that there are no ways someone could
254 * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
255 */
256void
257vn_purge(
258 struct vnode *vp,
259 vmap_t *vmap)
260{
261 vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
262
263again:
264 /*
265 * Check whether vp has already been reclaimed since our caller
266 * sampled its version while holding a filesystem cache lock that
267 * its VOP_RECLAIM function acquires.
268 */
269 VN_LOCK(vp);
270 if (vp->v_number != vmap->v_number) {
271 VN_UNLOCK(vp, 0);
272 return;
273 }
274
275 /*
276 * If vp is being reclaimed or inactivated, wait until it is inert,
277 * then proceed. Can't assume that vnode is actually reclaimed
278 * just because the reclaimed flag is asserted -- a vn_alloc
279 * reclaim can fail.
280 */
281 if (vp->v_flag & (VINACT | VRECLM)) {
282 ASSERT(vn_count(vp) == 0);
283 vp->v_flag |= VWAIT;
284 sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
285 goto again;
286 }
287
288 /*
289 * Another process could have raced in and gotten this vnode...
290 */
291 if (vn_count(vp) > 0) {
292 VN_UNLOCK(vp, 0);
293 return;
294 }
295
296 XFS_STATS_DEC(vn_active);
297 vp->v_flag |= VRECLM;
298 VN_UNLOCK(vp, 0);
299
300 /*
301 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
302 * vp's filesystem to flush and invalidate all cached resources.
303 * When vn_reclaim returns, vp should have no private data,
304 * either in a system cache or attached to v_data.
305 */
306 if (vn_reclaim(vp) != 0)
307 panic("vn_purge: cannot reclaim");
308
309 /*
310 * Wakeup anyone waiting for vp to be reclaimed.
311 */
312 vn_wakeup(vp);
313}
314
315/*
316 * Add a reference to a referenced vnode.
317 */
318struct vnode *
319vn_hold(
320 struct vnode *vp)
321{
322 struct inode *inode;
323
324 XFS_STATS_INC(vn_hold);
325
326 VN_LOCK(vp);
327 inode = igrab(LINVFS_GET_IP(vp));
328 ASSERT(inode);
329 VN_UNLOCK(vp, 0);
330
331 return vp;
332}
333
334/*
335 * Call VOP_INACTIVE on last reference.
336 */
337void
338vn_rele(
339 struct vnode *vp)
340{
341 int vcnt;
342 int cache;
343
344 XFS_STATS_INC(vn_rele);
345
346 VN_LOCK(vp);
347
348 vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
349 vcnt = vn_count(vp);
350
351 /*
352 * Since we always get called from put_inode we know
353 * that i_count won't be decremented after we
354 * return.
355 */
356 if (!vcnt) {
357 /*
358 * As soon as we turn this on, no one can find us in vn_get
359 * until we turn off VINACT or VRECLM.
360 */
361 vp->v_flag |= VINACT;
362 VN_UNLOCK(vp, 0);
363
364 /*
365 * Do not make the VOP_INACTIVE call if there
366 * are no behaviors attached to the vnode to call.
367 */
368 if (vp->v_fbhv)
369 VOP_INACTIVE(vp, NULL, cache);
370
371 VN_LOCK(vp);
372 if (vp->v_flag & VWAIT)
373 sv_broadcast(vptosync(vp));
374
375 vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
376 }
377
378 VN_UNLOCK(vp, 0);
379
380 vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
381}
382
383/*
384 * Finish the removal of a vnode.
385 */
386void
387vn_remove(
388 struct vnode *vp)
389{
390 vmap_t vmap;
391
392 /* Make sure we don't do this to the same vnode twice */
393 if (!(vp->v_fbhv))
394 return;
395
396 XFS_STATS_INC(vn_remove);
397 vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
398
399 /*
400 * After the following purge the vnode
401 * will no longer exist.
402 */
403 VMAP(vp, vmap);
404 vn_purge(vp, &vmap);
405}
406
407
408#ifdef XFS_VNODE_TRACE
409
410#define KTRACE_ENTER(vp, vk, s, line, ra) \
411 ktrace_enter( (vp)->v_trace, \
412/* 0 */ (void *)(__psint_t)(vk), \
413/* 1 */ (void *)(s), \
414/* 2 */ (void *)(__psint_t) line, \
415/* 3 */ (void *)(vn_count(vp)), \
416/* 4 */ (void *)(ra), \
417/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \
418/* 6 */ (void *)(__psint_t)current_cpu(), \
419/* 7 */ (void *)(__psint_t)current_pid(), \
420/* 8 */ (void *)__return_address, \
421/* 9 */ 0, 0, 0, 0, 0, 0, 0)
422
423/*
424 * Vnode tracing code.
425 */
426void
427vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
428{
429 KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
430}
431
432void
433vn_trace_exit(vnode_t *vp, char *func, inst_t *ra)
434{
435 KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
436}
437
438void
439vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
440{
441 KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
442}
443
444void
445vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
446{
447 KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
448}
449
450void
451vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
452{
453 KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
454}
455#endif /* XFS_VNODE_TRACE */
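
The "prime number of hash buckets" remark near the top of this file is about stride: vnode addresses come from fixed-size, aligned allocations, so consecutive keys differ by a constant that shares factors with any power-of-two table size, folding them into a few buckets. A prime bucket count is coprime to such strides. A self-contained demonstration of the effect (the constants are illustrative; only NVSYNC == 37 comes from the code above):

	#include <stdio.h>

	#define NPRIME	37	/* cf. NVSYNC */
	#define NPOW2	32

	int main(void)
	{
		/* Simulate addresses of 64-byte-aligned objects, as vnodes
		 * handed out by a slab-style allocator might be. */
		int prime_hits[NPRIME] = { 0 };
		int pow2_hits[NPOW2] = { 0 };
		unsigned long addr;
		int i, pworst = 0, qworst = 0;

		for (addr = 0x10000; addr < 0x10000 + 64UL * 1000; addr += 64) {
			prime_hits[addr % NPRIME]++;
			pow2_hits[addr % NPOW2]++;
		}
		for (i = 0; i < NPRIME; i++)
			if (prime_hits[i] > pworst)
				pworst = prime_hits[i];
		for (i = 0; i < NPOW2; i++)
			if (pow2_hits[i] > qworst)
				qworst = pow2_hits[i];

		/* mod 37 spreads the 1000 keys ~27 per bucket; mod 32
		 * drops all of them into a single bucket. */
		printf("worst bucket, mod %d: %d\n", NPRIME, pworst);
		printf("worst bucket, mod %d: %d\n", NPOW2, qworst);
		return 0;
	}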
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
new file mode 100644
index 000000000000..da76c1f1e11c
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -0,0 +1,666 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 * Portions Copyright (c) 1989, 1993
33 * The Regents of the University of California. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59#ifndef __XFS_VNODE_H__
60#define __XFS_VNODE_H__
61
62struct uio;
63struct file;
64struct vattr;
65struct xfs_iomap;
66struct attrlist_cursor_kern;
67
68/*
69 * Vnode types. VNON means no type.
70 */
71enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };
72
73typedef xfs_ino_t vnumber_t;
74typedef struct dentry vname_t;
75typedef bhv_head_t vn_bhv_head_t;
76
77/*
78 * MP locking protocols:
79 * v_flag, v_vfsp VN_LOCK/VN_UNLOCK
80 * v_type read-only or fs-dependent
81 */
82typedef struct vnode {
83 __u32 v_flag; /* vnode flags (see below) */
84 enum vtype v_type; /* vnode type */
85 struct vfs *v_vfsp; /* ptr to containing VFS */
86 vnumber_t v_number; /* in-core vnode number */
87 vn_bhv_head_t v_bh; /* behavior head */
88 spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */
89 struct inode v_inode; /* Linux inode */
90#ifdef XFS_VNODE_TRACE
91 struct ktrace *v_trace; /* trace header structure */
92#endif
93} vnode_t;
94
95#define v_fbhv v_bh.bh_first /* first behavior */
96#define v_fops v_bh.bh_first->bd_ops /* first behavior ops */
97
98#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */
99#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */
100#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */
101
102typedef enum {
103 VN_BHV_UNKNOWN, /* not specified */
104 VN_BHV_XFS, /* xfs */
105 VN_BHV_DM, /* data migration */
106 VN_BHV_QM, /* quota manager */
107 VN_BHV_IO, /* IO path */
108 VN_BHV_END /* housekeeping end-of-range */
109} vn_bhv_t;
110
111#define VNODE_POSITION_XFS (VNODE_POSITION_BASE)
112#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10)
113#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20)
114#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30)
115
116/*
117 * Macros for dealing with the behavior descriptor inside of the vnode.
118 */
119#define BHV_TO_VNODE(bdp) ((vnode_t *)BHV_VOBJ(bdp))
120#define BHV_TO_VNODE_NULL(bdp) ((vnode_t *)BHV_VOBJNULL(bdp))
121
122#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh)))
123#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name)
124#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp)
125#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops)
126#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
127
128/*
129 * Vnode to Linux inode mapping.
130 */
131#define LINVFS_GET_VP(inode) ((vnode_t *)list_entry(inode, vnode_t, v_inode))
132#define LINVFS_GET_IP(vp) (&(vp)->v_inode)
133
134/*
135 * Convert between vnode types and inode formats (since POSIX.1
136 * defines mode word of stat structure in terms of inode formats).
137 */
138extern enum vtype iftovt_tab[];
139extern u_short vttoif_tab[];
140#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
141#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
142#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
143
144
145/*
146 * Vnode flags.
147 */
148#define VINACT 0x1 /* vnode is being inactivated */
149#define VRECLM 0x2 /* vnode is being reclaimed */
150#define VWAIT 0x4 /* waiting for VINACT/VRECLM to end */
151#define VMODIFIED 0x8 /* XFS inode state possibly differs */
152						/* from the Linux inode state. */
153
154/*
155 * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter.
156 */
157typedef enum vrwlock {
158 VRWLOCK_NONE,
159 VRWLOCK_READ,
160 VRWLOCK_WRITE,
161 VRWLOCK_WRITE_DIRECT,
162 VRWLOCK_TRY_READ,
163 VRWLOCK_TRY_WRITE
164} vrwlock_t;
165
166/*
167 * Return values for VOP_INACTIVE. A return value of
168 * VN_INACTIVE_NOCACHE implies that the file system behavior
169 * has disassociated its state and bhv_desc_t from the vnode.
170 */
171#define VN_INACTIVE_CACHE 0
172#define VN_INACTIVE_NOCACHE 1
173
174/*
175 * Values for the cmd code given to VOP_VNODE_CHANGE.
176 */
177typedef enum vchange {
178 VCHANGE_FLAGS_FRLOCKS = 0,
179 VCHANGE_FLAGS_ENF_LOCKING = 1,
180 VCHANGE_FLAGS_TRUNCATED = 2,
181 VCHANGE_FLAGS_PAGE_DIRTY = 3,
182 VCHANGE_FLAGS_IOEXCL_COUNT = 4
183} vchange_t;
184
185
186typedef int (*vop_open_t)(bhv_desc_t *, struct cred *);
187typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *,
188 const struct iovec *, unsigned int,
189 loff_t *, int, struct cred *);
190typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
191 const struct iovec *, unsigned int,
192 loff_t *, int, struct cred *);
193typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
194 loff_t *, int, size_t, read_actor_t,
195 void *, struct cred *);
196typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
197 int, unsigned int, void __user *);
198typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
199 struct cred *);
200typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
201 struct cred *);
202typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *);
203typedef int (*vop_lookup_t)(bhv_desc_t *, vname_t *, vnode_t **,
204 int, vnode_t *, struct cred *);
205typedef int (*vop_create_t)(bhv_desc_t *, vname_t *, struct vattr *,
206 vnode_t **, struct cred *);
207typedef int (*vop_remove_t)(bhv_desc_t *, vname_t *, struct cred *);
208typedef int (*vop_link_t)(bhv_desc_t *, vnode_t *, vname_t *,
209 struct cred *);
210typedef int (*vop_rename_t)(bhv_desc_t *, vname_t *, vnode_t *, vname_t *,
211 struct cred *);
212typedef int (*vop_mkdir_t)(bhv_desc_t *, vname_t *, struct vattr *,
213 vnode_t **, struct cred *);
214typedef int (*vop_rmdir_t)(bhv_desc_t *, vname_t *, struct cred *);
215typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
216 int *);
217typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *,
218 char *, vnode_t **, struct cred *);
219typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int,
220 struct cred *);
221typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *,
222 xfs_off_t, xfs_off_t);
223typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *);
224typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *);
225typedef int (*vop_release_t)(bhv_desc_t *);
226typedef int (*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
227typedef void (*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
228typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int,
229 struct xfs_iomap *, int *);
230typedef int (*vop_reclaim_t)(bhv_desc_t *);
231typedef int (*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int,
232 struct cred *);
233typedef int (*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int,
234 struct cred *);
235typedef int (*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *);
236typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
237 struct attrlist_cursor_kern *, struct cred *);
238typedef void (*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
239typedef void (*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
240typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
241typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
242typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t,
243 uint64_t, int);
244typedef int (*vop_iflush_t)(bhv_desc_t *, int);
245
246
247typedef struct vnodeops {
248 bhv_position_t vn_position; /* position within behavior chain */
249 vop_open_t vop_open;
250 vop_read_t vop_read;
251 vop_write_t vop_write;
252 vop_sendfile_t vop_sendfile;
253 vop_ioctl_t vop_ioctl;
254 vop_getattr_t vop_getattr;
255 vop_setattr_t vop_setattr;
256 vop_access_t vop_access;
257 vop_lookup_t vop_lookup;
258 vop_create_t vop_create;
259 vop_remove_t vop_remove;
260 vop_link_t vop_link;
261 vop_rename_t vop_rename;
262 vop_mkdir_t vop_mkdir;
263 vop_rmdir_t vop_rmdir;
264 vop_readdir_t vop_readdir;
265 vop_symlink_t vop_symlink;
266 vop_readlink_t vop_readlink;
267 vop_fsync_t vop_fsync;
268 vop_inactive_t vop_inactive;
269 vop_fid2_t vop_fid2;
270 vop_rwlock_t vop_rwlock;
271 vop_rwunlock_t vop_rwunlock;
272 vop_bmap_t vop_bmap;
273 vop_reclaim_t vop_reclaim;
274 vop_attr_get_t vop_attr_get;
275 vop_attr_set_t vop_attr_set;
276 vop_attr_remove_t vop_attr_remove;
277 vop_attr_list_t vop_attr_list;
278 vop_link_removed_t vop_link_removed;
279 vop_vnode_change_t vop_vnode_change;
280 vop_ptossvp_t vop_tosspages;
281 vop_pflushinvalvp_t vop_flushinval_pages;
282 vop_pflushvp_t vop_flush_pages;
283 vop_release_t vop_release;
284 vop_iflush_t vop_iflush;
285} vnodeops_t;
286
287/*
288 * VOP's.
289 */
290#define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op)
291
292#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \
293 rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
294#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \
295 rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
296#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \
297 rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
298#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \
299 rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
300#define VOP_OPEN(vp, cr, rv) \
301 rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr)
302#define VOP_GETATTR(vp, vap, f, cr, rv) \
303 rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr)
304#define VOP_SETATTR(vp, vap, f, cr, rv) \
305 rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr)
306#define VOP_ACCESS(vp, mode, cr, rv) \
307 rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr)
308#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv) \
309 rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr)
310#define VOP_CREATE(dvp,d,vap,vpp,cr,rv) \
311 rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr)
312#define VOP_REMOVE(dvp,d,cr,rv) \
313 rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr)
314#define VOP_LINK(tdvp,fvp,d,cr,rv) \
315 rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr)
316#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv) \
317 rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr)
318#define VOP_MKDIR(dp,d,vap,vpp,cr,rv) \
319 rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr)
320#define VOP_RMDIR(dp,d,cr,rv) \
321 rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr)
322#define VOP_READDIR(vp,uiop,cr,eofp,rv) \
323 rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp)
324#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \
325 rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr)
326#define VOP_READLINK(vp,uiop,fl,cr,rv) \
327 rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr)
328#define VOP_FSYNC(vp,f,cr,b,e,rv) \
329 rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e)
330#define VOP_INACTIVE(vp, cr, rv) \
331 rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr)
332#define VOP_RELEASE(vp, rv) \
333 rv = _VOP_(vop_release, vp)((vp)->v_fbhv)
334#define VOP_FID2(vp, fidp, rv) \
335 rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp)
336#define VOP_RWLOCK(vp,i) \
337 (void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
338#define VOP_RWLOCK_TRY(vp,i) \
339 _VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
340#define VOP_RWUNLOCK(vp,i) \
341 (void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i)
342#define VOP_FRLOCK(vp,c,fl,flags,offset,fr,rv) \
343 rv = _VOP_(vop_frlock, vp)((vp)->v_fbhv,c,fl,flags,offset,fr)
344#define VOP_RECLAIM(vp, rv) \
345 rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv)
346#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv) \
347 rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred)
348#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv) \
349 rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred)
350#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv) \
351 rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred)
352#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv) \
353 rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred)
354#define VOP_LINK_REMOVED(vp, dvp, linkzero) \
355 (void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero)
356#define VOP_VNODE_CHANGE(vp, cmd, val) \
357 (void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val)
358/*
359 * These are page cache functions that now go thru VOPs.
360 * 'last' parameter is unused and left in for IRIX compatibility
361 */
362#define VOP_TOSS_PAGES(vp, first, last, fiopt) \
363 _VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt)
364/*
365 * 'last' parameter is unused and left in for IRIX compatibility
366 */
367#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt) \
368 _VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt)
369/*
370 * 'last' parameter is unused and left in for IRIX compatibility
371 */
372#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \
373 rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt)
374#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \
375 rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg)
376#define VOP_IFLUSH(vp, flags, rv) \
377 rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags)
378
379/*
380 * Flags for read/write calls - same values as IRIX
381 */
382#define IO_ISAIO 0x00001 /* don't wait for completion */
383#define IO_ISDIRECT 0x00004 /* bypass page cache */
384#define IO_INVIS 0x00020 /* don't update inode timestamps */
385
386/*
387 * Flags for VOP_IFLUSH call
388 */
389#define FLUSH_SYNC 1 /* wait for flush to complete */
390#define FLUSH_INODE 2 /* flush the inode itself */
391#define FLUSH_LOG 4 /* force the last log entry for
392 * this inode out to disk */
393
394/*
395 * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
396 * VOP_FLUSH_PAGES.
397 */
398#define FI_NONE 0 /* none */
399#define FI_REMAPF 1 /* Do a remapf prior to the operation */
400#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
401 Prevent VM access to the pages until
402 the operation completes. */
403
404/*
405 * Vnode attributes. va_mask indicates those attributes the caller
406 * wants to set or extract.
407 */
408typedef struct vattr {
409 int va_mask; /* bit-mask of attributes present */
410 enum vtype va_type; /* vnode type (for create) */
411 mode_t va_mode; /* file access mode and type */
412 nlink_t va_nlink; /* number of references to file */
413 uid_t va_uid; /* owner user id */
414 gid_t va_gid; /* owner group id */
415 xfs_ino_t va_nodeid; /* file id */
416 xfs_off_t va_size; /* file size in bytes */
417 u_long va_blocksize; /* blocksize preferred for i/o */
418 struct timespec va_atime; /* time of last access */
419 struct timespec va_mtime; /* time of last modification */
420 struct timespec va_ctime; /* time file changed */
421 u_int va_gen; /* generation number of file */
422 xfs_dev_t va_rdev; /* device the special file represents */
423 __int64_t va_nblocks; /* number of blocks allocated */
424 u_long va_xflags; /* random extended file flags */
425 u_long va_extsize; /* file extent size */
426 u_long va_nextents; /* number of extents in file */
427 u_long va_anextents; /* number of attr extents in file */
428 int va_projid; /* project id */
429} vattr_t;
430
431/*
432 * setattr or getattr attributes
433 */
434#define XFS_AT_TYPE 0x00000001
435#define XFS_AT_MODE 0x00000002
436#define XFS_AT_UID 0x00000004
437#define XFS_AT_GID 0x00000008
438#define XFS_AT_FSID 0x00000010
439#define XFS_AT_NODEID 0x00000020
440#define XFS_AT_NLINK 0x00000040
441#define XFS_AT_SIZE 0x00000080
442#define XFS_AT_ATIME 0x00000100
443#define XFS_AT_MTIME 0x00000200
444#define XFS_AT_CTIME 0x00000400
445#define XFS_AT_RDEV 0x00000800
446#define XFS_AT_BLKSIZE 0x00001000
447#define XFS_AT_NBLOCKS 0x00002000
448#define XFS_AT_VCODE 0x00004000
449#define XFS_AT_MAC 0x00008000
450#define XFS_AT_UPDATIME 0x00010000
451#define XFS_AT_UPDMTIME 0x00020000
452#define XFS_AT_UPDCTIME 0x00040000
453#define XFS_AT_ACL 0x00080000
454#define XFS_AT_CAP 0x00100000
455#define XFS_AT_INF 0x00200000
456#define XFS_AT_XFLAGS 0x00400000
457#define XFS_AT_EXTSIZE 0x00800000
458#define XFS_AT_NEXTENTS 0x01000000
459#define XFS_AT_ANEXTENTS 0x02000000
460#define XFS_AT_PROJID 0x04000000
461#define XFS_AT_SIZE_NOPERM 0x08000000
462#define XFS_AT_GENCOUNT 0x10000000
463
464#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
465 XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
466 XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
467 XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
468 XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
469 XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
470
471#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
472 XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
473 XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
474 XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
475
476#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
477
478#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
479
480#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
481 XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
482 XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
483
484/*
485 * Modes.
486 */
487#define VSUID S_ISUID /* set user id on execution */
488#define VSGID S_ISGID /* set group id on execution */
489#define VSVTX S_ISVTX /* save swapped text even after use */
490#define VREAD S_IRUSR /* read, write, execute permissions */
491#define VWRITE S_IWUSR
492#define VEXEC S_IXUSR
493
494#define MODEMASK S_IALLUGO /* mode bits plus permission bits */
495
496/*
497 * Check whether mandatory file locking is enabled.
498 */
499#define MANDLOCK(vp, mode) \
500 ((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
501
502extern void vn_init(void);
503extern int vn_wait(struct vnode *);
504extern vnode_t *vn_initialize(struct inode *);
505
506/*
507 * Acquiring and invalidating vnodes:
508 *
509 * if (vn_get(vp, version, 0))
510 * ...;
511 * vn_purge(vp, version);
512 *
513 * vn_get and vn_purge must be called with vmap_t arguments, sampled
514 * while a lock that the vnode's VOP_RECLAIM function acquires is
515 * held, to ensure that the vnode sampled with the lock held isn't
516 * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
517 * and the subsequent vn_get or vn_purge.
518 */
519
520/*
521 * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
522 */
523typedef struct vnode_map {
524 vfs_t *v_vfsp;
525 vnumber_t v_number; /* in-core vnode number */
526 xfs_ino_t v_ino; /* inode # */
527} vmap_t;
528
529#define VMAP(vp, vmap) {(vmap).v_vfsp = (vp)->v_vfsp, \
530 (vmap).v_number = (vp)->v_number, \
531 (vmap).v_ino = (vp)->v_inode.i_ino; }
532
533extern void vn_purge(struct vnode *, vmap_t *);
534extern vnode_t *vn_get(struct vnode *, vmap_t *);
535extern int vn_revalidate(struct vnode *);
536extern void vn_revalidate_core(struct vnode *, vattr_t *);
537extern void vn_remove(struct vnode *);
538
539static inline int vn_count(struct vnode *vp)
540{
541 return atomic_read(&LINVFS_GET_IP(vp)->i_count);
542}
543
544/*
545 * Vnode reference counting functions (and macros for compatibility).
546 */
547extern vnode_t *vn_hold(struct vnode *);
548extern void vn_rele(struct vnode *);
549
550#if defined(XFS_VNODE_TRACE)
551#define VN_HOLD(vp) \
552 ((void)vn_hold(vp), \
553 vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
554#define VN_RELE(vp) \
555 (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
556 iput(LINVFS_GET_IP(vp)))
557#else
558#define VN_HOLD(vp) ((void)vn_hold(vp))
559#define VN_RELE(vp) (iput(LINVFS_GET_IP(vp)))
560#endif
561
562/*
563 * Vname handling macros.
564 */
565#define VNAME(dentry) ((char *) (dentry)->d_name.name)
566#define VNAMELEN(dentry) ((dentry)->d_name.len)
567#define VNAME_TO_VNODE(dentry) (LINVFS_GET_VP((dentry)->d_inode))
568
569/*
570 * Vnode spinlock manipulation.
571 */
572#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock)
573#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s)
574#define VN_FLAGSET(vp,b) vn_flagset(vp,b)
575#define VN_FLAGCLR(vp,b) vn_flagclr(vp,b)
576
577static __inline__ void vn_flagset(struct vnode *vp, uint flag)
578{
579 spin_lock(&vp->v_lock);
580 vp->v_flag |= flag;
581 spin_unlock(&vp->v_lock);
582}
583
584static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
585{
586 spin_lock(&vp->v_lock);
587 vp->v_flag &= ~flag;
588 spin_unlock(&vp->v_lock);
589}
590
591/*
592 * Update modify/access/change times on the vnode
593 */
594#define VN_MTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_mtime = *(tvp))
595#define VN_ATIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_atime = *(tvp))
596#define VN_CTIMESET(vp, tvp) (LINVFS_GET_IP(vp)->i_ctime = *(tvp))
597
598/*
599 * Dealing with bad inodes
600 */
601static inline void vn_mark_bad(struct vnode *vp)
602{
603 make_bad_inode(LINVFS_GET_IP(vp));
604}
605
606static inline int VN_BAD(struct vnode *vp)
607{
608 return is_bad_inode(LINVFS_GET_IP(vp));
609}
610
611/*
612 * Some useful predicates.
613 */
614#define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
615#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
616#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
617 PAGECACHE_TAG_DIRTY)
618#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
619#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED)
620
621/*
622 * Flags to VOP_SETATTR/VOP_GETATTR.
623 */
624#define ATTR_UTIME 0x01 /* non-default utime(2) request */
625#define ATTR_DMI 0x08 /* invocation from a DMI function */
626#define ATTR_LAZY 0x80 /* set/get attributes lazily */
627#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */
628
629/*
630 * Flags to VOP_FSYNC and VOP_RECLAIM.
631 */
632#define FSYNC_NOWAIT 0 /* asynchronous flush */
633#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
634#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
635#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
636
637/*
638 * Tracking vnode activity.
639 */
640#if defined(XFS_VNODE_TRACE)
641
642#define VNODE_TRACE_SIZE 16 /* number of trace entries */
643#define VNODE_KTRACE_ENTRY 1
644#define VNODE_KTRACE_EXIT 2
645#define VNODE_KTRACE_HOLD 3
646#define VNODE_KTRACE_REF 4
647#define VNODE_KTRACE_RELE 5
648
649extern void vn_trace_entry(struct vnode *, char *, inst_t *);
650extern void vn_trace_exit(struct vnode *, char *, inst_t *);
651extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
652extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
653extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
654
655#define VN_TRACE(vp) \
656 vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
657#else
658#define vn_trace_entry(a,b,c)
659#define vn_trace_exit(a,b,c)
660#define vn_trace_hold(a,b,c,d)
661#define vn_trace_ref(a,b,c,d)
662#define vn_trace_rele(a,b,c,d)
663#define VN_TRACE(vp)
664#endif
665
666#endif /* __XFS_VNODE_H__ */
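
IFTOVT() and VTTOIF() above rely on S_IFMT occupying bits 12-15 of the mode word, so (mode & S_IFMT) >> 12 is a dense index from 0 to 15 into iftovt_tab[], and vttoif_tab[] inverts the mapping, indexed by vtype. A self-contained round trip through the same two tables (userspace re-declarations for illustration; the table contents mirror xfs_vnode.c):

	#include <stdio.h>
	#include <sys/stat.h>

	enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };

	/* Indexed by (mode & S_IFMT) >> 12, as in iftovt_tab above. */
	static const enum vtype iftovt_tab[16] = {
		VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
		VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
	};

	/* Indexed by vtype, as in vttoif_tab above. */
	static const unsigned short vttoif_tab[] = {
		0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
	};

	#define IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
	#define VTTOIF(indx)	(vttoif_tab[(int)(indx)])

	int main(void)
	{
		mode_t mode = S_IFREG | 0644;
		enum vtype vt = IFTOVT(mode);

		/* Round trip: S_IFREG (0100000) -> VREG -> S_IFREG. */
		printf("vtype %d maps back to mode bits 0%o\n",
		       (int)vt, (unsigned)VTTOIF(vt));
		return 0;
	}

The VNON entries in iftovt_tab correspond to unused S_IFMT encodings; MAKEIMODE() then simply ORs the permission bits back in.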
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
new file mode 100644
index 000000000000..740d20d33187
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -0,0 +1,1648 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_space.h"
68#include "xfs_trans_priv.h"
69
70#include "xfs_qm.h"
71
72
73/*
74 LOCK ORDER
75
76 inode lock (ilock)
77 dquot hash-chain lock (hashlock)
 78 xqm dquot freelist lock (freelistlock)
79 mount's dquot list lock (mplistlock)
80 user dquot lock - lock ordering among dquots is based on the uid or gid
81 group dquot lock - similar to udquots. Between the two dquots, the udquot
82 has to be locked first.
83 pin lock - the dquot lock must be held to take this lock.
84 flush lock - ditto.
85*/
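/*
 * To illustrate (a hedged sketch; the helpers are the usual XFS locking
 * entry points, but this exact sequence is hypothetical): a caller
 * needing both dquots of an inode takes the locks in the order listed
 * above, user dquot strictly before group dquot:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_dqlock(udqp);
 *	xfs_dqlock(gdqp);
 *	...
 *	xfs_dqunlock(gdqp);
 *	xfs_dqunlock(udqp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *
 * The pin or flush lock of a dquot may then be taken only while that
 * dquot's lock is held.
 */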
86
87STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
88
89#ifdef DEBUG
90xfs_buftarg_t *xfs_dqerror_target;
91int xfs_do_dqerror;
92int xfs_dqreq_num;
93int xfs_dqerror_mod = 33;
94#endif
95
96/*
97 * Allocate and initialize a dquot. We don't always allocate fresh memory;
98 * we try to reclaim a free dquot if the number of incore dquots is above
99 * a threshold.
100 * The only field inside the core that gets initialized at this point
101 * is the d_id field. The idea is to fill in the entire q_core
102 * when we read in the on disk dquot.
103 */
104xfs_dquot_t *
105xfs_qm_dqinit(
106 xfs_mount_t *mp,
107 xfs_dqid_t id,
108 uint type)
109{
110 xfs_dquot_t *dqp;
111 boolean_t brandnewdquot;
112
113 brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
114 dqp->dq_flags = type;
115 INT_SET(dqp->q_core.d_id, ARCH_CONVERT, id);
116 dqp->q_mount = mp;
117
118 /*
119 * No need to re-initialize these if this is a reclaimed dquot.
120 */
121 if (brandnewdquot) {
122 dqp->dq_flnext = dqp->dq_flprev = dqp;
123 mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq");
124 initnsema(&dqp->q_flock, 1, "fdq");
125 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
126
127#ifdef XFS_DQUOT_TRACE
128 dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
129 xfs_dqtrace_entry(dqp, "DQINIT");
130#endif
131 } else {
132 /*
133 * Only the q_core portion was zeroed in dqreclaim_one().
134 * So, we need to reset others.
135 */
136 dqp->q_nrefs = 0;
137 dqp->q_blkno = 0;
138 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
139 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
140 dqp->q_bufoffset = 0;
141 dqp->q_fileoffset = 0;
142 dqp->q_transp = NULL;
143 dqp->q_gdquot = NULL;
144 dqp->q_res_bcount = 0;
145 dqp->q_res_icount = 0;
146 dqp->q_res_rtbcount = 0;
147 dqp->q_pincount = 0;
148 dqp->q_hash = NULL;
149 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
150
151#ifdef XFS_DQUOT_TRACE
152 ASSERT(dqp->q_trace);
153 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
154#endif
155 }
156
157 /*
158 * log item gets initialized later
159 */
160 return (dqp);
161}
162
163/*
164 * This is called to free all the memory associated with a dquot
165 */
166void
167xfs_qm_dqdestroy(
168 xfs_dquot_t *dqp)
169{
170 ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
171
172 mutex_destroy(&dqp->q_qlock);
173 freesema(&dqp->q_flock);
174 sv_destroy(&dqp->q_pinwait);
175
176#ifdef XFS_DQUOT_TRACE
177 if (dqp->q_trace)
178 ktrace_free(dqp->q_trace);
179 dqp->q_trace = NULL;
180#endif
181 kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
182 atomic_dec(&xfs_Gqm->qm_totaldquots);
183}
184
185/*
186 * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
187 */
188STATIC void
189xfs_qm_dqinit_core(
190 xfs_dqid_t id,
191 uint type,
192 xfs_dqblk_t *d)
193{
194 /*
195 * Caller has zero'd the entire dquot 'chunk' already.
196 */
197 INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
198 INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
199 INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
200 INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
201}
202
203
204#ifdef XFS_DQUOT_TRACE
205/*
206 * Dquot tracing for debugging.
207 */
208/* ARGSUSED */
209void
210__xfs_dqtrace_entry(
211 xfs_dquot_t *dqp,
212 char *func,
213 void *retaddr,
214 xfs_inode_t *ip)
215{
216 xfs_dquot_t *udqp = NULL;
217 xfs_ino_t ino = 0;
218
219 ASSERT(dqp->q_trace);
220 if (ip) {
221 ino = ip->i_ino;
222 udqp = ip->i_udquot;
223 }
224 ktrace_enter(dqp->q_trace,
225 (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
226 (void *)func,
227 (void *)(__psint_t)dqp->q_nrefs,
228 (void *)(__psint_t)dqp->dq_flags,
229 (void *)(__psint_t)dqp->q_res_bcount,
230 (void *)(__psint_t)INT_GET(dqp->q_core.d_bcount,
231 ARCH_CONVERT),
232 (void *)(__psint_t)INT_GET(dqp->q_core.d_icount,
233 ARCH_CONVERT),
234 (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_hardlimit,
235 ARCH_CONVERT),
236 (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_softlimit,
237 ARCH_CONVERT),
238 (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_hardlimit,
239 ARCH_CONVERT),
240 (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_softlimit,
241 ARCH_CONVERT),
242 (void *)(__psint_t)INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
243 (void *)(__psint_t)current_pid(),
244 (void *)(__psint_t)ino,
245 (void *)(__psint_t)retaddr,
246 (void *)(__psint_t)udqp);
247 return;
248}
249#endif
250
251
252/*
253 * If default limits are in force, push them into the dquot now.
254 * We overwrite the dquot limits only if they are zero and this
255 * is not the root dquot.
256 */
257void
258xfs_qm_adjust_dqlimits(
259 xfs_mount_t *mp,
260 xfs_disk_dquot_t *d)
261{
262 xfs_quotainfo_t *q = mp->m_quotainfo;
263
264 ASSERT(d->d_id);
265
266 if (q->qi_bsoftlimit && !d->d_blk_softlimit)
267 INT_SET(d->d_blk_softlimit, ARCH_CONVERT, q->qi_bsoftlimit);
268 if (q->qi_bhardlimit && !d->d_blk_hardlimit)
269 INT_SET(d->d_blk_hardlimit, ARCH_CONVERT, q->qi_bhardlimit);
270 if (q->qi_isoftlimit && !d->d_ino_softlimit)
271 INT_SET(d->d_ino_softlimit, ARCH_CONVERT, q->qi_isoftlimit);
272 if (q->qi_ihardlimit && !d->d_ino_hardlimit)
273 INT_SET(d->d_ino_hardlimit, ARCH_CONVERT, q->qi_ihardlimit);
274 if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
275 INT_SET(d->d_rtb_softlimit, ARCH_CONVERT, q->qi_rtbsoftlimit);
276 if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
277 INT_SET(d->d_rtb_hardlimit, ARCH_CONVERT, q->qi_rtbhardlimit);
278}
279
280/*
281 * Check the limits and timers of a dquot and start or reset timers
282 * if necessary.
283 * This gets called even when quota enforcement is OFF, which makes our
284 * life a little less complicated. (We just don't reject any quota
285 * reservations in that case.)
286 * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
287 * enforcement's off.
288 * In contrast, warnings are a little different in that they don't
289 * 'automatically' get started when limits get exceeded.
290 */
291void
292xfs_qm_adjust_dqtimers(
293 xfs_mount_t *mp,
294 xfs_disk_dquot_t *d)
295{
296 ASSERT(d->d_id);
297
298#ifdef QUOTADEBUG
299 if (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT))
300 ASSERT(INT_GET(d->d_blk_softlimit, ARCH_CONVERT) <=
301 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT));
302 if (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT))
303 ASSERT(INT_GET(d->d_ino_softlimit, ARCH_CONVERT) <=
304 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT));
305 if (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT))
306 ASSERT(INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) <=
307 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT));
308#endif
309 if (!d->d_btimer) {
310 if ((INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
311 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
312 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) ||
313 (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT) &&
314 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
315 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
316 INT_SET(d->d_btimer, ARCH_CONVERT,
317 get_seconds() + XFS_QI_BTIMELIMIT(mp));
318 }
319 } else {
320 if ((!d->d_blk_softlimit ||
321 (INT_GET(d->d_bcount, ARCH_CONVERT) <
322 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) &&
323 (!d->d_blk_hardlimit ||
324 (INT_GET(d->d_bcount, ARCH_CONVERT) <
325 INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
326 d->d_btimer = 0;
327 }
328 }
329
330 if (!d->d_itimer) {
331 if ((INT_GET(d->d_ino_softlimit, ARCH_CONVERT) &&
332 (INT_GET(d->d_icount, ARCH_CONVERT) >=
333 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) ||
334 (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT) &&
335 (INT_GET(d->d_icount, ARCH_CONVERT) >=
336 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
337 INT_SET(d->d_itimer, ARCH_CONVERT,
338 get_seconds() + XFS_QI_ITIMELIMIT(mp));
339 }
340 } else {
341 if ((!d->d_ino_softlimit ||
342 (INT_GET(d->d_icount, ARCH_CONVERT) <
343 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) &&
344 (!d->d_ino_hardlimit ||
345 (INT_GET(d->d_icount, ARCH_CONVERT) <
346 INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
347 d->d_itimer = 0;
348 }
349 }
350
351 if (!d->d_rtbtimer) {
352 if ((INT_GET(d->d_rtb_softlimit, ARCH_CONVERT) &&
353 (INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
354 INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) ||
355 (INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT) &&
356 (INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
357 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) {
358 INT_SET(d->d_rtbtimer, ARCH_CONVERT,
359 get_seconds() + XFS_QI_RTBTIMELIMIT(mp));
360 }
361 } else {
362 if ((!d->d_rtb_softlimit ||
363 (INT_GET(d->d_rtbcount, ARCH_CONVERT) <
364 INT_GET(d->d_rtb_softlimit, ARCH_CONVERT))) &&
365 (!d->d_rtb_hardlimit ||
366 (INT_GET(d->d_rtbcount, ARCH_CONVERT) <
367 INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) {
368 d->d_rtbtimer = 0;
369 }
370 }
371}
372
373/*
374 * Increment or reset warnings of a given dquot.
375 */
376int
377xfs_qm_dqwarn(
378 xfs_disk_dquot_t *d,
379 uint flags)
380{
381 int warned;
382
383 /*
384 * root's limits are not real limits.
385 */
386 if (!d->d_id)
387 return (0);
388
389 warned = 0;
390 if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
391 (INT_GET(d->d_bcount, ARCH_CONVERT) >=
392 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
393 if (flags & XFS_QMOPT_DOWARN) {
394 INT_MOD(d->d_bwarns, ARCH_CONVERT, +1);
395 warned++;
396 }
397 } else {
398 if (!d->d_blk_softlimit ||
399 (INT_GET(d->d_bcount, ARCH_CONVERT) <
400 INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
401 d->d_bwarns = 0;
402 }
403 }
404
405 if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 &&
406 (INT_GET(d->d_icount, ARCH_CONVERT) >=
407 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
408 if (flags & XFS_QMOPT_DOWARN) {
409 INT_MOD(d->d_iwarns, ARCH_CONVERT, +1);
410 warned++;
411 }
412 } else {
413 if (!d->d_ino_softlimit ||
414 (INT_GET(d->d_icount, ARCH_CONVERT) <
415 INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
416 d->d_iwarns = 0;
417 }
418 }
419#ifdef QUOTADEBUG
420 if (INT_GET(d->d_iwarns, ARCH_CONVERT))
421 cmn_err(CE_DEBUG,
422 "--------@@Inode warnings running : %Lu >= %Lu",
423 INT_GET(d->d_icount, ARCH_CONVERT),
424 INT_GET(d->d_ino_softlimit, ARCH_CONVERT));
425 if (INT_GET(d->d_bwarns, ARCH_CONVERT))
426 cmn_err(CE_DEBUG,
427 "--------@@Blks warnings running : %Lu >= %Lu",
428 INT_GET(d->d_bcount, ARCH_CONVERT),
429 INT_GET(d->d_blk_softlimit, ARCH_CONVERT));
430#endif
431 return (warned);
432}
433
434
435/*
436 * initialize a buffer full of dquots and log the whole thing
437 */
438STATIC void
439xfs_qm_init_dquot_blk(
440 xfs_trans_t *tp,
441 xfs_mount_t *mp,
442 xfs_dqid_t id,
443 uint type,
444 xfs_buf_t *bp)
445{
446 xfs_dqblk_t *d;
447 int curid, i;
448
449 ASSERT(tp);
450 ASSERT(XFS_BUF_ISBUSY(bp));
451 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
452
453 d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
454
455 /*
456	 * ID of the first dquot in the block - ids are zero based.
457 */
458 curid = id - (id % XFS_QM_DQPERBLK(mp));
459 ASSERT(curid >= 0);
460 memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
461 for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
462 xfs_qm_dqinit_core(curid, type, d);
463 xfs_trans_dquot_buf(tp, bp,
464 type & XFS_DQ_USER ?
465 XFS_BLI_UDQUOT_BUF :
466 XFS_BLI_GDQUOT_BUF);
467 xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
468}
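
To make the chunk arithmetic above concrete (illustrative numbers only): assuming XFS_QM_DQPERBLK(mp) == 30, a request for id 70 yields curid = 70 - (70 % 30) = 60, so the loop stamps on-disk dquots with ids 60 through 89 into the buffer; the dquot for id 70 later sits at buffer offset (70 % 30) * sizeof(xfs_dqblk_t), which is how xfs_qm_dqtobp() computes q_bufoffset below.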
469
470
471
472/*
473 * Allocate a block and fill it with dquots.
474 * This is called when the bmapi finds a hole.
475 */
476STATIC int
477xfs_qm_dqalloc(
478 xfs_trans_t *tp,
479 xfs_mount_t *mp,
480 xfs_dquot_t *dqp,
481 xfs_inode_t *quotip,
482 xfs_fileoff_t offset_fsb,
483 xfs_buf_t **O_bpp)
484{
485 xfs_fsblock_t firstblock;
486 xfs_bmap_free_t flist;
487 xfs_bmbt_irec_t map;
488 int nmaps, error, committed;
489 xfs_buf_t *bp;
490
491 ASSERT(tp != NULL);
492 xfs_dqtrace_entry(dqp, "DQALLOC");
493
494 /*
495 * Initialize the bmap freelist prior to calling bmapi code.
496 */
497 XFS_BMAP_INIT(&flist, &firstblock);
498 xfs_ilock(quotip, XFS_ILOCK_EXCL);
499 /*
500	 * Return if this type of quota is turned off while we didn't
501	 * have the inode lock
502 */
503 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
504 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
505 return (ESRCH);
506 }
507
508 /*
509 * xfs_trans_commit normally decrements the vnode ref count
510 * when it unlocks the inode. Since we want to keep the quota
511 * inode around, we bump the vnode ref count now.
512 */
513 VN_HOLD(XFS_ITOV(quotip));
514
515 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
516 nmaps = 1;
517 if ((error = xfs_bmapi(tp, quotip,
518 offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
519 XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
520 &firstblock,
521 XFS_QM_DQALLOC_SPACE_RES(mp),
522 &map, &nmaps, &flist))) {
523 goto error0;
524 }
525 ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
526 ASSERT(nmaps == 1);
527 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
528 (map.br_startblock != HOLESTARTBLOCK));
529
530 /*
531 * Keep track of the blkno to save a lookup later
532 */
533 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
534
535 /* now we can just get the buffer (there's nothing to read yet) */
536 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
537 dqp->q_blkno,
538 XFS_QI_DQCHUNKLEN(mp),
539 0);
540 if (!bp || (error = XFS_BUF_GETERROR(bp)))
541 goto error1;
542 /*
543 * Make a chunk of dquots out of this buffer and log
544 * the entire thing.
545 */
546 xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
547 dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
548 bp);
549
550 if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
551 goto error1;
552 }
553
554 *O_bpp = bp;
555 return 0;
556
557 error1:
558 xfs_bmap_cancel(&flist);
559 error0:
560 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
561
562 return (error);
563}
564
565/*
566 * Maps a dquot to the buffer containing its on-disk version.
567 * This returns a ptr to the buffer containing the on-disk dquot
568 * in the bpp param, and a ptr to the on-disk dquot within that buffer
569 */
570STATIC int
571xfs_qm_dqtobp(
572 xfs_trans_t *tp,
573 xfs_dquot_t *dqp,
574 xfs_disk_dquot_t **O_ddpp,
575 xfs_buf_t **O_bpp,
576 uint flags)
577{
578 xfs_bmbt_irec_t map;
579 int nmaps, error;
580 xfs_buf_t *bp;
581 xfs_inode_t *quotip;
582 xfs_mount_t *mp;
583 xfs_disk_dquot_t *ddq;
584 xfs_dqid_t id;
585 boolean_t newdquot;
586
587 mp = dqp->q_mount;
588 id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
589 nmaps = 1;
590 newdquot = B_FALSE;
591
592 /*
593 * If we don't know where the dquot lives, find out.
594 */
595 if (dqp->q_blkno == (xfs_daddr_t) 0) {
596 /* We use the id as an index */
597 dqp->q_fileoffset = (xfs_fileoff_t) ((uint)id /
598 XFS_QM_DQPERBLK(mp));
599 nmaps = 1;
600 quotip = XFS_DQ_TO_QIP(dqp);
601 xfs_ilock(quotip, XFS_ILOCK_SHARED);
602 /*
603		 * Return if this type of quota is turned off while we didn't
604		 * have the inode lock
605 */
606 if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
607 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
608 return (ESRCH);
609 }
610 /*
611 * Find the block map; no allocations yet
612 */
613 error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
614 XFS_DQUOT_CLUSTER_SIZE_FSB,
615 XFS_BMAPI_METADATA,
616 NULL, 0, &map, &nmaps, NULL);
617
618 xfs_iunlock(quotip, XFS_ILOCK_SHARED);
619 if (error)
620 return (error);
621 ASSERT(nmaps == 1);
622 ASSERT(map.br_blockcount == 1);
623
624 /*
625		 * offset of dquot in the (fixed-size) dquot chunk.
626 */
627 dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
628 sizeof(xfs_dqblk_t);
629 if (map.br_startblock == HOLESTARTBLOCK) {
630 /*
631 * We don't allocate unless we're asked to
632 */
633 if (!(flags & XFS_QMOPT_DQALLOC))
634 return (ENOENT);
635
636 ASSERT(tp);
637 if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
638 dqp->q_fileoffset, &bp)))
639 return (error);
640 newdquot = B_TRUE;
641 } else {
642 /*
643 * store the blkno etc so that we don't have to do the
644 * mapping all the time
645 */
646 dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
647 }
648 }
649 ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
650 ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
651
652 /*
653 * Read in the buffer, unless we've just done the allocation
654 * (in which case we already have the buf).
655 */
656 if (! newdquot) {
657 xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
658 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
659 dqp->q_blkno,
660 XFS_QI_DQCHUNKLEN(mp),
661 0, &bp))) {
662 return (error);
663 }
664 if (error || !bp)
665 return XFS_ERROR(error);
666 }
667 ASSERT(XFS_BUF_ISBUSY(bp));
668 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
669
670 /*
671 * calculate the location of the dquot inside the buffer.
672 */
673 ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
674
675 /*
676 * A simple sanity check in case we got a corrupted dquot...
677 */
678 if (xfs_qm_dqcheck(ddq, id,
679 dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
680 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
681 "dqtobp")) {
682 if (!(flags & XFS_QMOPT_DQREPAIR)) {
683 xfs_trans_brelse(tp, bp);
684 return XFS_ERROR(EIO);
685 }
686 XFS_BUF_BUSY(bp); /* We dirtied this */
687 }
688
689 *O_bpp = bp;
690 *O_ddpp = ddq;
691
692 return (0);
693}
694
695
696/*
697 * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
698 * and release the buffer immediately.
699 *
700 */
701/* ARGSUSED */
702STATIC int
703xfs_qm_dqread(
704 xfs_trans_t *tp,
705 xfs_dqid_t id,
706 xfs_dquot_t *dqp, /* dquot to get filled in */
707 uint flags)
708{
709 xfs_disk_dquot_t *ddqp;
710 xfs_buf_t *bp;
711 int error;
712
713 /*
714 * get a pointer to the on-disk dquot and the buffer containing it
715 * dqp already knows its own type (GROUP/USER).
716 */
717 xfs_dqtrace_entry(dqp, "DQREAD");
718 if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
719 return (error);
720 }
721
722 /* copy everything from disk dquot to the incore dquot */
723 memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
724 ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
725 xfs_qm_dquot_logitem_init(dqp);
726
727 /*
728 * Reservation counters are defined as reservation plus current usage
729	 * to avoid having to add them every time.
730 */
731 dqp->q_res_bcount = INT_GET(ddqp->d_bcount, ARCH_CONVERT);
732 dqp->q_res_icount = INT_GET(ddqp->d_icount, ARCH_CONVERT);
733 dqp->q_res_rtbcount = INT_GET(ddqp->d_rtbcount, ARCH_CONVERT);
734
735 /* Mark the buf so that this will stay incore a little longer */
736 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
737
738 /*
739	 * We got the buffer with an xfs_trans_read_buf() (in dqtobp()),
740	 * so we need to release it with xfs_trans_brelse().
741 * The strategy here is identical to that of inodes; we lock
742 * the dquot in xfs_qm_dqget() before making it accessible to
743 * others. This is because dquots, like inodes, need a good level of
744 * concurrency, and we don't want to take locks on the entire buffers
745 * for dquot accesses.
746 * Note also that the dquot buffer may even be dirty at this point, if
747 * this particular dquot was repaired. We still aren't afraid to
748 * brelse it because we have the changes incore.
749 */
750 ASSERT(XFS_BUF_ISBUSY(bp));
751 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
752 xfs_trans_brelse(tp, bp);
753
754 return (error);
755}
756
757
758/*
759 * allocate an incore dquot from the kernel heap,
760 * and fill its core with quota information kept on disk.
761 * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
762 * if it wasn't already allocated.
763 */
764STATIC int
765xfs_qm_idtodq(
766 xfs_mount_t *mp,
767 xfs_dqid_t id, /* gid or uid, depending on type */
768 uint type, /* UDQUOT or GDQUOT */
769 uint flags, /* DQALLOC, DQREPAIR */
770 xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */
771{
772 xfs_dquot_t *dqp;
773 int error;
774 xfs_trans_t *tp;
775 int cancelflags=0;
776
777 dqp = xfs_qm_dqinit(mp, id, type);
778 tp = NULL;
779 if (flags & XFS_QMOPT_DQALLOC) {
780 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
781 if ((error = xfs_trans_reserve(tp,
782 XFS_QM_DQALLOC_SPACE_RES(mp),
783 XFS_WRITE_LOG_RES(mp) +
784 BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
785 128,
786 0,
787 XFS_TRANS_PERM_LOG_RES,
788 XFS_WRITE_LOG_COUNT))) {
789 cancelflags = 0;
790 goto error0;
791 }
792 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
793 }
794
795 /*
796 * Read it from disk; xfs_dqread() takes care of
797 * all the necessary initialization of dquot's fields (locks, etc)
798 */
799 if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
800 /*
801 * This can happen if quotas got turned off (ESRCH),
802 * or if the dquot didn't exist on disk and we ask to
803 * allocate (ENOENT).
804 */
805 xfs_dqtrace_entry(dqp, "DQREAD FAIL");
806 cancelflags |= XFS_TRANS_ABORT;
807 goto error0;
808 }
809 if (tp) {
810 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
811 NULL)))
812 goto error1;
813 }
814
815 *O_dqpp = dqp;
816 return (0);
817
818 error0:
819 ASSERT(error);
820 if (tp)
821 xfs_trans_cancel(tp, cancelflags);
822 error1:
823 xfs_qm_dqdestroy(dqp);
824 *O_dqpp = NULL;
825 return (error);
826}
827
828/*
829 * Lookup a dquot in the incore dquot hashtable. We keep two separate
830 * hashtables for user and group dquots; these are global tables
831 * inside the XQM, not per-filesystem tables.
832 * The hash chain must be locked by caller, and it is left locked
833 * on return. The returned dquot is locked.
834 */
835STATIC int
836xfs_qm_dqlookup(
837 xfs_mount_t *mp,
838 xfs_dqid_t id,
839 xfs_dqhash_t *qh,
840 xfs_dquot_t **O_dqpp)
841{
842 xfs_dquot_t *dqp;
843 uint flist_locked;
844 xfs_dquot_t *d;
845
846 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
847
848 flist_locked = B_FALSE;
849
850 /*
851 * Traverse the hashchain looking for a match
852 */
853 for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
854 /*
855 * We already have the hashlock. We don't need the
856 * dqlock to look at the id field of the dquot, since the
857 * id can't be modified without the hashlock anyway.
858 */
859 if (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id && dqp->q_mount == mp) {
860 xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
861 /*
862 * All in core dquots must be on the dqlist of mp
863 */
864 ASSERT(dqp->MPL_PREVP != NULL);
865
866 xfs_dqlock(dqp);
867 if (dqp->q_nrefs == 0) {
868 ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
869 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
870 xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
871
872 /*
873 * We may have raced with dqreclaim_one()
874 * (and lost). So, flag that we don't
875 * want the dquot to be reclaimed.
876 */
877 dqp->dq_flags |= XFS_DQ_WANT;
878 xfs_dqunlock(dqp);
879 xfs_qm_freelist_lock(xfs_Gqm);
880 xfs_dqlock(dqp);
881 dqp->dq_flags &= ~(XFS_DQ_WANT);
882 }
883 flist_locked = B_TRUE;
884 }
885
886 /*
887 * id couldn't have changed; we had the hashlock all
888 * along
889 */
890 ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
891
892 if (flist_locked) {
893 if (dqp->q_nrefs != 0) {
894 xfs_qm_freelist_unlock(xfs_Gqm);
895 flist_locked = B_FALSE;
896 } else {
897 /*
898 * take it off the freelist
899 */
900 xfs_dqtrace_entry(dqp,
901 "DQLOOKUP: TAKEOFF FL");
902 XQM_FREELIST_REMOVE(dqp);
903 /* xfs_qm_freelist_print(&(xfs_Gqm->
904 qm_dqfreelist),
905 "after removal"); */
906 }
907 }
908
909 /*
910 * grab a reference
911 */
912 XFS_DQHOLD(dqp);
913
914 if (flist_locked)
915 xfs_qm_freelist_unlock(xfs_Gqm);
916 /*
917 * move the dquot to the front of the hashchain
918 */
919 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
920 if (dqp->HL_PREVP != &qh->qh_next) {
921 xfs_dqtrace_entry(dqp,
922 "DQLOOKUP: HASH MOVETOFRONT");
923 if ((d = dqp->HL_NEXT))
924 d->HL_PREVP = dqp->HL_PREVP;
925 *(dqp->HL_PREVP) = d;
926 d = qh->qh_next;
927 d->HL_PREVP = &dqp->HL_NEXT;
928 dqp->HL_NEXT = d;
929 dqp->HL_PREVP = &qh->qh_next;
930 qh->qh_next = dqp;
931 }
932 xfs_dqtrace_entry(dqp, "LOOKUP END");
933 *O_dqpp = dqp;
934 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
935 return (0);
936 }
937 }
938
939 *O_dqpp = NULL;
940 ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
941 return (1);
942}
943
944/*
945 * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
946 * locked dquot, doing an allocation (if requested) as needed.
947 * When both an inode and an id are given, the inode's id takes precedence.
948 * That is, if the id changes while we don't hold the ilock inside this
949 * function, the new dquot is returned, not necessarily the one requested
950 * in the id argument.
951 */
952int
953xfs_qm_dqget(
954 xfs_mount_t *mp,
955 xfs_inode_t *ip, /* locked inode (optional) */
956 xfs_dqid_t id, /* gid or uid, depending on type */
957 uint type, /* UDQUOT or GDQUOT */
958 uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
959 xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
960{
961 xfs_dquot_t *dqp;
962 xfs_dqhash_t *h;
963 uint version;
964 int error;
965
966 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
967 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
968 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
969 return (ESRCH);
970 }
971 h = XFS_DQ_HASH(mp, id, type);
972
973#ifdef DEBUG
974 if (xfs_do_dqerror) {
975 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
976 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
977 cmn_err(CE_DEBUG, "Returning error in dqget");
978 return (EIO);
979 }
980 }
981#endif
982
983 again:
984
985#ifdef DEBUG
986 ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP);
987 if (ip) {
988 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
989 if (type == XFS_DQ_USER)
990 ASSERT(ip->i_udquot == NULL);
991 else
992 ASSERT(ip->i_gdquot == NULL);
993 }
994#endif
995 XFS_DQ_HASH_LOCK(h);
996
997 /*
998 * Look in the cache (hashtable).
999 * The chain is kept locked during lookup.
1000 */
1001 if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
1002 XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
1003 /*
1004 * The dquot was found, moved to the front of the chain,
1005 * taken off the freelist if it was on it, and locked
1006 * at this point. Just unlock the hashchain and return.
1007 */
1008 ASSERT(*O_dqpp);
1009 ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
1010 XFS_DQ_HASH_UNLOCK(h);
1011 xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
1012 return (0); /* success */
1013 }
1014 XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
1015
1016 /*
1017 * Dquot cache miss. We don't want to keep the inode lock across
1018 * a (potential) disk read. Also we don't want to deal with the lock
1019 * ordering between quotainode and this inode. OTOH, dropping the inode
1020 * lock here means dealing with a chown that can happen before
1021 * we re-acquire the lock.
1022 */
1023 if (ip)
1024 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1025 /*
1026 * Save the hashchain version stamp, and unlock the chain, so that
1027 * we don't keep the lock across a disk read
1028 */
1029 version = h->qh_version;
1030 XFS_DQ_HASH_UNLOCK(h);
1031
1032 /*
1033 * Allocate the dquot on the kernel heap, and read the ondisk
1034 * portion off the disk. Also, do all the necessary initialization
1035 * This can return ENOENT if dquot didn't exist on disk and we didn't
1036 * ask it to allocate; ESRCH if quotas got turned off suddenly.
1037 */
1038 if ((error = xfs_qm_idtodq(mp, id, type,
1039 flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
1040 XFS_QMOPT_DOWARN),
1041 &dqp))) {
1042 if (ip)
1043 xfs_ilock(ip, XFS_ILOCK_EXCL);
1044 return (error);
1045 }
1046
1047 /*
1048 * See if this is mount code calling to look at the overall quota limits
1049 * which are stored in the id == 0 user or group's dquot.
1050 * Since we may not have done a quotacheck by this point, just return
1051 * the dquot without attaching it to any hashtables, lists, etc, or even
1052 * taking a reference.
1053 * The caller must dqdestroy this once done.
1054 */
1055 if (flags & XFS_QMOPT_DQSUSER) {
1056 ASSERT(id == 0);
1057 ASSERT(! ip);
1058 goto dqret;
1059 }
1060
1061 /*
1062 * Dquot lock comes after hashlock in the lock ordering
1063 */
1064 if (ip) {
1065 xfs_ilock(ip, XFS_ILOCK_EXCL);
1066 if (! XFS_IS_DQTYPE_ON(mp, type)) {
1067 /* inode stays locked on return */
1068 xfs_qm_dqdestroy(dqp);
1069 return XFS_ERROR(ESRCH);
1070 }
1071 /*
1072 * A dquot could be attached to this inode by now, since
1073 * we had dropped the ilock.
1074 */
1075 if (type == XFS_DQ_USER) {
1076 if (ip->i_udquot) {
1077 xfs_qm_dqdestroy(dqp);
1078 dqp = ip->i_udquot;
1079 xfs_dqlock(dqp);
1080 goto dqret;
1081 }
1082 } else {
1083 if (ip->i_gdquot) {
1084 xfs_qm_dqdestroy(dqp);
1085 dqp = ip->i_gdquot;
1086 xfs_dqlock(dqp);
1087 goto dqret;
1088 }
1089 }
1090 }
1091
1092 /*
1093 * Hashlock comes after ilock in lock order
1094 */
1095 XFS_DQ_HASH_LOCK(h);
1096 if (version != h->qh_version) {
1097 xfs_dquot_t *tmpdqp;
1098 /*
1099 * Now, see if somebody else put the dquot in the
1100 * hashtable before us. This can happen because we didn't
1101 * keep the hashchain lock. We don't have to worry about
1102 * lock order between the two dquots here since dqp isn't
1103 * on any findable lists yet.
1104 */
1105 if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
1106 /*
1107 * Duplicate found. Just throw away the new dquot
1108 * and start over.
1109 */
1110 xfs_qm_dqput(tmpdqp);
1111 XFS_DQ_HASH_UNLOCK(h);
1112 xfs_qm_dqdestroy(dqp);
1113 XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
1114 goto again;
1115 }
1116 }
1117
1118 /*
1119 * Put the dquot at the beginning of the hash-chain and mp's list
1120 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
1121 */
1122 ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
1123 dqp->q_hash = h;
1124 XQM_HASHLIST_INSERT(h, dqp);
1125
1126 /*
1127 * Attach this dquot to this filesystem's list of all dquots,
1128 * kept inside the mount structure in m_quotainfo field
1129 */
1130 xfs_qm_mplist_lock(mp);
1131
1132 /*
1133 * We return a locked dquot to the caller, with a reference taken
1134 */
1135 xfs_dqlock(dqp);
1136 dqp->q_nrefs = 1;
1137
1138 XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
1139
1140 xfs_qm_mplist_unlock(mp);
1141 XFS_DQ_HASH_UNLOCK(h);
1142 dqret:
1143 ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
1144 xfs_dqtrace_entry(dqp, "DQGET DONE");
1145 *O_dqpp = dqp;
1146 return (0);
1147}
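
The hashchain version stamp used above is a general optimistic-revalidation pattern. A hedged distillation follows; example_versioned_insert() is hypothetical, while the locking and insert macros are the real ones from this commit.

/*
 * Sketch only: stamp the chain, do slow work unlocked, and revalidate
 * the stamp before trusting the earlier (negative) lookup.
 */
STATIC int
example_versioned_insert(
	xfs_dqhash_t	*h,
	xfs_dquot_t	*dqp)
{
	uint		version;

	XFS_DQ_HASH_LOCK(h);
	version = h->qh_version;	/* remember the chain version */
	XFS_DQ_HASH_UNLOCK(h);

	/* ... slow work (e.g. a disk read) with the chain unlocked ... */

	XFS_DQ_HASH_LOCK(h);
	if (version != h->qh_version) {
		/*
		 * The chain changed while unlocked: a duplicate of
		 * this id may have been inserted, so re-run the
		 * lookup before inserting (as dqget does above).
		 */
	}
	XQM_HASHLIST_INSERT(h, dqp);
	XFS_DQ_HASH_UNLOCK(h);
	return (0);
}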
1148
1149
1150/*
1151 * Release a reference to the dquot (decrement ref-count)
1152 * and unlock it. If there is a group quota attached to this
1153 * dquot, carefully release that too without tripping over
1154 * deadlocks'n'stuff.
1155 */
1156void
1157xfs_qm_dqput(
1158 xfs_dquot_t *dqp)
1159{
1160 xfs_dquot_t *gdqp;
1161
1162 ASSERT(dqp->q_nrefs > 0);
1163 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1164 xfs_dqtrace_entry(dqp, "DQPUT");
1165
1166 if (dqp->q_nrefs != 1) {
1167 dqp->q_nrefs--;
1168 xfs_dqunlock(dqp);
1169 return;
1170 }
1171
1172 /*
1173 * drop the dqlock and acquire the freelist and dqlock
1174 * in the right order; but try to get it out-of-order first
1175 */
1176 if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
1177 xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
1178 xfs_dqunlock(dqp);
1179 xfs_qm_freelist_lock(xfs_Gqm);
1180 xfs_dqlock(dqp);
1181 }
1182
1183 while (1) {
1184 gdqp = NULL;
1185
1186 /* We can't depend on nrefs being == 1 here */
1187 if (--dqp->q_nrefs == 0) {
1188 xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
1189 /*
1190 * insert at end of the freelist.
1191 */
1192 XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
1193
1194 /*
1195 * If we just added a udquot to the freelist, then
1196 * we want to release the gdquot reference that
1197 * it (probably) has. Otherwise it'll keep the
1198 * gdquot from getting reclaimed.
1199 */
1200 if ((gdqp = dqp->q_gdquot)) {
1201 /*
1202 * Avoid a recursive dqput call
1203 */
1204 xfs_dqlock(gdqp);
1205 dqp->q_gdquot = NULL;
1206 }
1207
1208 /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
1209 "@@@@@++ Free list (after append) @@@@@+");
1210 */
1211 }
1212 xfs_dqunlock(dqp);
1213
1214 /*
1215 * If we had a group quota inside the user quota as a hint,
1216 * release it now.
1217 */
1218 if (! gdqp)
1219 break;
1220 dqp = gdqp;
1221 }
1222 xfs_qm_freelist_unlock(xfs_Gqm);
1223}
1224
1225/*
1226 * Release a dquot. Flush it if dirty, then dqput() it.
1227 * dquot must not be locked.
1228 */
1229void
1230xfs_qm_dqrele(
1231 xfs_dquot_t *dqp)
1232{
1233 ASSERT(dqp);
1234 xfs_dqtrace_entry(dqp, "DQRELE");
1235
1236 xfs_dqlock(dqp);
1237 /*
1238 * We don't care to flush it if the dquot is dirty here.
1239 * That will create stutters that we want to avoid.
1240 * Instead we do a delayed write when we try to reclaim
1241 * a dirty dquot. Also xfs_sync will take part of the burden...
1242 */
1243 xfs_qm_dqput(dqp);
1244}
1245
1246
1247/*
1248 * Write a modified dquot to disk.
1249 * The dquot must be locked, and the flush lock taken by the caller.
1250 * The flush lock will not be unlocked until the dquot reaches the disk,
1251 * but the dquot is free to be unlocked and modified by the caller
1252 * in the interim. Dquot is still locked on return. This behavior is
1253 * identical to that of inodes.
1254 */
1255int
1256xfs_qm_dqflush(
1257 xfs_dquot_t *dqp,
1258 uint flags)
1259{
1260 xfs_mount_t *mp;
1261 xfs_buf_t *bp;
1262 xfs_disk_dquot_t *ddqp;
1263 int error;
1264 SPLDECL(s);
1265
1266 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1267 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
1268 xfs_dqtrace_entry(dqp, "DQFLUSH");
1269
1270 /*
1271 * If not dirty, nada.
1272 */
1273 if (!XFS_DQ_IS_DIRTY(dqp)) {
1274 xfs_dqfunlock(dqp);
1275 return (0);
1276 }
1277
1278 /*
1279	 * Can't flush a pinned dquot. Wait for it.
1280 */
1281 xfs_qm_dqunpin_wait(dqp);
1282
1283 /*
1284 * This may have been unpinned because the filesystem is shutting
1285 * down forcibly. If that's the case we must not write this dquot
1286 * to disk, because the log record didn't make it to disk!
1287 */
1288 if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
1289 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1290 xfs_dqfunlock(dqp);
1291 return XFS_ERROR(EIO);
1292 }
1293
1294 /*
1295 * Get the buffer containing the on-disk dquot
1296 * We don't need a transaction envelope because we know that the
1297	 * ondisk dquot has already been allocated.
1298 */
1299 if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
1300 xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
1301 ASSERT(error != ENOENT);
1302 /*
1303 * Quotas could have gotten turned off (ESRCH)
1304 */
1305 xfs_dqfunlock(dqp);
1306 return (error);
1307 }
1308
1309 if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), 0, XFS_QMOPT_DOWARN,
1310 "dqflush (incore copy)")) {
1311 xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
1312 return XFS_ERROR(EIO);
1313 }
1314
1315 /* This is the only portion of data that needs to persist */
1316 memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t));
1317
1318 /*
1319 * Clear the dirty field and remember the flush lsn for later use.
1320 */
1321 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1322 mp = dqp->q_mount;
1323
1324 /* lsn is 64 bits */
1325 AIL_LOCK(mp, s);
1326 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1327 AIL_UNLOCK(mp, s);
1328
1329 /*
1330 * Attach an iodone routine so that we can remove this dquot from the
1331 * AIL and release the flush lock once the dquot is synced to disk.
1332 */
1333 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
1334 xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
1335 /*
1336 * If the buffer is pinned then push on the log so we won't
1337 * get stuck waiting in the write for too long.
1338 */
1339 if (XFS_BUF_ISPINNED(bp)) {
1340 xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
1341 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
1342 }
1343
1344 if (flags & XFS_QMOPT_DELWRI) {
1345 xfs_bdwrite(mp, bp);
1346 } else if (flags & XFS_QMOPT_ASYNC) {
1347 xfs_bawrite(mp, bp);
1348 } else {
1349 error = xfs_bwrite(mp, bp);
1350 }
1351 xfs_dqtrace_entry(dqp, "DQFLUSH END");
1352 /*
1353 * dqp is still locked, but caller is free to unlock it now.
1354 */
1355 return (error);
1356
1357}
1358
1359/*
1360 * This is the dquot flushing I/O completion routine. It is called
1361 * from interrupt level when the buffer containing the dquot is
1362 * flushed to disk. It is responsible for removing the dquot logitem
1363 * from the AIL if it has not been re-logged, and unlocking the dquot's
1364 * flush lock. This behavior is very similar to that of inodes.
1365 */
1366/*ARGSUSED*/
1367STATIC void
1368xfs_qm_dqflush_done(
1369 xfs_buf_t *bp,
1370 xfs_dq_logitem_t *qip)
1371{
1372 xfs_dquot_t *dqp;
1373 SPLDECL(s);
1374
1375 dqp = qip->qli_dquot;
1376
1377 /*
1378 * We only want to pull the item from the AIL if its
1379 * location in the log has not changed since we started the flush.
1380 * Thus, we only bother if the dquot's lsn has
1381 * not changed. First we check the lsn outside the lock
1382 * since it's cheaper, and then we recheck while
1383 * holding the lock before removing the dquot from the AIL.
1384 */
1385 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1386 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1387
1388 AIL_LOCK(dqp->q_mount, s);
1389 /*
1390 * xfs_trans_delete_ail() drops the AIL lock.
1391 */
1392 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1393 xfs_trans_delete_ail(dqp->q_mount,
1394 (xfs_log_item_t*)qip, s);
1395 else
1396 AIL_UNLOCK(dqp->q_mount, s);
1397 }
1398
1399 /*
1400 * Release the dq's flush lock since we're done with it.
1401 */
1402 xfs_dqfunlock(dqp);
1403}
1404
1405
1406int
1407xfs_qm_dqflock_nowait(
1408 xfs_dquot_t *dqp)
1409{
1410 int locked;
1411
1412 locked = cpsema(&((dqp)->q_flock));
1413
1414 /* XXX ifdef these out */
1415 if (locked)
1416 (dqp)->dq_flags |= XFS_DQ_FLOCKED;
1417 return (locked);
1418}
1419
1420
1421int
1422xfs_qm_dqlock_nowait(
1423 xfs_dquot_t *dqp)
1424{
1425 return (mutex_trylock(&((dqp)->q_qlock)));
1426}
1427
1428void
1429xfs_dqlock(
1430 xfs_dquot_t *dqp)
1431{
1432 mutex_lock(&(dqp->q_qlock), PINOD);
1433}
1434
1435void
1436xfs_dqunlock(
1437 xfs_dquot_t *dqp)
1438{
1439 mutex_unlock(&(dqp->q_qlock));
1440 if (dqp->q_logitem.qli_dquot == dqp) {
1441 /* Once was dqp->q_mount, but might just have been cleared */
1442 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
1443 (xfs_log_item_t*)&(dqp->q_logitem));
1444 }
1445}
1446
1447
1448void
1449xfs_dqunlock_nonotify(
1450 xfs_dquot_t *dqp)
1451{
1452 mutex_unlock(&(dqp->q_qlock));
1453}
1454
1455void
1456xfs_dqlock2(
1457 xfs_dquot_t *d1,
1458 xfs_dquot_t *d2)
1459{
1460 if (d1 && d2) {
1461 ASSERT(d1 != d2);
1462 if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) {
1463 xfs_dqlock(d2);
1464 xfs_dqlock(d1);
1465 } else {
1466 xfs_dqlock(d1);
1467 xfs_dqlock(d2);
1468 }
1469 } else {
1470 if (d1) {
1471 xfs_dqlock(d1);
1472 } else if (d2) {
1473 xfs_dqlock(d2);
1474 }
1475 }
1476}
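
Since xfs_dqlock2() always acquires the pair in ascending-id order, two racing callers cannot deadlock on the same two dquots. A hedged usage sketch follows: the transfer helper and its bookkeeping are hypothetical, while the fields and locking calls are from this commit.

/*
 * Hypothetical usage sketch: move part of a block reservation between
 * two dquots of the same type without risking an ABBA deadlock.
 */
STATIC void
example_transfer_res_blocks(
	xfs_dquot_t	*from,
	xfs_dquot_t	*to,
	xfs_qcnt_t	nblks)
{
	xfs_dqlock2(from, to);		/* both locked, ascending-id order */
	from->q_res_bcount -= nblks;
	to->q_res_bcount += nblks;
	xfs_dqunlock(to);
	xfs_dqunlock(from);
}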
1477
1478
1479/*
1480 * Take a dquot out of the mount's dqlist as well as the hashlist.
1481 * This is called via unmount as well as quotaoff, and the purge
1482 * will always succeed unless there are soft (temp) references
1483 * outstanding.
1484 *
1485 * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
1486 * that we're returning! XXXsup - not cool.
1487 */
1488/* ARGSUSED */
1489int
1490xfs_qm_dqpurge(
1491 xfs_dquot_t *dqp,
1492 uint flags)
1493{
1494 xfs_dqhash_t *thishash;
1495 xfs_mount_t *mp;
1496
1497 mp = dqp->q_mount;
1498
1499 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
1500 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
1501
1502 xfs_dqlock(dqp);
1503 /*
1504 * We really can't afford to purge a dquot that is
1505 * referenced, because these are hard refs.
1506	 * It shouldn't happen in general because we went through _all_ inodes in
1507 * dqrele_all_inodes before calling this and didn't let the mountlock go.
1508 * However it is possible that we have dquots with temporary
1509 * references that are not attached to an inode. e.g. see xfs_setattr().
1510 */
1511 if (dqp->q_nrefs != 0) {
1512 xfs_dqunlock(dqp);
1513 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
1514 return (1);
1515 }
1516
1517 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
1518
1519 /*
1520 * If we're turning off quotas, we have to make sure that, for
1521 * example, we don't delete quota disk blocks while dquots are
1522 * in the process of getting written to those disk blocks.
1523 * This dquot might well be on AIL, and we can't leave it there
1524 * if we're turning off quotas. Basically, we need this flush
1525 * lock, and are willing to block on it.
1526 */
1527 if (! xfs_qm_dqflock_nowait(dqp)) {
1528 /*
1529 * Block on the flush lock after nudging dquot buffer,
1530 * if it is incore.
1531 */
1532 xfs_qm_dqflock_pushbuf_wait(dqp);
1533 }
1534
1535 /*
1536	 * XXX If we're turning this type of quota off, we don't care
1537 * about the dirty metadata sitting in this dquot. OTOH, if
1538 * we're unmounting, we do care, so we flush it and wait.
1539 */
1540 if (XFS_DQ_IS_DIRTY(dqp)) {
1541 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
1542 /* dqflush unlocks dqflock */
1543 /*
1544 * Given that dqpurge is a very rare occurrence, it is OK
1545 * that we're holding the hashlist and mplist locks
1546 * across the disk write. But, ... XXXsup
1547 *
1548 * We don't care about getting disk errors here. We need
1549 * to purge this dquot anyway, so we go ahead regardless.
1550 */
1551 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
1552 xfs_dqflock(dqp);
1553 }
1554 ASSERT(dqp->q_pincount == 0);
1555 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1556 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1557
1558 thishash = dqp->q_hash;
1559 XQM_HASHLIST_REMOVE(thishash, dqp);
1560 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
1561 /*
1562 * XXX Move this to the front of the freelist, if we can get the
1563 * freelist lock.
1564 */
1565 ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
1566
1567 dqp->q_mount = NULL;
1568 dqp->q_hash = NULL;
1569 dqp->dq_flags = XFS_DQ_INACTIVE;
1570 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
1571 xfs_dqfunlock(dqp);
1572 xfs_dqunlock(dqp);
1573 XFS_DQ_HASH_UNLOCK(thishash);
1574 return (0);
1575}
1576
1577
1578#ifdef QUOTADEBUG
1579void
1580xfs_qm_dqprint(xfs_dquot_t *dqp)
1581{
1582 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------");
1583 cmn_err(CE_DEBUG, "---- dquotID = %d",
1584 (int)INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
1585 cmn_err(CE_DEBUG, "---- type = %s",
1586 XFS_QM_ISUDQ(dqp) ? "USR" : "GRP");
1587 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount);
1588 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno);
1589 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1590 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)",
1591 INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
1592 (int) INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT));
1593 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)",
1594 INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
1595 (int)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT));
1596 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)",
1597 INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
1598 (int)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT));
1599 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)",
1600 INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
1601 (int)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT));
1602 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
1603 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
1604 (int)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
1605 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
1606 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
1607 (int)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
1608 cmn_err(CE_DEBUG, "---- btimer = %d",
1609 (int)INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT));
1610 cmn_err(CE_DEBUG, "---- itimer = %d",
1611 (int)INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT));
1612 cmn_err(CE_DEBUG, "---------------------------");
1613}
1614#endif
1615
1616/*
1617 * Give the buffer a little push if it is incore and
1618 * wait on the flush lock.
1619 */
1620void
1621xfs_qm_dqflock_pushbuf_wait(
1622 xfs_dquot_t *dqp)
1623{
1624 xfs_buf_t *bp;
1625
1626 /*
1627	 * Check to see if the dquot has been flushed as a delayed
1628 * write. If so, grab its buffer and send it
1629 * out immediately. We'll be able to acquire
1630 * the flush lock when the I/O completes.
1631 */
1632 bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
1633 XFS_QI_DQCHUNKLEN(dqp->q_mount),
1634 XFS_INCORE_TRYLOCK);
1635 if (bp != NULL) {
1636 if (XFS_BUF_ISDELAYWRITE(bp)) {
1637 if (XFS_BUF_ISPINNED(bp)) {
1638 xfs_log_force(dqp->q_mount,
1639 (xfs_lsn_t)0,
1640 XFS_LOG_FORCE);
1641 }
1642 xfs_bawrite(dqp->q_mount, bp);
1643 } else {
1644 xfs_buf_relse(bp);
1645 }
1646 }
1647 xfs_dqflock(dqp);
1648}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
new file mode 100644
index 000000000000..0c3fe3175baa
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -0,0 +1,224 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DQUOT_H__
33#define __XFS_DQUOT_H__
34
35/*
36 * Dquots are structures that hold quota information about a user or a group,
37 * much like inodes are for files. In fact, dquots share many characteristics
38 * with inodes. However, dquots can also be a centralized resource, relative
39 * to a collection of inodes. In this respect, dquots share some characteristics
40 * of the superblock.
41 * XFS dquots exploit both of those in their algorithms. They make every attempt
42 * to not be a bottleneck when quotas are on and have minimal impact, if any,
43 * when quotas are off.
44 */
45
46/*
47 * The hash chain headers (hash buckets)
48 */
49typedef struct xfs_dqhash {
50 struct xfs_dquot *qh_next;
51 mutex_t qh_lock;
52 uint qh_version; /* ever increasing version */
53 uint qh_nelems; /* number of dquots on the list */
54} xfs_dqhash_t;
55
56typedef struct xfs_dqlink {
57 struct xfs_dquot *ql_next; /* forward link */
58 struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */
59} xfs_dqlink_t;
60
61struct xfs_mount;
62struct xfs_trans;
63
64/*
65 * This is the marker which is designed to occupy the first few
66 * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
67 * must come first.
68 * This serves as the marker ("sentinel") when we have to restart list
69 * iterations because of locking considerations.
70 */
71typedef struct xfs_dqmarker {
72 struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */
73 struct xfs_dquot*dqm_flprev;
74 xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */
75 xfs_dqlink_t dqm_hashlist; /* link to the hash chain */
76 uint dqm_flags; /* various flags (XFS_DQ_*) */
77} xfs_dqmarker_t;
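
As a hedged sketch of the restart technique this marker enables -- the walk helper and splice primitives below are hypothetical stand-ins, and only the types come from this header -- a walker parks the marker after the current element, drops the list lock, and later resumes from the marker instead of rescanning from the head:

/*
 * Illustrative only: freelist_lock()/freelist_unlock() and the splice
 * helpers are hypothetical placeholders for the real freelist code.
 * The cast below relies on the marker occupying the first bytes of
 * the xfs_dquot structure, as described above.
 */
extern void freelist_lock(void);
extern void freelist_unlock(void);
extern void marker_splice_after(struct xfs_dquot *dqp, xfs_dqmarker_t *m);
extern void marker_unsplice(xfs_dqmarker_t *m);

static void
example_walk_with_marker(struct xfs_dquot *head)
{
	xfs_dqmarker_t		mark;
	struct xfs_dquot	*dqp;

	freelist_lock();
	dqp = ((xfs_dqmarker_t *)head)->dqm_flnext;	/* first element */
	while (dqp != head) {
		marker_splice_after(dqp, &mark);  /* park marker after dqp */
		freelist_unlock();
		/* ... per-dquot work that may sleep ... */
		freelist_lock();
		dqp = mark.dqm_flnext;		/* resume past the marker */
		marker_unsplice(&mark);
	}
	freelist_unlock();
}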
78
79/*
80 * The incore dquot structure
81 */
82typedef struct xfs_dquot {
83 xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */
84 xfs_dqhash_t *q_hash; /* the hashchain header */
85 struct xfs_mount*q_mount; /* filesystem this relates to */
86 struct xfs_trans*q_transp; /* trans this belongs to currently */
87 uint q_nrefs; /* # active refs from inodes */
88 xfs_daddr_t q_blkno; /* blkno of dquot buffer */
89 int q_bufoffset; /* off of dq in buffer (# dquots) */
90 xfs_fileoff_t q_fileoffset; /* offset in quotas file */
91
92 struct xfs_dquot*q_gdquot; /* group dquot, hint only */
93 xfs_disk_dquot_t q_core; /* actual usage & quotas */
94 xfs_dq_logitem_t q_logitem; /* dquot log item */
95 xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
96 xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
97 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
98 mutex_t q_qlock; /* quota lock */
99 sema_t q_flock; /* flush lock */
100 uint q_pincount; /* pin count for this dquot */
101 sv_t q_pinwait; /* sync var for pinning */
102#ifdef XFS_DQUOT_TRACE
103 struct ktrace *q_trace; /* trace header structure */
104#endif
105} xfs_dquot_t;
106
107
108#define dq_flnext q_lists.dqm_flnext
109#define dq_flprev q_lists.dqm_flprev
110#define dq_mplist q_lists.dqm_mplist
111#define dq_hashlist q_lists.dqm_hashlist
112#define dq_flags q_lists.dqm_flags
113
114#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++)
115
116/*
117 * Quota Accounting flags
118 */
119#define XFS_ALL_QUOTA_ACCT (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT)
120#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD)
121#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD)
122#define XFS_ALL_QUOTA_ACTV (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
123#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
124 XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD)
125
126#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
127#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
128#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
129
130/*
131 * Quota Limit Enforcement flags
132 */
133#define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
134#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
135#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
136
137#ifdef DEBUG
138static inline int
139XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
140{
141 if (mutex_trylock(&dqp->q_qlock)) {
142 mutex_unlock(&dqp->q_qlock);
143 return 0;
144 }
145 return 1;
146}
147#endif
148
149
150/*
151 * The following three routines simply manage the q_flock
152 * semaphore embedded in the dquot. This semaphore synchronizes
153 * processes attempting to flush the in-core dquot back to disk.
154 */
155#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\
156 (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
157#define xfs_dqfunlock(dqp) { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
158 vsema(&((dqp)->q_flock)); \
159 (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
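
A hedged sketch of the flush protocol these macros support, mirroring the xfs_qm_dqflush() callers in this commit (fragment only; dqp and error are assumed to be declared by the surrounding function):

/* Dquot lock first (see the LOCK ORDER comment), then the flush lock. */
xfs_dqlock(dqp);
if (XFS_DQ_IS_DIRTY(dqp)) {
	xfs_dqflock(dqp);	/* may sleep until a prior flush finishes */
	error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
	/* the flush lock is released once the write completes */
}
xfs_dqunlock(dqp);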
160
161#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \
162 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
163#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \
164 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
165
166#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
167#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
168#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
169#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
170#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo)
171#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \
172 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
173 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
174
175#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
176 (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
177 (XFS_IS_GQUOTA_ON((d)->q_mount))))
178
179#ifdef XFS_DQUOT_TRACE
180/*
181 * Dquot Tracing stuff.
182 */
183#define DQUOT_TRACE_SIZE 64
184#define DQUOT_KTRACE_ENTRY 1
185
186extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func,
187 void *, xfs_inode_t *);
188#define xfs_dqtrace_entry_ino(a,b,ip) \
189 __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip))
190#define xfs_dqtrace_entry(a,b) \
191 __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL)
192#else
193#define xfs_dqtrace_entry(a,b)
194#define xfs_dqtrace_entry_ino(a,b,ip)
195#endif
196
197#ifdef QUOTADEBUG
198extern void xfs_qm_dqprint(xfs_dquot_t *);
199#else
200#define xfs_qm_dqprint(a)
201#endif
202
203extern void xfs_qm_dqdestroy(xfs_dquot_t *);
204extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
205extern int xfs_qm_dqpurge(xfs_dquot_t *, uint);
206extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
207extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
208extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
209extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
210extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
211 xfs_disk_dquot_t *);
212extern void xfs_qm_adjust_dqlimits(xfs_mount_t *,
213 xfs_disk_dquot_t *);
214extern int xfs_qm_dqwarn(xfs_disk_dquot_t *, uint);
215extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
216 xfs_dqid_t, uint, uint, xfs_dquot_t **);
217extern void xfs_qm_dqput(xfs_dquot_t *);
218extern void xfs_qm_dqrele(xfs_dquot_t *);
219extern void xfs_dqlock(xfs_dquot_t *);
220extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
221extern void xfs_dqunlock(xfs_dquot_t *);
222extern void xfs_dqunlock_nonotify(xfs_dquot_t *);
223
224#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
new file mode 100644
index 000000000000..a5425ee6e7bd
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -0,0 +1,715 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_priv.h"
68
69#include "xfs_qm.h"
70
71
72/*
73 * returns the number of iovecs needed to log the given dquot item.
74 */
75/* ARGSUSED */
76STATIC uint
77xfs_qm_dquot_logitem_size(
78 xfs_dq_logitem_t *logitem)
79{
80 /*
81 * we need only two iovecs, one for the format, one for the real thing
82 */
83 return (2);
84}
85
86/*
87 * fills in the vector of log iovecs for the given dquot log item.
88 */
89STATIC void
90xfs_qm_dquot_logitem_format(
91 xfs_dq_logitem_t *logitem,
92 xfs_log_iovec_t *logvec)
93{
94 ASSERT(logitem);
95 ASSERT(logitem->qli_dquot);
96
97 logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
98 logvec->i_len = sizeof(xfs_dq_logformat_t);
99 logvec++;
100 logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
101 logvec->i_len = sizeof(xfs_disk_dquot_t);
102
103 ASSERT(2 == logitem->qli_item.li_desc->lid_size);
104 logitem->qli_format.qlf_size = 2;
105
106}
107
108/*
109 * Increment the pin count of the given dquot.
110 * This value is protected by the pinlock spinlock in the quotainfo structure.
111 */
112STATIC void
113xfs_qm_dquot_logitem_pin(
114 xfs_dq_logitem_t *logitem)
115{
116 unsigned long s;
117 xfs_dquot_t *dqp;
118
119 dqp = logitem->qli_dquot;
120 ASSERT(XFS_DQ_IS_LOCKED(dqp));
121 s = XFS_DQ_PINLOCK(dqp);
122 dqp->q_pincount++;
123 XFS_DQ_PINUNLOCK(dqp, s);
124}
125
126/*
127 * Decrement the pin count of the given dquot, and wake up
128 * anyone in xfs_qm_dqunpin_wait() if the count goes to 0. The
129 * dquot must have been previously pinned via xfs_qm_dquot_logitem_pin().
130 */
131/* ARGSUSED */
132STATIC void
133xfs_qm_dquot_logitem_unpin(
134 xfs_dq_logitem_t *logitem,
135 int stale)
136{
137 unsigned long s;
138 xfs_dquot_t *dqp;
139
140 dqp = logitem->qli_dquot;
141 ASSERT(dqp->q_pincount > 0);
142 s = XFS_DQ_PINLOCK(dqp);
143 dqp->q_pincount--;
144 if (dqp->q_pincount == 0) {
145 sv_broadcast(&dqp->q_pinwait);
146 }
147 XFS_DQ_PINUNLOCK(dqp, s);
148}
149
150/* ARGSUSED */
151STATIC void
152xfs_qm_dquot_logitem_unpin_remove(
153 xfs_dq_logitem_t *logitem,
154 xfs_trans_t *tp)
155{
156 xfs_qm_dquot_logitem_unpin(logitem, 0);
157}
158
159/*
160 * Given the logitem, this writes the corresponding dquot entry to disk
161 * asynchronously. This is called with the dquot entry securely locked;
162 * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
163 * at the end.
164 */
165STATIC void
166xfs_qm_dquot_logitem_push(
167 xfs_dq_logitem_t *logitem)
168{
169 xfs_dquot_t *dqp;
170
171 dqp = logitem->qli_dquot;
172
173 ASSERT(XFS_DQ_IS_LOCKED(dqp));
174 ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
175
176 /*
177 * Since we were able to lock the dquot's flush lock and
178 * we found it on the AIL, the dquot must be dirty. This
179 * is because the dquot is removed from the AIL while still
180	 * holding the flush lock in xfs_qm_dqflush_done(). Thus, if
181 * we found it in the AIL and were able to obtain the flush
182 * lock without sleeping, then there must not have been
183 * anyone in the process of flushing the dquot.
184 */
185 xfs_qm_dqflush(dqp, XFS_B_DELWRI);
186 xfs_dqunlock(dqp);
187}
188
189/*ARGSUSED*/
190STATIC xfs_lsn_t
191xfs_qm_dquot_logitem_committed(
192 xfs_dq_logitem_t *l,
193 xfs_lsn_t lsn)
194{
195 /*
196 * We always re-log the entire dquot when it becomes dirty,
197 * so, the latest copy _is_ the only one that matters.
198 */
199 return (lsn);
200}
201
202
203/*
204 * This is called to wait for the given dquot to be unpinned.
205 * Most of these pin/unpin routines are plagiarized from inode code.
206 */
207void
208xfs_qm_dqunpin_wait(
209 xfs_dquot_t *dqp)
210{
211 SPLDECL(s);
212
213 ASSERT(XFS_DQ_IS_LOCKED(dqp));
214 if (dqp->q_pincount == 0) {
215 return;
216 }
217
218 /*
219 * Give the log a push so we don't wait here too long.
220 */
221 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
222 s = XFS_DQ_PINLOCK(dqp);
223 if (dqp->q_pincount == 0) {
224 XFS_DQ_PINUNLOCK(dqp, s);
225 return;
226 }
227 sv_wait(&(dqp->q_pinwait), PINOD,
228 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
229}
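/*
 * A minimal userspace sketch of the pin/unpin/unpin-wait handshake
 * implemented above, with a pthread mutex and condition variable
 * standing in for the XQM pinlock and the q_pinwait sync variable.
 * The names pin_lock, pin_cond and pin_count are illustrative, not
 * kernel symbols, and the log-force optimization is omitted.
 */
#include <pthread.h>

static pthread_mutex_t pin_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  pin_cond = PTHREAD_COND_INITIALIZER;
static int             pin_count;

static void pin(void)
{
    pthread_mutex_lock(&pin_lock);
    pin_count++;
    pthread_mutex_unlock(&pin_lock);
}

static void unpin(void)
{
    pthread_mutex_lock(&pin_lock);
    if (--pin_count == 0)
        pthread_cond_broadcast(&pin_cond);       /* like sv_broadcast() */
    pthread_mutex_unlock(&pin_lock);
}

static void wait_unpinned(void)
{
    pthread_mutex_lock(&pin_lock);
    while (pin_count > 0)                        /* like sv_wait(): sleep */
        pthread_cond_wait(&pin_cond, &pin_lock); /* with the lock dropped */
    pthread_mutex_unlock(&pin_lock);
}

int main(void)
{
    pin();
    unpin();            /* count hits zero, broadcast fires */
    wait_unpinned();    /* returns immediately: nothing pinned */
    return 0;
}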
230
231/*
232 * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
233 * the dquot is locked by us, but the flush lock isn't. So, here we are
234 * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
235 * If so, we want to push it out to help us take this item off the AIL as soon
236 * as possible.
237 *
238 * We must not be holding the AIL_LOCK at this point. Calling incore() to
239 * search the buffercache can be a time consuming thing, and AIL_LOCK is a
240 * spinlock.
241 */
242STATIC void
243xfs_qm_dquot_logitem_pushbuf(
244 xfs_dq_logitem_t *qip)
245{
246 xfs_dquot_t *dqp;
247 xfs_mount_t *mp;
248 xfs_buf_t *bp;
249 uint dopush;
250
251 dqp = qip->qli_dquot;
252 ASSERT(XFS_DQ_IS_LOCKED(dqp));
253
254 /*
255 * The qli_pushbuf_flag keeps others from
256 * trying to duplicate our effort.
257 */
258 ASSERT(qip->qli_pushbuf_flag != 0);
259 ASSERT(qip->qli_push_owner == get_thread_id());
260
261 /*
262 * If the flush lock isn't held anymore, chances are that the
263 * dquot flush completed and the dquot was taken off the AIL.
264 * So, just get out.
265 */
266 if ((valusema(&(dqp->q_flock)) > 0) ||
267 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
268 qip->qli_pushbuf_flag = 0;
269 xfs_dqunlock(dqp);
270 return;
271 }
272 mp = dqp->q_mount;
273 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
274 XFS_QI_DQCHUNKLEN(mp),
275 XFS_INCORE_TRYLOCK);
276 if (bp != NULL) {
277 if (XFS_BUF_ISDELAYWRITE(bp)) {
278 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
279 (valusema(&(dqp->q_flock)) <= 0));
280 qip->qli_pushbuf_flag = 0;
281 xfs_dqunlock(dqp);
282
283 if (XFS_BUF_ISPINNED(bp)) {
284 xfs_log_force(mp, (xfs_lsn_t)0,
285 XFS_LOG_FORCE);
286 }
287 if (dopush) {
288#ifdef XFSRACEDEBUG
289 delay_for_intr();
290 delay(300);
291#endif
292 xfs_bawrite(mp, bp);
293 } else {
294 xfs_buf_relse(bp);
295 }
296 } else {
297 qip->qli_pushbuf_flag = 0;
298 xfs_dqunlock(dqp);
299 xfs_buf_relse(bp);
300 }
301 return;
302 }
303
304 qip->qli_pushbuf_flag = 0;
305 xfs_dqunlock(dqp);
306}
307
308/*
309 * This is called to attempt to lock the dquot associated with this
310 * dquot log item. Don't sleep on the dquot lock or the flush lock.
311 * If the flush lock is already held, indicating that the dquot has
312 * been or is in the process of being flushed, then see if we can
313 * find the dquot's buffer in the buffer cache without sleeping. If
314 * we can and it is marked delayed write, then we want to send it out.
315 * We delay doing so until the push routine, though, to avoid sleeping
316 * in any device strategy routines.
317 */
318STATIC uint
319xfs_qm_dquot_logitem_trylock(
320 xfs_dq_logitem_t *qip)
321{
322 xfs_dquot_t *dqp;
323 uint retval;
324
325 dqp = qip->qli_dquot;
326 if (dqp->q_pincount > 0)
327 return (XFS_ITEM_PINNED);
328
329 if (! xfs_qm_dqlock_nowait(dqp))
330 return (XFS_ITEM_LOCKED);
331
332 retval = XFS_ITEM_SUCCESS;
333 if (! xfs_qm_dqflock_nowait(dqp)) {
334 /*
335 * The dquot is already being flushed. It may have been
336 * flushed delayed write, however, and we don't want to
337 * get stuck waiting for that to complete. So, we want to check
338 * to see if we can lock the dquot's buffer without sleeping.
339 * If we can and it is marked for delayed write, then we
340 * hold it and send it out from the push routine. We don't
341 * want to do that now since we might sleep in the device
342 * strategy routine. We also don't want to grab the buffer lock
343 * here because we'd like not to call into the buffer cache
344 * while holding the AIL_LOCK.
345 * Make sure to only return PUSHBUF if we set pushbuf_flag
346 * ourselves. If someone else is doing it then we don't
347 * want to go to the push routine and duplicate their efforts.
348 */
349 if (qip->qli_pushbuf_flag == 0) {
350 qip->qli_pushbuf_flag = 1;
351 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
352#ifdef DEBUG
353 qip->qli_push_owner = get_thread_id();
354#endif
355 /*
356 * The dquot is left locked.
357 */
358 retval = XFS_ITEM_PUSHBUF;
359 } else {
360 retval = XFS_ITEM_FLUSHING;
361 xfs_dqunlock_nonotify(dqp);
362 }
363 }
364
365 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
366 return (retval);
367}
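/*
 * The trylock decision tree above collapses to a handful of outcomes.
 * A compact userspace rendering with pthread trylocks follows; the enum
 * values mirror the XFS_ITEM_* return codes, while the item struct and
 * field names are invented for illustration.
 */
#include <pthread.h>

enum { ITEM_SUCCESS, ITEM_PINNED, ITEM_LOCKED, ITEM_FLUSHING, ITEM_PUSHBUF };

struct item {
    pthread_mutex_t lock;          /* the dquot lock */
    pthread_mutex_t flush_lock;    /* the flush lock */
    int             pin_count;
    int             pushbuf_flag;
};

static int item_trylock(struct item *it)
{
    if (it->pin_count > 0)
        return ITEM_PINNED;                     /* pinned: hands off */
    if (pthread_mutex_trylock(&it->lock) != 0)
        return ITEM_LOCKED;                     /* somebody holds it */
    if (pthread_mutex_trylock(&it->flush_lock) != 0) {
        if (it->pushbuf_flag) {                 /* a push is under way */
            pthread_mutex_unlock(&it->lock);
            return ITEM_FLUSHING;
        }
        it->pushbuf_flag = 1;                   /* we'll push the buffer; */
        return ITEM_PUSHBUF;                    /* item stays locked */
    }
    return ITEM_SUCCESS;                        /* locked + flush-locked */
}

int main(void)
{
    struct item it = { PTHREAD_MUTEX_INITIALIZER,
                       PTHREAD_MUTEX_INITIALIZER, 0, 0 };
    return item_trylock(&it) != ITEM_SUCCESS;
}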
368
369
370/*
371 * Unlock the dquot associated with the log item.
372 * Clear the fields of the dquot and dquot log item that
373 * are specific to the current transaction. If the
374 * hold flag is set, do not unlock the dquot.
375 */
376STATIC void
377xfs_qm_dquot_logitem_unlock(
378 xfs_dq_logitem_t *ql)
379{
380 xfs_dquot_t *dqp;
381
382 ASSERT(ql != NULL);
383 dqp = ql->qli_dquot;
384 ASSERT(XFS_DQ_IS_LOCKED(dqp));
385
386 /*
387 * Clear the transaction pointer in the dquot
388 */
389 dqp->q_transp = NULL;
390
391 /*
392 * dquots are never 'held' from getting unlocked at the end of
393 * a transaction. Their locking and unlocking is hidden inside the
394 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
395 * for the logitem.
396 */
397 xfs_dqunlock(dqp);
398}
399
400
401/*
402 * The transaction with the dquot locked has aborted. The dquot
403 * must not be dirty within the transaction. We simply unlock just
404 * as if the transaction had been cancelled.
405 */
406STATIC void
407xfs_qm_dquot_logitem_abort(
408 xfs_dq_logitem_t *ql)
409{
410 xfs_qm_dquot_logitem_unlock(ql);
411}
412
413/*
414 * This needs to stamp an LSN into the dquot, I think.
415 * RPCs that look at user dquots would then have to
416 * push on the dependency recorded in the dquot.
417 */
418/* ARGSUSED */
419STATIC void
420xfs_qm_dquot_logitem_committing(
421 xfs_dq_logitem_t *l,
422 xfs_lsn_t lsn)
423{
424 return;
425}
426
427
428/*
429 * This is the ops vector for dquots
430 */
431struct xfs_item_ops xfs_dquot_item_ops = {
432 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
433 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
434 xfs_qm_dquot_logitem_format,
435 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
436 .iop_unpin = (void(*)(xfs_log_item_t*, int))
437 xfs_qm_dquot_logitem_unpin,
438 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
439 xfs_qm_dquot_logitem_unpin_remove,
440 .iop_trylock = (uint(*)(xfs_log_item_t*))
441 xfs_qm_dquot_logitem_trylock,
442 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
443 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
444 xfs_qm_dquot_logitem_committed,
445 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
446 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
447 .iop_pushbuf = (void(*)(xfs_log_item_t*))
448 xfs_qm_dquot_logitem_pushbuf,
449 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
450 xfs_qm_dquot_logitem_committing
451};
452
453/*
454 * Initialize the dquot log item for a newly allocated dquot.
455 * The dquot isn't locked at this point, but it isn't on any of the lists
456 * either, so we don't care.
457 */
458void
459xfs_qm_dquot_logitem_init(
460 struct xfs_dquot *dqp)
461{
462 xfs_dq_logitem_t *lp;
463 lp = &dqp->q_logitem;
464
465 lp->qli_item.li_type = XFS_LI_DQUOT;
466 lp->qli_item.li_ops = &xfs_dquot_item_ops;
467 lp->qli_item.li_mountp = dqp->q_mount;
468 lp->qli_dquot = dqp;
469 lp->qli_format.qlf_type = XFS_LI_DQUOT;
470 lp->qli_format.qlf_id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
471 lp->qli_format.qlf_blkno = dqp->q_blkno;
472 lp->qli_format.qlf_len = 1;
473 /*
474 * This is just the offset of this dquot within its buffer
475 * (which is currently 1 FSB and probably won't change).
476 * Hence 32 bits for this offset should be just fine.
477 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
478 * here, and recompute it at recovery time.
479 */
480 lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
481}
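/*
 * A small sketch of the qlf_boffset comment above: the logged value is
 * a byte offset within a one-FSB buffer, so 32 bits suffice, and the
 * alternative index encoding is recoverable with one multiply.  The
 * dquot-block size used here is illustrative, not the on-disk value.
 */
#include <assert.h>
#include <stdint.h>

#define DQBLK_SIZE 136u    /* illustrative stand-in for sizeof(xfs_dqblk_t) */

int main(void)
{
    uint32_t boffset = 3 * DQBLK_SIZE;       /* what the code logs today */
    uint32_t index   = boffset / DQBLK_SIZE; /* the alternative encoding */

    /* Recovery could rebuild the byte offset from the index: */
    assert(index * DQBLK_SIZE == boffset);
    return 0;
}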
482
483/*------------------ QUOTAOFF LOG ITEMS -------------------*/
484
485/*
486 * This returns the number of iovecs needed to log the given quotaoff item.
487 * We only need 1 iovec for a quotaoff item.  It just logs the
488 * quotaoff_log_format structure.
489 */
490/*ARGSUSED*/
491STATIC uint
492xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
493{
494 return (1);
495}
496
497/*
498 * This is called to fill in the vector of log iovecs for the
499 * given quotaoff log item. We use only 1 iovec, and we point that
500 * at the quotaoff_log_format structure embedded in the quotaoff item.
501 * It is at this point that we assert that the item really
502 * carries the XFS_LI_QUOTAOFF type.
503 */
504STATIC void
505xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf,
506 xfs_log_iovec_t *log_vector)
507{
508 ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
509
510 log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
511 log_vector->i_len = sizeof(xfs_qoff_logitem_t);
512 qf->qql_format.qf_size = 1;
513}
514
515
516/*
517 * Pinning has no meaning for a quotaoff item, so just return.
518 */
519/*ARGSUSED*/
520STATIC void
521xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
522{
523 return;
524}
525
526
527/*
528 * Since pinning has no meaning for a quotaoff item, neither
529 * does unpinning.
530 */
531/*ARGSUSED*/
532STATIC void
533xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale)
534{
535 return;
536}
537
538/*ARGSUSED*/
539STATIC void
540xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
541{
542 return;
543}
544
545/*
546 * Quotaoff items have no locking; return XFS_ITEM_LOCKED so the AIL leaves them alone.
547 */
548/*ARGSUSED*/
549STATIC uint
550xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
551{
552 return XFS_ITEM_LOCKED;
553}
554
555/*
556 * Quotaoff items have no locking, so there is nothing
557 * for the transaction code to unlock here.
558 */
559/*ARGSUSED*/
560STATIC void
561xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
562{
563 return;
564}
565
566/*
567 * The quotaoff-start-item is logged only once and cannot be moved in the log,
568 * so simply return the lsn at which it's been logged.
569 */
570/*ARGSUSED*/
571STATIC xfs_lsn_t
572xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
573{
574 return (lsn);
575}
576
577/*
578 * The transaction of which this QUOTAOFF is a part has been aborted.
579 * Just clean up after ourselves.
580 * This should never happen in the case of qoffend logitems, should it? XXX
581 */
582STATIC void
583xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
584{
585 kmem_free(qf, sizeof(xfs_qoff_logitem_t));
586}
587
588/*
589 * There isn't much you can do to push on a quotaoff item.  It is simply
590 * stuck waiting for the log to be flushed to disk.
591 */
592/*ARGSUSED*/
593STATIC void
594xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
595{
596 return;
597}
598
599
600/*ARGSUSED*/
601STATIC xfs_lsn_t
602xfs_qm_qoffend_logitem_committed(
603 xfs_qoff_logitem_t *qfe,
604 xfs_lsn_t lsn)
605{
606 xfs_qoff_logitem_t *qfs;
607 SPLDECL(s);
608
609 qfs = qfe->qql_start_lip;
610 AIL_LOCK(qfs->qql_item.li_mountp,s);
611 /*
612 * Delete the qoff-start logitem from the AIL.
613 * xfs_trans_delete_ail() drops the AIL lock.
614 */
615 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
616 kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
617 kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
618 return (xfs_lsn_t)-1;
619}
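/*
 * A minimal userspace analogue of the start/end pairing above: the end
 * item carries a pointer back to the start item, and when the end item
 * commits, both records are torn down together.  Types and names are
 * invented for illustration.
 */
#include <stdlib.h>

struct qoff_item {
    struct qoff_item *start;    /* set only in the end item */
    unsigned int      flags;
};

static struct qoff_item *qoff_item_alloc(struct qoff_item *start,
                                         unsigned int flags)
{
    struct qoff_item *qf = calloc(1, sizeof(*qf));

    if (!qf)
        abort();
    qf->start = start;          /* NULL for the start item itself */
    qf->flags = flags;
    return qf;
}

static void qoff_end_committed(struct qoff_item *end)
{
    free(end->start);           /* the start item can go now... */
    free(end);                  /* ...and so can the end item */
}

int main(void)
{
    struct qoff_item *start = qoff_item_alloc(NULL, 0x1);
    struct qoff_item *end   = qoff_item_alloc(start, 0x1);

    qoff_end_committed(end);    /* frees both halves of the pair */
    return 0;
}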
620
621/*
622 * XXX rcc - don't know quite what to do with this. I think we can
623 * just ignore it. The only time that isn't the case is if we allow
624 * the client to somehow see that quotas have been turned off, in which
625 * case we can't allow that to get back until the quotaoff hits the disk.
626 * So how would that happen? Also, do we need different routines for
627 * quotaoff start and quotaoff end? I suspect the answer is yes but
628 * to be sure, I need to look at the recovery code and see how quota off
629 * recovery is handled (do we roll forward or back or do something else).
630 * If we roll forwards or backwards, then we need two separate routines,
631 * one that does nothing and one that stamps in the lsn that matters
632 * (truly makes the quotaoff irrevocable). If we do something else,
633 * then maybe we don't need two.
634 */
635/* ARGSUSED */
636STATIC void
637xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
638{
639 return;
640}
641
642/* ARGSUSED */
643STATIC void
644xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
645{
646 return;
647}
648
649struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
650 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
651 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
652 xfs_qm_qoff_logitem_format,
653 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
654 .iop_unpin = (void(*)(xfs_log_item_t* ,int))
655 xfs_qm_qoff_logitem_unpin,
656 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
657 xfs_qm_qoff_logitem_unpin_remove,
658 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
659 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
660 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
661 xfs_qm_qoffend_logitem_committed,
662 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
663 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
664 .iop_pushbuf = NULL,
665 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
666 xfs_qm_qoffend_logitem_committing
667};
668
669/*
670 * This is the ops vector shared by all quotaoff-start log items.
671 */
672struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
673 .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
674 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
675 xfs_qm_qoff_logitem_format,
676 .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
677 .iop_unpin = (void(*)(xfs_log_item_t*, int))
678 xfs_qm_qoff_logitem_unpin,
679 .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
680 xfs_qm_qoff_logitem_unpin_remove,
681 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
682 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
684 xfs_qm_qoff_logitem_committed,
685 .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
686 .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
687 .iop_pushbuf = NULL,
688 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
689 xfs_qm_qoff_logitem_committing
690};
691
692/*
693 * Allocate and initialize a quotaoff item of the correct quota type(s).
694 */
695xfs_qoff_logitem_t *
696xfs_qm_qoff_logitem_init(
697 struct xfs_mount *mp,
698 xfs_qoff_logitem_t *start,
699 uint flags)
700{
701 xfs_qoff_logitem_t *qf;
702
703 qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
704
705 qf->qql_item.li_type = XFS_LI_QUOTAOFF;
706 if (start)
707 qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
708 else
709 qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
710 qf->qql_item.li_mountp = mp;
711 qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
712 qf->qql_format.qf_flags = flags;
713 qf->qql_start_lip = start;
714 return (qf);
715}
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
new file mode 100644
index 000000000000..9c6500dabcaa
--- /dev/null
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DQUOT_ITEM_H__
33#define __XFS_DQUOT_ITEM_H__
34
35struct xfs_dquot;
36struct xfs_trans;
37struct xfs_mount;
38struct xfs_qoff_logitem;
39
40typedef struct xfs_dq_logitem {
41 xfs_log_item_t qli_item; /* common portion */
42 struct xfs_dquot *qli_dquot; /* dquot ptr */
43 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
44 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
45#ifdef DEBUG
46 uint64_t qli_push_owner;
47#endif
48 xfs_dq_logformat_t qli_format; /* logged structure */
49} xfs_dq_logitem_t;
50
51typedef struct xfs_qoff_logitem {
52 xfs_log_item_t qql_item; /* common portion */
53 struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
54 xfs_qoff_logformat_t qql_format; /* logged structure */
55} xfs_qoff_logitem_t;
56
57
58extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
59extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
60 struct xfs_qoff_logitem *, uint);
61extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
62 struct xfs_qoff_logitem *, uint);
63extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
64 struct xfs_qoff_logitem *);
65
66#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
new file mode 100644
index 000000000000..89f2cd656ebf
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.c
@@ -0,0 +1,2848 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_alloc.h"
44#include "xfs_dmapi.h"
45#include "xfs_quota.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_bit.h"
59#include "xfs_rtalloc.h"
60#include "xfs_error.h"
61#include "xfs_itable.h"
62#include "xfs_rw.h"
63#include "xfs_acl.h"
64#include "xfs_cap.h"
65#include "xfs_mac.h"
66#include "xfs_attr.h"
67#include "xfs_buf_item.h"
68#include "xfs_trans_space.h"
69#include "xfs_utils.h"
70
71#include "xfs_qm.h"
72
73/*
74 * The global quota manager. There is only one of these for the entire
75 * system, _not_ one per file system. XQM keeps track of the overall
76 * quota functionality, including maintaining the freelist and hash
77 * tables of dquots.
78 */
79mutex_t xfs_Gqm_lock;
80struct xfs_qm *xfs_Gqm;
81
82kmem_zone_t *qm_dqzone;
83kmem_zone_t *qm_dqtrxzone;
84kmem_shaker_t xfs_qm_shaker;
85
86STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int);
87STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
88
89STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
90STATIC int xfs_qm_shake(int, unsigned int);
91
92#ifdef DEBUG
93extern mutex_t qcheck_lock;
94#endif
95
96#ifdef QUOTADEBUG
97#define XQM_LIST_PRINT(l, NXT, title) \
98{ \
99 xfs_dquot_t *dqp; int i = 0; \
100 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
101 for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
102 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \
103 "bcnt = %d, icnt = %d, refs = %d", \
104 ++i, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \
105 DQFLAGTO_TYPESTR(dqp), \
106 (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \
107 (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \
108 (int) dqp->q_nrefs); } \
109}
110#else
111#define XQM_LIST_PRINT(l, NXT, title) do { } while (0)
112#endif
113
114/*
115 * Initialize the XQM structure.
116 * Note that there is not one quota manager per file system.
117 */
118STATIC struct xfs_qm *
119xfs_Gqm_init(void)
120{
121 xfs_qm_t *xqm;
122 int hsize, i;
123
124 xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
125 ASSERT(xqm);
126
127 /*
128 * Initialize the dquot hash tables.
129 */
130 hsize = (DQUOT_HASH_HEURISTIC < XFS_QM_NCSIZE_THRESHOLD) ?
131 XFS_QM_HASHSIZE_LOW : XFS_QM_HASHSIZE_HIGH;
132 xqm->qm_dqhashmask = hsize - 1;
133
134 xqm->qm_usr_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
135 sizeof(xfs_dqhash_t),
136 KM_SLEEP);
137 xqm->qm_grp_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
138 sizeof(xfs_dqhash_t),
139 KM_SLEEP);
140 ASSERT(xqm->qm_usr_dqhtable != NULL);
141 ASSERT(xqm->qm_grp_dqhtable != NULL);
142
143 for (i = 0; i < hsize; i++) {
144 xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
145 xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
146 }
147
148 /*
149 * Freelist of all dquots of all file systems
150 */
151 xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
152
153 /*
154 * dquot zone. we register our own low-memory callback.
155 */
156 if (!qm_dqzone) {
157 xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
158 "xfs_dquots");
159 qm_dqzone = xqm->qm_dqzone;
160 } else
161 xqm->qm_dqzone = qm_dqzone;
162
163 xfs_qm_shaker = kmem_shake_register(xfs_qm_shake);
164
165 /*
166 * The t_dqinfo portion of transactions.
167 */
168 if (!qm_dqtrxzone) {
169 xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
170 "xfs_dqtrx");
171 qm_dqtrxzone = xqm->qm_dqtrxzone;
172 } else
173 xqm->qm_dqtrxzone = qm_dqtrxzone;
174
175 atomic_set(&xqm->qm_totaldquots, 0);
176 xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
177 xqm->qm_nrefs = 0;
178#ifdef DEBUG
179 mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk");
180#endif
181 return xqm;
182}
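/*
 * A sketch of the power-of-two hash sizing above: with hsize a power of
 * two, the mask "hsize - 1" (qm_dqhashmask) turns the bucket lookup
 * into a single AND.  The size and hash function here are illustrative,
 * not the kernel's.
 */
#include <stdio.h>

#define HASHSIZE 64u                 /* must be a power of two */
#define HASHMASK (HASHSIZE - 1)      /* like qm_dqhashmask */

static unsigned int dq_hash(unsigned int id)
{
    return id & HASHMASK;            /* bucket index, no division */
}

int main(void)
{
    printf("id 1000 -> bucket %u\n", dq_hash(1000));
    printf("id 1064 -> bucket %u\n", dq_hash(1064)); /* same bucket: 40 */
    return 0;
}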
183
184/*
185 * Destroy the global quota manager when its reference count goes to zero.
186 */
187void
188xfs_qm_destroy(
189 struct xfs_qm *xqm)
190{
191 int hsize, i;
192
193 ASSERT(xqm != NULL);
194 ASSERT(xqm->qm_nrefs == 0);
195 kmem_shake_deregister(xfs_qm_shaker);
196 hsize = xqm->qm_dqhashmask + 1;
197 for (i = 0; i < hsize; i++) {
198 xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
199 xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
200 }
201 kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
202 kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
203 xqm->qm_usr_dqhtable = NULL;
204 xqm->qm_grp_dqhtable = NULL;
205 xqm->qm_dqhashmask = 0;
206 xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
207#ifdef DEBUG
208 mutex_destroy(&qcheck_lock);
209#endif
210 kmem_free(xqm, sizeof(xfs_qm_t));
211}
212
213/*
214 * Called at mount time to let XQM know that another file system is
215 * starting quotas. This isn't crucial information as the individual mount
216 * structures are pretty independent, but it helps the XQM keep a
217 * global view of what's going on.
218 */
219/* ARGSUSED */
220STATIC int
221xfs_qm_hold_quotafs_ref(
222 struct xfs_mount *mp)
223{
224 /*
225 * Need to lock the xfs_Gqm structure for things like this. For example,
226 * the structure could disappear between the entry to this routine and
227 * a HOLD operation if not locked.
228 */
229 XFS_QM_LOCK(xfs_Gqm);
230
231 if (xfs_Gqm == NULL)
232 xfs_Gqm = xfs_Gqm_init();
233 /*
234 * We can keep a list of all filesystems with quotas mounted for
235 * debugging and statistical purposes, but ...
236 * Just take a reference and get out.
237 */
238 XFS_QM_HOLD(xfs_Gqm);
239 XFS_QM_UNLOCK(xfs_Gqm);
240
241 return 0;
242}
243
244
245/*
246 * Release the reference that a filesystem took at mount time,
247 * so that we know when we need to destroy the entire quota manager.
248 */
249/* ARGSUSED */
250STATIC void
251xfs_qm_rele_quotafs_ref(
252 struct xfs_mount *mp)
253{
254 xfs_dquot_t *dqp, *nextdqp;
255
256 ASSERT(xfs_Gqm);
257 ASSERT(xfs_Gqm->qm_nrefs > 0);
258
259 /*
260 * Go thru the freelist and destroy all inactive dquots.
261 */
262 xfs_qm_freelist_lock(xfs_Gqm);
263
264 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
265 dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
266 xfs_dqlock(dqp);
267 nextdqp = dqp->dq_flnext;
268 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
269 ASSERT(dqp->q_mount == NULL);
270 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
271 ASSERT(dqp->HL_PREVP == NULL);
272 ASSERT(dqp->MPL_PREVP == NULL);
273 XQM_FREELIST_REMOVE(dqp);
274 xfs_dqunlock(dqp);
275 xfs_qm_dqdestroy(dqp);
276 } else {
277 xfs_dqunlock(dqp);
278 }
279 dqp = nextdqp;
280 }
281 xfs_qm_freelist_unlock(xfs_Gqm);
282
283 /*
284 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
285 * be restarted.
286 */
287 XFS_QM_LOCK(xfs_Gqm);
288 XFS_QM_RELE(xfs_Gqm);
289 if (xfs_Gqm->qm_nrefs == 0) {
290 xfs_qm_destroy(xfs_Gqm);
291 xfs_Gqm = NULL;
292 }
293 XFS_QM_UNLOCK(xfs_Gqm);
294}
295
296/*
297 * This is called at mount time from xfs_mountfs to initialize the quotainfo
298 * structure and start the global quota manager (xfs_Gqm) if that hasn't
299 * been done already.  Note that the superblock has not been read in yet.
300 */
301void
302xfs_qm_mount_quotainit(
303 xfs_mount_t *mp,
304 uint flags)
305{
306 /*
307 * User or group quotas have to be on.
308 */
309 ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA));
310
311 /*
312 * Initialize the flags in the mount structure. From this point
313 * onwards we look at m_qflags to figure out if quotas are on or off, etc.
314 * Note that we enforce nothing if accounting is off.
315 * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF.
316 * It isn't necessary to take the quotaoff lock to do this; this is
317 * called from mount.
318 */
319 if (flags & XFSMNT_UQUOTA) {
320 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
321 if (flags & XFSMNT_UQUOTAENF)
322 mp->m_qflags |= XFS_UQUOTA_ENFD;
323 }
324 if (flags & XFSMNT_GQUOTA) {
325 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
326 if (flags & XFSMNT_GQUOTAENF)
327 mp->m_qflags |= XFS_GQUOTA_ENFD;
328 }
329}
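/*
 * A sketch of the accounting/enforcement dependency described above:
 * the *QUOTAENF mount flag only has an effect when the matching *QUOTA
 * accounting flag is also set.  Bit values are invented; the shape
 * matches the user-quota branch of the code.
 */
#include <stdio.h>

#define MNT_UQUOTA     0x1u
#define MNT_UQUOTAENF  0x2u

#define Q_UACCT        0x1u
#define Q_UACTIVE      0x2u
#define Q_UENFD        0x4u

static unsigned int map_qflags(unsigned int mnt)
{
    unsigned int q = 0;

    if (mnt & MNT_UQUOTA) {
        q |= Q_UACCT | Q_UACTIVE;
        if (mnt & MNT_UQUOTAENF)
            q |= Q_UENFD;       /* enforce only if accounting is on */
    }
    return q;
}

int main(void)
{
    /* Enforcement without accounting is silently ignored: */
    printf("%#x\n", map_qflags(MNT_UQUOTAENF));              /* 0   */
    printf("%#x\n", map_qflags(MNT_UQUOTA | MNT_UQUOTAENF)); /* 0x7 */
    return 0;
}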
330
331/*
332 * Just destroy the quotainfo structure.
333 */
334void
335xfs_qm_unmount_quotadestroy(
336 xfs_mount_t *mp)
337{
338 if (mp->m_quotainfo)
339 xfs_qm_destroy_quotainfo(mp);
340}
341
342
343/*
344 * This is called from xfs_mountfs to start quotas and initialize all
345 * necessary data structures like quotainfo. This is also responsible for
346 * running a quotacheck as necessary. We are guaranteed that the superblock
347 * is consistently read in at this point.
348 */
349int
350xfs_qm_mount_quotas(
351 xfs_mount_t *mp,
352 int mfsi_flags)
353{
354 unsigned long s;
355 int error = 0;
356 uint sbf;
357
358 /*
359 * If a file system had quotas running earlier, but decided to
360 * mount without -o quota/uquota/gquota options, revoke the
361 * quotachecked license, and bail out.
362 */
363 if (! XFS_IS_QUOTA_ON(mp) &&
364 (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) {
365 mp->m_qflags = 0;
366 goto write_changes;
367 }
368
369 /*
370 * Quotas are not supported on realtime volumes, so we disable
371 * them immediately.
372 */
373 if (mp->m_sb.sb_rextents) {
374 cmn_err(CE_NOTE,
375 "Cannot turn on quotas for realtime filesystem %s",
376 mp->m_fsname);
377 mp->m_qflags = 0;
378 goto write_changes;
379 }
380
381#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
382 cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
383#endif
384
385 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
386 /*
387 * Allocate the quotainfo structure inside the mount struct, and
388 * create quotainode(s), and change/rev superblock if necessary.
389 */
390 if ((error = xfs_qm_init_quotainfo(mp))) {
391 /*
392 * We must turn off quotas.
393 */
394 ASSERT(mp->m_quotainfo == NULL);
395 mp->m_qflags = 0;
396 goto write_changes;
397 }
398 /*
399 * If any of the quotas are not consistent, do a quotacheck.
400 */
401 if (XFS_QM_NEED_QUOTACHECK(mp) &&
402 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
403#ifdef DEBUG
404 cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
405#endif
406 if ((error = xfs_qm_quotacheck(mp))) {
407 /* Quotacheck has failed and quotas have
408 * been disabled.
409 */
410 return XFS_ERROR(error);
411 }
412#ifdef DEBUG
413 cmn_err(CE_NOTE, "Done quotacheck.");
414#endif
415 }
416 write_changes:
417 /*
418 * We actually don't have to acquire the SB_LOCK at all.
419 * This can only be called from mount, and that's single threaded. XXX
420 */
421 s = XFS_SB_LOCK(mp);
422 sbf = mp->m_sb.sb_qflags;
423 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
424 XFS_SB_UNLOCK(mp, s);
425
426 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
427 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
428 /*
429 * We could only have been turning quotas off.
430 * We aren't in very good shape actually because
431 * the incore structures are convinced that quotas are
432 * off, but the on disk superblock doesn't know that !
433 */
434 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
435 xfs_fs_cmn_err(CE_ALERT, mp,
436 "XFS mount_quotas: Superblock update failed!");
437 }
438 }
439
440 if (error) {
441 xfs_fs_cmn_err(CE_WARN, mp,
442 "Failed to initialize disk quotas.");
443 }
444 return XFS_ERROR(error);
445}
446
447/*
448 * Called from the vfsops layer.
449 */
450int
451xfs_qm_unmount_quotas(
452 xfs_mount_t *mp)
453{
454 xfs_inode_t *uqp, *gqp;
455 int error = 0;
456
457 /*
458 * Release the dquots that root inode, et al might be holding,
459 * before we flush quotas and blow away the quotainfo structure.
460 */
461 ASSERT(mp->m_rootip);
462 xfs_qm_dqdetach(mp->m_rootip);
463 if (mp->m_rbmip)
464 xfs_qm_dqdetach(mp->m_rbmip);
465 if (mp->m_rsumip)
466 xfs_qm_dqdetach(mp->m_rsumip);
467
468 /*
469 * Flush out the quota inodes.
470 */
471 uqp = gqp = NULL;
472 if (mp->m_quotainfo) {
473 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
474 xfs_ilock(uqp, XFS_ILOCK_EXCL);
475 xfs_iflock(uqp);
476 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
477 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
478 if (unlikely(error == EFSCORRUPTED)) {
479 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
480 XFS_ERRLEVEL_LOW, mp);
481 goto out;
482 }
483 }
484 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
485 xfs_ilock(gqp, XFS_ILOCK_EXCL);
486 xfs_iflock(gqp);
487 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
488 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
489 if (unlikely(error == EFSCORRUPTED)) {
490 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
491 XFS_ERRLEVEL_LOW, mp);
492 goto out;
493 }
494 }
495 }
496 if (uqp) {
497 XFS_PURGE_INODE(uqp);
498 mp->m_quotainfo->qi_uquotaip = NULL;
499 }
500 if (gqp) {
501 XFS_PURGE_INODE(gqp);
502 mp->m_quotainfo->qi_gquotaip = NULL;
503 }
504out:
505 return XFS_ERROR(error);
506}
507
508/*
509 * Flush all dquots of the given file system to disk. The dquots are
510 * _not_ purged from memory here, just their data written to disk.
511 */
512int
513xfs_qm_dqflush_all(
514 xfs_mount_t *mp,
515 int flags)
516{
517 int recl;
518 xfs_dquot_t *dqp;
519 int niters;
520 int error;
521
522 if (mp->m_quotainfo == NULL)
523 return (0);
524 niters = 0;
525again:
526 xfs_qm_mplist_lock(mp);
527 FOREACH_DQUOT_IN_MP(dqp, mp) {
528 xfs_dqlock(dqp);
529 if (! XFS_DQ_IS_DIRTY(dqp)) {
530 xfs_dqunlock(dqp);
531 continue;
532 }
533 xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
534 /* XXX a sentinel would be better */
535 recl = XFS_QI_MPLRECLAIMS(mp);
536 if (! xfs_qm_dqflock_nowait(dqp)) {
537 /*
538 * If we can't grab the flush lock then check
539 * to see if the dquot has been flushed delayed
540 * write. If so, grab its buffer and send it
541 * out immediately. We'll be able to acquire
542 * the flush lock when the I/O completes.
543 */
544 xfs_qm_dqflock_pushbuf_wait(dqp);
545 }
546 /*
547 * Let go of the mplist lock. We don't want to hold it
548 * across a disk write.
549 */
550 xfs_qm_mplist_unlock(mp);
551 error = xfs_qm_dqflush(dqp, flags);
552 xfs_dqunlock(dqp);
553 if (error)
554 return (error);
555
556 xfs_qm_mplist_lock(mp);
557 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
558 xfs_qm_mplist_unlock(mp);
559 /* XXX restart limit */
560 goto again;
561 }
562 }
563
564 xfs_qm_mplist_unlock(mp);
565 /* return ! busy */
566 return (0);
567}
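/*
 * The list walk above uses a recurring idiom: remember a generation
 * counter (the reclaim count), drop the list lock around the slow work,
 * then relock and restart the walk if the counter moved.  A userspace
 * sketch with invented names; "reclaims" plays the role of
 * XFS_QI_MPLRECLAIMS.
 */
#include <pthread.h>

struct node { struct node *next; int dirty; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node    *list_head;
static int             reclaims;   /* bumped whenever the list shrinks */

static void slow_flush(struct node *n)
{
    n->dirty = 0;                  /* stand-in for a disk write */
}

static void flush_all(void)
{
again:
    pthread_mutex_lock(&list_lock);
    for (struct node *n = list_head; n != NULL; n = n->next) {
        int recl;

        if (!n->dirty)
            continue;
        recl = reclaims;                  /* remember the generation */
        pthread_mutex_unlock(&list_lock); /* never flush under the lock */
        slow_flush(n);
        pthread_mutex_lock(&list_lock);
        if (recl != reclaims) {           /* list changed under us: */
            pthread_mutex_unlock(&list_lock);
            goto again;                   /* our iterator may be stale */
        }
    }
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    flush_all();    /* empty list: walks and returns */
    return 0;
}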
568/*
569 * Release the group dquot pointers the user dquots may be
570 * carrying around as a hint. mplist is locked on entry and exit.
571 */
572STATIC void
573xfs_qm_detach_gdquots(
574 xfs_mount_t *mp)
575{
576 xfs_dquot_t *dqp, *gdqp;
577 int nrecl;
578
579 again:
580 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
581 dqp = XFS_QI_MPLNEXT(mp);
582 while (dqp) {
583 xfs_dqlock(dqp);
584 if ((gdqp = dqp->q_gdquot)) {
585 xfs_dqlock(gdqp);
586 dqp->q_gdquot = NULL;
587 }
588 xfs_dqunlock(dqp);
589
590 if (gdqp) {
591 /*
592 * Can't hold the mplist lock across a dqput.
593 * XXX: must convert to marker-based iteration here.
594 */
595 nrecl = XFS_QI_MPLRECLAIMS(mp);
596 xfs_qm_mplist_unlock(mp);
597 xfs_qm_dqput(gdqp);
598
599 xfs_qm_mplist_lock(mp);
600 if (nrecl != XFS_QI_MPLRECLAIMS(mp))
601 goto again;
602 }
603 dqp = dqp->MPL_NEXT;
604 }
605}
606
607/*
608 * Go through all the incore dquots of this file system and take them
609 * off the mplist and hashlist, if the dquot type matches the dqtype
610 * parameter. This is used when turning off quota accounting for
611 * users and/or groups, as well as when the filesystem is unmounting.
612 */
613STATIC int
614xfs_qm_dqpurge_int(
615 xfs_mount_t *mp,
616 uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */
617{
618 xfs_dquot_t *dqp;
619 uint dqtype;
620 int nrecl;
621 xfs_dquot_t *nextdqp;
622 int nmisses;
623
624 if (mp->m_quotainfo == NULL)
625 return (0);
626
627 dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
628 dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
629
630 xfs_qm_mplist_lock(mp);
631
632 /*
633 * In the first pass through all incore dquots of this filesystem,
634 * we release the group dquot pointers the user dquots may be
635 * carrying around as a hint. We need to do this irrespective of
636 * what's being turned off.
637 */
638 xfs_qm_detach_gdquots(mp);
639
640 again:
641 nmisses = 0;
642 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
643 /*
644 * Try to get rid of all of the unwanted dquots. The idea is to
645 * get them off mplist and hashlist, but leave them on freelist.
646 */
647 dqp = XFS_QI_MPLNEXT(mp);
648 while (dqp) {
649 /*
650 * It's OK to look at the type without taking dqlock here.
651 * We're holding the mplist lock here, and that's needed for
652 * a dqreclaim.
653 */
654 if ((dqp->dq_flags & dqtype) == 0) {
655 dqp = dqp->MPL_NEXT;
656 continue;
657 }
658
659 if (! xfs_qm_dqhashlock_nowait(dqp)) {
660 nrecl = XFS_QI_MPLRECLAIMS(mp);
661 xfs_qm_mplist_unlock(mp);
662 XFS_DQ_HASH_LOCK(dqp->q_hash);
663 xfs_qm_mplist_lock(mp);
664
665 /*
666 * XXX: Theoretically, we can get into a very long
667 * ping-pong game here.
668 * No one can be adding dquots to the mplist at
669 * this point, but somebody might be taking things off.
670 */
671 if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
672 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
673 goto again;
674 }
675 }
676
677 /*
678 * Take the dquot off the mplist and hashlist. It may remain on
679 * freelist in INACTIVE state.
680 */
681 nextdqp = dqp->MPL_NEXT;
682 nmisses += xfs_qm_dqpurge(dqp, flags);
683 dqp = nextdqp;
684 }
685 xfs_qm_mplist_unlock(mp);
686 return nmisses;
687}
688
689int
690xfs_qm_dqpurge_all(
691 xfs_mount_t *mp,
692 uint flags)
693{
694 int ndquots;
695
696 /*
697 * Purge the dquot cache.
698 * None of the dquots should really be busy at this point.
699 */
700 if (mp->m_quotainfo) {
701 while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
702 delay(ndquots * 10);
703 }
704 }
705 return 0;
706}
707
708STATIC int
709xfs_qm_dqattach_one(
710 xfs_inode_t *ip,
711 xfs_dqid_t id,
712 uint type,
713 uint doalloc,
714 uint dolock,
715 xfs_dquot_t *udqhint, /* hint */
716 xfs_dquot_t **IO_idqpp)
717{
718 xfs_dquot_t *dqp;
719 int error;
720
721 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
722 error = 0;
723 /*
724 * See if we already have it in the inode itself. IO_idqpp is
725 * &i_udquot or &i_gdquot. This made the code look weird, but
726 * made the logic a lot simpler.
727 */
728 if ((dqp = *IO_idqpp)) {
729 if (dolock)
730 xfs_dqlock(dqp);
731 xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
732 goto done;
733 }
734
735 /*
736 * udqhint is the i_udquot field in inode, and is non-NULL only
737 * when the type arg is XFS_DQ_GROUP. Its purpose is to save a
738 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
739 * the user dquot.
740 */
741 ASSERT(!udqhint || type == XFS_DQ_GROUP);
742 if (udqhint && !dolock)
743 xfs_dqlock(udqhint);
744
745 /*
746 * No need to take dqlock to look at the id.
747 * The ID can't change until it gets reclaimed, and it won't
748 * be reclaimed as long as we have a ref from inode and we hold
749 * the ilock.
750 */
751 if (udqhint &&
752 (dqp = udqhint->q_gdquot) &&
753 (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id)) {
754 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
755 xfs_dqlock(dqp);
756 XFS_DQHOLD(dqp);
757 ASSERT(*IO_idqpp == NULL);
758 *IO_idqpp = dqp;
759 if (!dolock) {
760 xfs_dqunlock(dqp);
761 xfs_dqunlock(udqhint);
762 }
763 goto done;
764 }
765 /*
766 * We can't hold a dquot lock when we call the dqget code.
767 * We'll deadlock in no time, because of (not conforming to)
768 * lock ordering - the inodelock comes before any dquot lock,
769 * and we may drop and reacquire the ilock in xfs_qm_dqget().
770 */
771 if (udqhint)
772 xfs_dqunlock(udqhint);
773 /*
774 * Find the dquot from somewhere. This bumps the
775 * reference count of dquot and returns it locked.
776 * This can return ENOENT if the dquot didn't exist on
777 * disk and we didn't ask it to allocate;
778 * ESRCH if quotas got turned off suddenly.
779 */
780 if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
781 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
782 if (udqhint && dolock)
783 xfs_dqlock(udqhint);
784 goto done;
785 }
786
787 xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
788 /*
789 * dqget may have dropped and re-acquired the ilock, but it guarantees
790 * that the dquot returned is the one that should go in the inode.
791 */
792 *IO_idqpp = dqp;
793 ASSERT(dqp);
794 ASSERT(XFS_DQ_IS_LOCKED(dqp));
795 if (! dolock) {
796 xfs_dqunlock(dqp);
797 goto done;
798 }
799 if (! udqhint)
800 goto done;
801
802 ASSERT(udqhint);
803 ASSERT(dolock);
804 ASSERT(XFS_DQ_IS_LOCKED(dqp));
805 if (! xfs_qm_dqlock_nowait(udqhint)) {
806 xfs_dqunlock(dqp);
807 xfs_dqlock(udqhint);
808 xfs_dqlock(dqp);
809 }
810 done:
811#ifdef QUOTADEBUG
812 if (udqhint) {
813 if (dolock)
814 ASSERT(XFS_DQ_IS_LOCKED(udqhint));
815 }
816 if (! error) {
817 if (dolock)
818 ASSERT(XFS_DQ_IS_LOCKED(dqp));
819 }
820#endif
821 return (error);
822}
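/*
 * The tail of xfs_qm_dqattach_one shows the standard trylock-or-reorder
 * idiom for taking a second lock that sorts *before* one already held:
 * try it without sleeping, and on failure drop everything and reacquire
 * in canonical order.  A userspace sketch; the ordering rule modelled
 * is "udq before gdq", and the names are illustrative.
 */
#include <pthread.h>

static void lock_pair_in_order(pthread_mutex_t *udq, pthread_mutex_t *gdq)
{
    /* Caller already holds gdq but still needs udq, which orders first. */
    if (pthread_mutex_trylock(udq) != 0) {
        pthread_mutex_unlock(gdq);   /* give up the later lock...    */
        pthread_mutex_lock(udq);     /* ...then take both in         */
        pthread_mutex_lock(gdq);     /* canonical order: no deadlock */
    }
}

int main(void)
{
    pthread_mutex_t udq = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t gdq = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&gdq);        /* arrive holding only gdq */
    lock_pair_in_order(&udq, &gdq);  /* leave holding both      */
    return 0;
}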
823
824
825/*
826 * Given a udquot and gdquot, attach a ptr to the group dquot in the
827 * udquot as a hint for future lookups. The idea sounds simple, but the
828 * execution isn't, because the udquot might have a group dquot attached
829 * already and getting rid of that gets us into lock ordering constraints.
830 * The process is complicated more by the fact that the dquots may or may not
831 * be locked on entry.
832 */
833STATIC void
834xfs_qm_dqattach_grouphint(
835 xfs_dquot_t *udq,
836 xfs_dquot_t *gdq,
837 uint locked)
838{
839 xfs_dquot_t *tmp;
840
841#ifdef QUOTADEBUG
842 if (locked) {
843 ASSERT(XFS_DQ_IS_LOCKED(udq));
844 ASSERT(XFS_DQ_IS_LOCKED(gdq));
845 }
846#endif
847 if (! locked)
848 xfs_dqlock(udq);
849
850 if ((tmp = udq->q_gdquot)) {
851 if (tmp == gdq) {
852 if (! locked)
853 xfs_dqunlock(udq);
854 return;
855 }
856
857 udq->q_gdquot = NULL;
858 /*
859 * We can't keep any dqlocks when calling dqrele,
860 * because the freelist lock comes before dqlocks.
861 */
862 xfs_dqunlock(udq);
863 if (locked)
864 xfs_dqunlock(gdq);
865 /*
866 * We took a hard reference once upon a time in dqget,
867 * so give it back when the udquot no longer points at it.
868 * dqput() does the unlocking of the dquot.
869 */
870 xfs_qm_dqrele(tmp);
871
872 xfs_dqlock(udq);
873 xfs_dqlock(gdq);
874
875 } else {
876 ASSERT(XFS_DQ_IS_LOCKED(udq));
877 if (! locked) {
878 xfs_dqlock(gdq);
879 }
880 }
881
882 ASSERT(XFS_DQ_IS_LOCKED(udq));
883 ASSERT(XFS_DQ_IS_LOCKED(gdq));
884 /*
885 * Somebody could have attached a gdquot here,
886 * when we dropped the uqlock. If so, just do nothing.
887 */
888 if (udq->q_gdquot == NULL) {
889 XFS_DQHOLD(gdq);
890 udq->q_gdquot = gdq;
891 }
892 if (! locked) {
893 xfs_dqunlock(gdq);
894 xfs_dqunlock(udq);
895 }
896}
897
898
899/*
900 * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON
901 * into account.
902 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
903 * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
904 * much made this code a complete mess, but it has been pretty useful.
905 * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
906 * Inode may get unlocked and relocked in here, and the caller must deal with
907 * the consequences.
908 */
909int
910xfs_qm_dqattach(
911 xfs_inode_t *ip,
912 uint flags)
913{
914 xfs_mount_t *mp = ip->i_mount;
915 uint nquotas = 0;
916 int error = 0;
917
918 if ((! XFS_IS_QUOTA_ON(mp)) ||
919 (! XFS_NOT_DQATTACHED(mp, ip)) ||
920 (ip->i_ino == mp->m_sb.sb_uquotino) ||
921 (ip->i_ino == mp->m_sb.sb_gquotino))
922 return (0);
923
924 ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
925 XFS_ISLOCKED_INODE_EXCL(ip));
926
927 if (! (flags & XFS_QMOPT_ILOCKED))
928 xfs_ilock(ip, XFS_ILOCK_EXCL);
929
930 if (XFS_IS_UQUOTA_ON(mp)) {
931 error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
932 flags & XFS_QMOPT_DQALLOC,
933 flags & XFS_QMOPT_DQLOCK,
934 NULL, &ip->i_udquot);
935 if (error)
936 goto done;
937 nquotas++;
938 }
939 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
940 if (XFS_IS_GQUOTA_ON(mp)) {
941 error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
942 flags & XFS_QMOPT_DQALLOC,
943 flags & XFS_QMOPT_DQLOCK,
944 ip->i_udquot, &ip->i_gdquot);
945 /*
946 * Don't worry about the udquot that we may have
947 * attached above. It'll get detached, if not already.
948 */
949 if (error)
950 goto done;
951 nquotas++;
952 }
953
954 /*
955 * Attach this group quota to the user quota as a hint.
956 * This WON'T, in general, result in thrashing.
957 */
958 if (nquotas == 2) {
959 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
960 ASSERT(ip->i_udquot);
961 ASSERT(ip->i_gdquot);
962
963 /*
964 * We may or may not have the i_udquot locked at this point,
965 * but this check is OK since we don't depend on the i_gdquot to
966 * be accurate 100% all the time. It is just a hint, and this
967 * will succeed in general.
968 */
969 if (ip->i_udquot->q_gdquot == ip->i_gdquot)
970 goto done;
971 /*
972 * Attach i_gdquot to the gdquot hint inside the i_udquot.
973 */
974 xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
975 flags & XFS_QMOPT_DQLOCK);
976 }
977
978 done:
979
980#ifdef QUOTADEBUG
981 if (! error) {
982 if (ip->i_udquot) {
983 if (flags & XFS_QMOPT_DQLOCK)
984 ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
985 }
986 if (ip->i_gdquot) {
987 if (flags & XFS_QMOPT_DQLOCK)
988 ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
989 }
990 if (XFS_IS_UQUOTA_ON(mp))
991 ASSERT(ip->i_udquot);
992 if (XFS_IS_GQUOTA_ON(mp))
993 ASSERT(ip->i_gdquot);
994 }
995#endif
996
997 if (! (flags & XFS_QMOPT_ILOCKED))
998 xfs_iunlock(ip, XFS_ILOCK_EXCL);
999
1000#ifdef QUOTADEBUG
1001 else
1002 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
1003#endif
1004 return (error);
1005}
1006
1007/*
1008 * Release dquots (and their references) if any.
1009 * The inode should be locked EXCL except when this is called by
1010 * xfs_ireclaim.
1011 */
1012void
1013xfs_qm_dqdetach(
1014 xfs_inode_t *ip)
1015{
1016 if (!(ip->i_udquot || ip->i_gdquot))
1017 return;
1018
1019 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
1020 ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
1021 if (ip->i_udquot)
1022 xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
1023 if (ip->i_udquot) {
1024 xfs_qm_dqrele(ip->i_udquot);
1025 ip->i_udquot = NULL;
1026 }
1027 if (ip->i_gdquot) {
1028 xfs_qm_dqrele(ip->i_gdquot);
1029 ip->i_gdquot = NULL;
1030 }
1031}
1032
1033/*
1034 * This is called by VFS_SYNC; the flags arg determines the caller,
1035 * and its motives, as done in xfs_sync.
1036 *
1037 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
1038 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
1039 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
1040 */
1041
1042int
1043xfs_qm_sync(
1044 xfs_mount_t *mp,
1045 short flags)
1046{
1047 int recl, restarts;
1048 xfs_dquot_t *dqp;
1049 uint flush_flags;
1050 boolean_t nowait;
1051 int error;
1052
1053 restarts = 0;
1054 /*
1055 * We won't block unless we are asked to.
1056 */
1057 nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
1058
1059 again:
1060 xfs_qm_mplist_lock(mp);
1061 /*
1062 * dqpurge_all() also takes the mplist lock and iterates through all dquots
1063 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
1064 * when we have the mplist lock, we know that dquots will be consistent
1065 * as long as we have it locked.
1066 */
1067 if (! XFS_IS_QUOTA_ON(mp)) {
1068 xfs_qm_mplist_unlock(mp);
1069 return (0);
1070 }
1071 FOREACH_DQUOT_IN_MP(dqp, mp) {
1072 /*
1073 * If this is vfs_sync calling, then skip the dquots that
1074 * don't 'seem' to be dirty, i.e. don't acquire the dqlock.
1075 * This is very similar to what xfs_sync does with inodes.
1076 */
1077 if (flags & SYNC_BDFLUSH) {
1078 if (! XFS_DQ_IS_DIRTY(dqp))
1079 continue;
1080 }
1081
1082 if (nowait) {
1083 /*
1084 * Try to acquire the dquot lock. We are NOT out of
1085 * lock order, but we just don't want to wait for this
1086 * lock, unless somebody wanted us to.
1087 */
1088 if (! xfs_qm_dqlock_nowait(dqp))
1089 continue;
1090 } else {
1091 xfs_dqlock(dqp);
1092 }
1093
1094 /*
1095 * Now, find out for sure if this dquot is dirty or not.
1096 */
1097 if (! XFS_DQ_IS_DIRTY(dqp)) {
1098 xfs_dqunlock(dqp);
1099 continue;
1100 }
1101
1102 /* XXX a sentinel would be better */
1103 recl = XFS_QI_MPLRECLAIMS(mp);
1104 if (! xfs_qm_dqflock_nowait(dqp)) {
1105 if (nowait) {
1106 xfs_dqunlock(dqp);
1107 continue;
1108 }
1109 /*
1110 * If we can't grab the flush lock, and the caller
1111 * really wanted us to give this our best shot,
1112 * see if we can give a push to the buffer before we wait
1113 * on the flush lock. At this point, we know that
1114 * even though the dquot is being flushed,
1115 * it has (new) dirty data.
1116 */
1117 xfs_qm_dqflock_pushbuf_wait(dqp);
1118 }
1119 /*
1120 * Let go of the mplist lock. We don't want to hold it
1121 * across a disk write
1122 */
1123 flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
1124 xfs_qm_mplist_unlock(mp);
1125 xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
1126 error = xfs_qm_dqflush(dqp, flush_flags);
1127 xfs_dqunlock(dqp);
1128 if (error && XFS_FORCED_SHUTDOWN(mp))
1129 return(0); /* Need to prevent umount failure */
1130 else if (error)
1131 return (error);
1132
1133 xfs_qm_mplist_lock(mp);
1134 if (recl != XFS_QI_MPLRECLAIMS(mp)) {
1135 if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
1136 break;
1137
1138 xfs_qm_mplist_unlock(mp);
1139 goto again;
1140 }
1141 }
1142
1143 xfs_qm_mplist_unlock(mp);
1144 return (0);
1145}
1146
1147
1148/*
1149 * This initializes all the quota information that's kept in the
1150 * mount structure.
1151 */
1152int
1153xfs_qm_init_quotainfo(
1154 xfs_mount_t *mp)
1155{
1156 xfs_quotainfo_t *qinf;
1157 int error;
1158 xfs_dquot_t *dqp;
1159
1160 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1161
1162 /*
1163 * Tell XQM that we exist as soon as possible.
1164 */
1165 if ((error = xfs_qm_hold_quotafs_ref(mp))) {
1166 return (error);
1167 }
1168
1169 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
1170
1171 /*
1172 * See if quotainodes are setup, and if not, allocate them,
1173 * and change the superblock accordingly.
1174 */
1175 if ((error = xfs_qm_init_quotainos(mp))) {
1176 kmem_free(qinf, sizeof(xfs_quotainfo_t));
1177 mp->m_quotainfo = NULL;
1178 return (error);
1179 }
1180
1181 spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
1182 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1183 qinf->qi_dqreclaims = 0;
1184
1185 /* mutex used to serialize quotaoffs */
1186 mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff");
1187
1188 /* Precalc some constants */
1189 qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1190 ASSERT(qinf->qi_dqchunklen);
1191 qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
1192 do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
1193
1194 mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
1195
1196 /*
1197 * We try to get the limits from the superuser's limits fields.
1198 * This is quite hacky, but it is standard quota practice.
1199 * We look at the USR dquot with id == 0 first, but if user quotas
1200 * are not enabled we go to the GRP dquot with id == 0.
1201 * We don't really care to keep separate default limits for user
1202 * and group quotas, at least not at this point.
1203 */
1204 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
1205 (XFS_IS_UQUOTA_RUNNING(mp)) ?
1206 XFS_DQ_USER : XFS_DQ_GROUP,
1207 XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
1208 &dqp);
1209 if (! error) {
1210 xfs_disk_dquot_t *ddqp = &dqp->q_core;
1211
1212 /*
1213 * The warnings and timers set the grace period given to
1214 * a user or group before he or she can no longer perform
1215 * any more writing. If it is zero, a default is used.
1216 */
1217 qinf->qi_btimelimit =
1218 INT_GET(ddqp->d_btimer, ARCH_CONVERT) ?
1219 INT_GET(ddqp->d_btimer, ARCH_CONVERT) :
1220 XFS_QM_BTIMELIMIT;
1221 qinf->qi_itimelimit =
1222 INT_GET(ddqp->d_itimer, ARCH_CONVERT) ?
1223 INT_GET(ddqp->d_itimer, ARCH_CONVERT) :
1224 XFS_QM_ITIMELIMIT;
1225 qinf->qi_rtbtimelimit =
1226 INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) ?
1227 INT_GET(ddqp->d_rtbtimer, ARCH_CONVERT) :
1228 XFS_QM_RTBTIMELIMIT;
1229 qinf->qi_bwarnlimit =
1230 INT_GET(ddqp->d_bwarns, ARCH_CONVERT) ?
1231 INT_GET(ddqp->d_bwarns, ARCH_CONVERT) :
1232 XFS_QM_BWARNLIMIT;
1233 qinf->qi_iwarnlimit =
1234 INT_GET(ddqp->d_iwarns, ARCH_CONVERT) ?
1235 INT_GET(ddqp->d_iwarns, ARCH_CONVERT) :
1236 XFS_QM_IWARNLIMIT;
1237 qinf->qi_bhardlimit =
1238 INT_GET(ddqp->d_blk_hardlimit, ARCH_CONVERT);
1239 qinf->qi_bsoftlimit =
1240 INT_GET(ddqp->d_blk_softlimit, ARCH_CONVERT);
1241 qinf->qi_ihardlimit =
1242 INT_GET(ddqp->d_ino_hardlimit, ARCH_CONVERT);
1243 qinf->qi_isoftlimit =
1244 INT_GET(ddqp->d_ino_softlimit, ARCH_CONVERT);
1245 qinf->qi_rtbhardlimit =
1246 INT_GET(ddqp->d_rtb_hardlimit, ARCH_CONVERT);
1247 qinf->qi_rtbsoftlimit =
1248 INT_GET(ddqp->d_rtb_softlimit, ARCH_CONVERT);
1249
1250 /*
1251 * We sent the XFS_QMOPT_DQSUSER flag to dqget because
1252 * we don't want this dquot cached. We haven't done a
1253 * quotacheck yet, and quotacheck doesn't like incore dquots.
1254 */
1255 xfs_qm_dqdestroy(dqp);
1256 } else {
1257 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
1258 qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
1259 qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
1260 qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
1261 qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
1262 }
1263
1264 return (0);
1265}
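/*
 * Every limit above is filled in with the same "use the id-0 dquot's
 * field if nonzero, else a compiled-in default" pattern.  A tiny helper
 * makes the shape explicit; the default value here is illustrative
 * rather than the kernel's constant.
 */
#include <stdio.h>
#include <stdint.h>

#define DEF_BTIMELIMIT (7u * 24 * 60 * 60)  /* e.g. one week, in seconds */

static uint32_t limit_or_default(uint32_t ondisk, uint32_t def)
{
    return ondisk ? ondisk : def;   /* zero on disk means "use default" */
}

int main(void)
{
    printf("%u\n", limit_or_default(0, DEF_BTIMELIMIT));    /* default  */
    printf("%u\n", limit_or_default(3600, DEF_BTIMELIMIT)); /* explicit */
    return 0;
}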
1266
1267
1268/*
1269 * Gets called when unmounting a filesystem or when all quotas get
1270 * turned off.
1271 * This purges the quota inodes, destroys locks and frees itself.
1272 */
1273void
1274xfs_qm_destroy_quotainfo(
1275 xfs_mount_t *mp)
1276{
1277 xfs_quotainfo_t *qi;
1278
1279 qi = mp->m_quotainfo;
1280 ASSERT(qi != NULL);
1281 ASSERT(xfs_Gqm != NULL);
1282
1283 /*
1284 * Release the reference that XQM kept, so that we know
1285 * when the XQM structure should be freed. We cannot assume
1286 * that xfs_Gqm is non-null after this point.
1287 */
1288 xfs_qm_rele_quotafs_ref(mp);
1289
1290 spinlock_destroy(&qi->qi_pinlock);
1291 xfs_qm_list_destroy(&qi->qi_dqlist);
1292
1293 if (qi->qi_uquotaip) {
1294 XFS_PURGE_INODE(qi->qi_uquotaip);
1295 qi->qi_uquotaip = NULL; /* paranoia */
1296 }
1297 if (qi->qi_gquotaip) {
1298 XFS_PURGE_INODE(qi->qi_gquotaip);
1299 qi->qi_gquotaip = NULL;
1300 }
1301 mutex_destroy(&qi->qi_quotaofflock);
1302 kmem_free(qi, sizeof(xfs_quotainfo_t));
1303 mp->m_quotainfo = NULL;
1304}
1305
1306
1307
1308/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
1309
1310/* ARGSUSED */
1311STATIC void
1312xfs_qm_list_init(
1313 xfs_dqlist_t *list,
1314 char *str,
1315 int n)
1316{
1317 mutex_init(&list->qh_lock, MUTEX_DEFAULT, str);
1318 list->qh_next = NULL;
1319 list->qh_version = 0;
1320 list->qh_nelems = 0;
1321}
1322
1323STATIC void
1324xfs_qm_list_destroy(
1325 xfs_dqlist_t *list)
1326{
1327 mutex_destroy(&(list->qh_lock));
1328}
1329
1330
1331/*
1332 * Stripped down version of dqattach. This doesn't attach, or even look at the
1333 * dquots attached to the inode. The rationale is that there won't be any
1334 * attached at the time this is called from quotacheck.
1335 */
1336STATIC int
1337xfs_qm_dqget_noattach(
1338 xfs_inode_t *ip,
1339 xfs_dquot_t **O_udqpp,
1340 xfs_dquot_t **O_gdqpp)
1341{
1342 int error;
1343 xfs_mount_t *mp;
1344 xfs_dquot_t *udqp, *gdqp;
1345
1346 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
1347 mp = ip->i_mount;
1348 udqp = NULL;
1349 gdqp = NULL;
1350
1351 if (XFS_IS_UQUOTA_ON(mp)) {
1352 ASSERT(ip->i_udquot == NULL);
1353 /*
1354 * We want the dquot allocated if it doesn't exist.
1355 */
1356 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
1357 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
1358 &udqp))) {
1359 /*
1360 * Shouldn't be able to turn off quotas here.
1361 */
1362 ASSERT(error != ESRCH);
1363 ASSERT(error != ENOENT);
1364 return (error);
1365 }
1366 ASSERT(udqp);
1367 }
1368
1369 if (XFS_IS_GQUOTA_ON(mp)) {
1370 ASSERT(ip->i_gdquot == NULL);
1371 if (udqp)
1372 xfs_dqunlock(udqp);
1373 if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP,
1374 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
1375 &gdqp))) {
1376 if (udqp)
1377 xfs_qm_dqrele(udqp);
1378 ASSERT(error != ESRCH);
1379 ASSERT(error != ENOENT);
1380 return (error);
1381 }
1382 ASSERT(gdqp);
1383
1384 /* Reacquire the locks in the right order */
1385 if (udqp) {
1386 if (! xfs_qm_dqlock_nowait(udqp)) {
1387 xfs_dqunlock(gdqp);
1388 xfs_dqlock(udqp);
1389 xfs_dqlock(gdqp);
1390 }
1391 }
1392 }
1393
1394 *O_udqpp = udqp;
1395 *O_gdqpp = gdqp;
1396
1397#ifdef QUOTADEBUG
1398 if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
1399 if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
1400#endif
1401 return (0);
1402}
1403
1404/*
1405 * Create an inode and return with a reference already taken, but unlocked.
1406 * This is how we create quota inodes.
1407 */
1408STATIC int
1409xfs_qm_qino_alloc(
1410 xfs_mount_t *mp,
1411 xfs_inode_t **ip,
1412 __int64_t sbfields,
1413 uint flags)
1414{
1415 xfs_trans_t *tp;
1416 int error;
1417 unsigned long s;
1418 cred_t zerocr;
1419 int committed;
1420
1421 tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE);
1422 if ((error = xfs_trans_reserve(tp,
1423 XFS_QM_QINOCREATE_SPACE_RES(mp),
1424 XFS_CREATE_LOG_RES(mp), 0,
1425 XFS_TRANS_PERM_LOG_RES,
1426 XFS_CREATE_LOG_COUNT))) {
1427 xfs_trans_cancel(tp, 0);
1428 return (error);
1429 }
1430 memset(&zerocr, 0, sizeof(zerocr));
1431
1432 if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0,
1433 &zerocr, 0, 1, ip, &committed))) {
1434 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
1435 XFS_TRANS_ABORT);
1436 return (error);
1437 }
1438
1439 /*
1440 * Keep an extra reference to this quota inode. This inode is
1441 * locked exclusively and joined to the transaction already.
1442 */
1443 ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
1444 VN_HOLD(XFS_ITOV((*ip)));
1445
1446 /*
1447 * Make the changes in the superblock, and log those too.
1448 * sbfields arg may contain fields other than *QUOTINO;
1449 * VERSIONNUM for example.
1450 */
1451 s = XFS_SB_LOCK(mp);
1452 if (flags & XFS_QMOPT_SBVERSION) {
1453#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1454 unsigned oldv = mp->m_sb.sb_versionnum;
1455#endif
1456 ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
1457 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1458 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
1459 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1460 XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
1461
1462 XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
1463 mp->m_sb.sb_uquotino = NULLFSINO;
1464 mp->m_sb.sb_gquotino = NULLFSINO;
1465
1466 /* qflags will get updated _after_ quotacheck */
1467 mp->m_sb.sb_qflags = 0;
1468#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
1469 cmn_err(CE_NOTE,
1470 "Old superblock version %x, converting to %x.",
1471 oldv, mp->m_sb.sb_versionnum);
1472#endif
1473 }
1474 if (flags & XFS_QMOPT_UQUOTA)
1475 mp->m_sb.sb_uquotino = (*ip)->i_ino;
1476 else
1477 mp->m_sb.sb_gquotino = (*ip)->i_ino;
1478 XFS_SB_UNLOCK(mp, s);
1479 xfs_mod_sb(tp, sbfields);
1480
1481 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
1482 NULL))) {
1483 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
1484 return (error);
1485 }
1486 return (0);
1487}
1488
1489
1490STATIC int
1491xfs_qm_reset_dqcounts(
1492 xfs_mount_t *mp,
1493 xfs_buf_t *bp,
1494 xfs_dqid_t id,
1495 uint type)
1496{
1497 xfs_disk_dquot_t *ddq;
1498 int j;
1499
1500 xfs_buftrace("RESET DQUOTS", bp);
1501 /*
1502 * Reset all counters and timers. They'll be
1503 * started afresh by xfs_qm_quotacheck.
1504 */
1505#ifdef DEBUG
1506 j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
1507 do_div(j, sizeof(xfs_dqblk_t));
1508 ASSERT(XFS_QM_DQPERBLK(mp) == j);
1509#endif
1510 ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
1511 for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
1512 /*
1513 * Do a sanity check, and if needed, repair the dqblk. Don't
1514 * output any warnings because it's perfectly possible to
1515 * find uninitialized dquot blks. See comment in xfs_qm_dqcheck.
1516 */
1517 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1518 "xfs_quotacheck");
1519 INT_SET(ddq->d_bcount, ARCH_CONVERT, 0ULL);
1520 INT_SET(ddq->d_icount, ARCH_CONVERT, 0ULL);
1521 INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL);
1522 INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0);
1523 INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0);
1524 INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL);
1525 INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL);
1526 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
1527 }
1528
1529 return (0);
1530}
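
/*
 * Illustrative arithmetic for the DEBUG check above, using the figures
 * from the cluster-size comment in xfs_qm.h (136-byte on-disk dquot
 * records, 136 * 30 = 4080): with 4096-byte filesystem blocks,
 * XFS_QM_DQPERBLK(mp) works out to 4096 / 136 = 30 dquots per block,
 * with 16 bytes of slack. A standalone sketch, not kernel code:
 */
static int dquots_per_block(int blocksize, int dqrec_size)
{
	return blocksize / dqrec_size;	/* integer division; remainder is slack */
}

/* e.g. dquots_per_block(4096, 136) == 30 */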
1531
1532STATIC int
1533xfs_qm_dqiter_bufs(
1534 xfs_mount_t *mp,
1535 xfs_dqid_t firstid,
1536 xfs_fsblock_t bno,
1537 xfs_filblks_t blkcnt,
1538 uint flags)
1539{
1540 xfs_buf_t *bp;
1541 int error;
1542 int notcommitted;
1543 int incr;
1544
1545 ASSERT(blkcnt > 0);
1546 notcommitted = 0;
1547 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1548 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1549 error = 0;
1550
1551 /*
1552 * Blkcnt arg can be a very big number, and might even be
1553 * larger than the log itself. So, we have to break it up into
1554 * manageable-sized transactions.
1555 * Note that we don't start a permanent transaction here; we might
1556 * not be able to get a log reservation for the whole thing up front,
1557 * and we don't really care to either, because we just discard
1558 * everything if we were to crash in the middle of this loop.
1559 */
1560 while (blkcnt--) {
1561 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1562 XFS_FSB_TO_DADDR(mp, bno),
1563 (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
1564 if (error)
1565 break;
1566
1567 (void) xfs_qm_reset_dqcounts(mp, bp, firstid,
1568 flags & XFS_QMOPT_UQUOTA ?
1569 XFS_DQ_USER : XFS_DQ_GROUP);
1570 xfs_bdwrite(mp, bp);
1571 /*
1572	 * Go to the next block.
1573 */
1574 bno++;
1575 firstid += XFS_QM_DQPERBLK(mp);
1576 }
1577 return (error);
1578}
1579
1580/*
1581 * Iterate over all allocated USR/GRP dquots in the system, calling a
1582 * caller supplied function for every chunk of dquots that we find.
1583 */
1584STATIC int
1585xfs_qm_dqiterate(
1586 xfs_mount_t *mp,
1587 xfs_inode_t *qip,
1588 uint flags)
1589{
1590 xfs_bmbt_irec_t *map;
1591 int i, nmaps; /* number of map entries */
1592 int error; /* return value */
1593 xfs_fileoff_t lblkno;
1594 xfs_filblks_t maxlblkcnt;
1595 xfs_dqid_t firstid;
1596 xfs_fsblock_t rablkno;
1597 xfs_filblks_t rablkcnt;
1598
1599 error = 0;
1600 /*
1601 * This looks racy, but we can't keep an inode lock across a
1602 * trans_reserve. But, this gets called during quotacheck, and that
1603 * happens only at mount time, which is single-threaded.
1604 */
1605 if (qip->i_d.di_nblocks == 0)
1606 return (0);
1607
1608 map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
1609
1610 lblkno = 0;
1611 maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1612 do {
1613 nmaps = XFS_DQITER_MAP_SIZE;
1614 /*
1615 * We aren't changing the inode itself. Just changing
1616 * some of its data. No new blocks are added here, and
1617 * the inode is never added to the transaction.
1618 */
1619 xfs_ilock(qip, XFS_ILOCK_SHARED);
1620 error = xfs_bmapi(NULL, qip, lblkno,
1621 maxlblkcnt - lblkno,
1622 XFS_BMAPI_METADATA,
1623 NULL,
1624 0, map, &nmaps, NULL);
1625 xfs_iunlock(qip, XFS_ILOCK_SHARED);
1626 if (error)
1627 break;
1628
1629 ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
1630 for (i = 0; i < nmaps; i++) {
1631 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
1632 ASSERT(map[i].br_blockcount);
1633
1634
1635 lblkno += map[i].br_blockcount;
1636
1637 if (map[i].br_startblock == HOLESTARTBLOCK)
1638 continue;
1639
1640 firstid = (xfs_dqid_t) map[i].br_startoff *
1641 XFS_QM_DQPERBLK(mp);
1642 /*
1643 * Do a read-ahead on the next extent.
1644 */
1645 if ((i+1 < nmaps) &&
1646 (map[i+1].br_startblock != HOLESTARTBLOCK)) {
1647 rablkcnt = map[i+1].br_blockcount;
1648 rablkno = map[i+1].br_startblock;
1649 while (rablkcnt--) {
1650 xfs_baread(mp->m_ddev_targp,
1651 XFS_FSB_TO_DADDR(mp, rablkno),
1652 (int)XFS_QI_DQCHUNKLEN(mp));
1653 rablkno++;
1654 }
1655 }
1656 /*
1657 * Iterate thru all the blks in the extent and
1658 * reset the counters of all the dquots inside them.
1659 */
1660 if ((error = xfs_qm_dqiter_bufs(mp,
1661 firstid,
1662 map[i].br_startblock,
1663 map[i].br_blockcount,
1664 flags))) {
1665 break;
1666 }
1667 }
1668
1669 if (error)
1670 break;
1671 } while (nmaps > 0);
1672
1673 kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
1674
1675 return (error);
1676}
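
/*
 * Sketch of the id/offset mapping used above (userspace, hypothetical
 * names): dquots are stored densely in the quota inode, DQPERBLK per
 * filesystem block, so the first id in file-offset block `startoff` is
 * simply startoff * DQPERBLK.
 */
typedef unsigned int sketch_dqid_t;

static sketch_dqid_t first_id_in_block(unsigned long long startoff,
				       unsigned int dqperblk)
{
	/* e.g. with 30 dquots per block, block 2 starts at id 60 */
	return (sketch_dqid_t)(startoff * dqperblk);
}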
1677
1678/*
1679 * Called by dqusage_adjust in doing a quotacheck.
1680 * Given the inode, and a dquot (either USR or GRP, doesn't matter),
1681 * this updates the dquot's incore copy as well as the buffer copy. This is
1682 * so that once the quotacheck is done, we can just log all the buffers,
1683 * as opposed to logging numerous updates to individual dquots.
1684 */
1685STATIC void
1686xfs_qm_quotacheck_dqadjust(
1687 xfs_dquot_t *dqp,
1688 xfs_qcnt_t nblks,
1689 xfs_qcnt_t rtblks)
1690{
1691 ASSERT(XFS_DQ_IS_LOCKED(dqp));
1692 xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
1693 /*
1694 * Adjust the inode count and the block count to reflect this inode's
1695 * resource usage.
1696 */
1697 INT_MOD(dqp->q_core.d_icount, ARCH_CONVERT, +1);
1698 dqp->q_res_icount++;
1699 if (nblks) {
1700 INT_MOD(dqp->q_core.d_bcount, ARCH_CONVERT, nblks);
1701 dqp->q_res_bcount += nblks;
1702 }
1703 if (rtblks) {
1704 INT_MOD(dqp->q_core.d_rtbcount, ARCH_CONVERT, rtblks);
1705 dqp->q_res_rtbcount += rtblks;
1706 }
1707
1708 /*
1709 * Set default limits, adjust timers (since we changed usages)
1710 */
1711 if (! XFS_IS_SUSER_DQUOT(dqp)) {
1712 xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core);
1713 xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
1714 }
1715
1716 dqp->dq_flags |= XFS_DQ_DIRTY;
1717}
1718
1719STATIC int
1720xfs_qm_get_rtblks(
1721 xfs_inode_t *ip,
1722 xfs_qcnt_t *O_rtblks)
1723{
1724 xfs_filblks_t rtblks; /* total rt blks */
1725 xfs_ifork_t *ifp; /* inode fork pointer */
1726 xfs_extnum_t nextents; /* number of extent entries */
1727 xfs_bmbt_rec_t *base; /* base of extent array */
1728 xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
1729 int error;
1730
1731 ASSERT(XFS_IS_REALTIME_INODE(ip));
1732 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1733 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
1734 if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
1735 return (error);
1736 }
1737 rtblks = 0;
1738 nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
1739 base = &ifp->if_u1.if_extents[0];
1740 for (ep = base; ep < &base[nextents]; ep++)
1741 rtblks += xfs_bmbt_get_blockcount(ep);
1742 *O_rtblks = (xfs_qcnt_t)rtblks;
1743 return (0);
1744}
1745
1746/*
1747 * Callback routine supplied to bulkstat(). Given an inumber, find its
1748 * dquots and update them to account for resources taken by that inode.
1749 */
1750/* ARGSUSED */
1751STATIC int
1752xfs_qm_dqusage_adjust(
1753 xfs_mount_t *mp, /* mount point for filesystem */
1754 xfs_ino_t ino, /* inode number to get data for */
1755 void __user *buffer, /* not used */
1756 int ubsize, /* not used */
1757 void *private_data, /* not used */
1758 xfs_daddr_t bno, /* starting block of inode cluster */
1759 int *ubused, /* not used */
1760 void *dip, /* on-disk inode pointer (not used) */
1761 int *res) /* result code value */
1762{
1763 xfs_inode_t *ip;
1764 xfs_dquot_t *udqp, *gdqp;
1765 xfs_qcnt_t nblks, rtblks;
1766 int error;
1767
1768 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1769
1770 /*
1771 * rootino must have its resources accounted for, not so with the quota
1772 * inodes.
1773 */
1774 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1775 *res = BULKSTAT_RV_NOTHING;
1776 return XFS_ERROR(EINVAL);
1777 }
1778
1779 /*
1780 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
1781 * interface expects the inode to be exclusively locked because that's
1782 * the case in all other instances. It's OK that we do this because
1783 * quotacheck is done only at mount time.
1784 */
1785 if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) {
1786 *res = BULKSTAT_RV_NOTHING;
1787 return (error);
1788 }
1789
1790 if (ip->i_d.di_mode == 0) {
1791 xfs_iput_new(ip, XFS_ILOCK_EXCL);
1792 *res = BULKSTAT_RV_NOTHING;
1793 return XFS_ERROR(ENOENT);
1794 }
1795
1796 /*
1797 * Obtain the locked dquots. In case of an error (eg. allocation
1798 * fails for ENOSPC), we return the negative of the error number
1799 * to bulkstat, so that it can get propagated to quotacheck(),
1800 * which then disables quotas for the file system.
1801 */
1802 if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
1803 xfs_iput(ip, XFS_ILOCK_EXCL);
1804 *res = BULKSTAT_RV_GIVEUP;
1805 return (error);
1806 }
1807
1808 rtblks = 0;
1809 if (! XFS_IS_REALTIME_INODE(ip)) {
1810 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
1811 } else {
1812 /*
1813 * Walk thru the extent list and count the realtime blocks.
1814 */
1815 if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
1816 xfs_iput(ip, XFS_ILOCK_EXCL);
1817 if (udqp)
1818 xfs_qm_dqput(udqp);
1819 if (gdqp)
1820 xfs_qm_dqput(gdqp);
1821 *res = BULKSTAT_RV_GIVEUP;
1822 return (error);
1823 }
1824 nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
1825 }
1826 ASSERT(ip->i_delayed_blks == 0);
1827
1828 /*
1829 * We can't release the inode while holding its dquot locks.
1830 * The inode can go inactive and might try to acquire the dquot locks.
1831 * So, just unlock here and do a vn_rele at the end.
1832 */
1833 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1834
1835 /*
1836 * Add the (disk blocks and inode) resources occupied by this
1837 * inode to its dquots. We do this adjustment in the incore dquot,
1838 * and also copy the changes to its buffer.
1839 * We don't care about putting these changes in a transaction
1840 * envelope because if we crash in the middle of a 'quotacheck'
1841 * we have to start from the beginning anyway.
1842 * Once we're done, we'll log all the dquot bufs.
1843 *
1844 * The *QUOTA_ON checks below may look pretty racy, but quotachecks
1845 * and quotaoffs don't race. (Quotachecks happen at mount time only).
1846 */
1847 if (XFS_IS_UQUOTA_ON(mp)) {
1848 ASSERT(udqp);
1849 xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
1850 xfs_qm_dqput(udqp);
1851 }
1852 if (XFS_IS_GQUOTA_ON(mp)) {
1853 ASSERT(gdqp);
1854 xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
1855 xfs_qm_dqput(gdqp);
1856 }
1857 /*
1858 * Now release the inode. This will send it to 'inactive', and
1859 * possibly even free blocks.
1860 */
1861 VN_RELE(XFS_ITOV(ip));
1862
1863 /*
1864	 * Go to the next inode.
1865 */
1866 *res = BULKSTAT_RV_DIDONE;
1867 return (0);
1868}
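
/*
 * The bulkstat callback contract above, reduced to a standalone sketch
 * (userspace, hypothetical names): the walker hands every inode number
 * to the callback, which reports through *res whether it accounted the
 * inode, skipped it, or wants the whole walk aborted.
 */
enum walk_rv { RV_DIDONE, RV_NOTHING, RV_GIVEUP };

typedef int (*walk_cb_t)(unsigned long long ino, int *res);

static int walk_all_inodes(unsigned long long ninodes, walk_cb_t cb)
{
	unsigned long long ino;
	int res, error;

	for (ino = 0; ino < ninodes; ino++) {
		error = cb(ino, &res);	/* callback always sets *res */
		if (res == RV_GIVEUP)
			return error;	/* abort the walk, propagate the error */
		/* RV_DIDONE and RV_NOTHING both just continue */
	}
	return 0;
}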
1869
1870/*
1871 * Walk thru all the filesystem inodes and construct a consistent view
1872 * of the disk quota world. If the quotacheck fails, disable quotas.
1873 */
1874int
1875xfs_qm_quotacheck(
1876 xfs_mount_t *mp)
1877{
1878 int done, count, error;
1879 xfs_ino_t lastino;
1880 size_t structsz;
1881 xfs_inode_t *uip, *gip;
1882 uint flags;
1883
1884 count = INT_MAX;
1885 structsz = 1;
1886 lastino = 0;
1887 flags = 0;
1888
1889 ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
1890 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1891
1892 /*
1893 * There should be no cached dquots. The (simplistic) quotacheck
1894 * algorithm doesn't like that.
1895 */
1896 ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
1897
1898 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
1899
1900 /*
1901 * First we go thru all the dquots on disk, USR and GRP, and reset
1902 * their counters to zero. We need a clean slate.
1903 * We don't log our changes till later.
1904 */
1905 if ((uip = XFS_QI_UQIP(mp))) {
1906 if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
1907 goto error_return;
1908 flags |= XFS_UQUOTA_CHKD;
1909 }
1910
1911 if ((gip = XFS_QI_GQIP(mp))) {
1912 if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA)))
1913 goto error_return;
1914 flags |= XFS_GQUOTA_CHKD;
1915 }
1916
1917 do {
1918 /*
1919 * Iterate thru all the inodes in the file system,
1920 * adjusting the corresponding dquot counters in core.
1921 */
1922 if ((error = xfs_bulkstat(mp, &lastino, &count,
1923 xfs_qm_dqusage_adjust, NULL,
1924 structsz, NULL,
1925 BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
1926 &done)))
1927 break;
1928
1929 } while (! done);
1930
1931 /*
1932 * We can get this error if we couldn't do a dquot allocation inside
1933 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
1934 * dirty dquots that might be cached, we just want to get rid of them
1935 * and turn quotaoff. The dquots won't be attached to any of the inodes
1936 * at this point (because we intentionally didn't in dqget_noattach).
1937 */
1938 if (error) {
1939 xfs_qm_dqpurge_all(mp,
1940 XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA|
1941 XFS_QMOPT_QUOTAOFF);
1942 goto error_return;
1943 }
1944 /*
1945 * We've made all the changes that we need to make incore.
1946 * Now flush them down to the disk buffers.
1947 */
1948 xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
1949
1950 /*
1951 * We didn't log anything, because if we crash, we'll have to
1952 * start the quotacheck from scratch anyway. However, we must make
1953 * sure that our dquot changes are secure before we put the
1954 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1955 * flush.
1956 */
1957 XFS_bflush(mp->m_ddev_targp);
1958
1959 /*
1960 * If one type of quotas is off, then it will lose its
1961 * quotachecked status, since we won't be doing accounting for
1962 * that type anymore.
1963 */
1964 mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD);
1965 mp->m_qflags |= flags;
1966
1967 XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
1968
1969 error_return:
1970 if (error) {
1971 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): "
1972 "Disabling quotas.",
1973 mp->m_fsname, error);
1974 /*
1975 * We must turn off quotas.
1976 */
1977 ASSERT(mp->m_quotainfo != NULL);
1978 ASSERT(xfs_Gqm != NULL);
1979 xfs_qm_destroy_quotainfo(mp);
1980 xfs_mount_reset_sbqflags(mp);
1981 } else {
1982 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
1983 }
1984 return (error);
1985}
1986
1987/*
1988 * This is called after the superblock has been read in and we're ready to
1989 * iget the quota inodes.
1990 */
1991STATIC int
1992xfs_qm_init_quotainos(
1993 xfs_mount_t *mp)
1994{
1995 xfs_inode_t *uip, *gip;
1996 int error;
1997 __int64_t sbflags;
1998 uint flags;
1999
2000 ASSERT(mp->m_quotainfo);
2001 uip = gip = NULL;
2002 sbflags = 0;
2003 flags = 0;
2004
2005 /*
2006 * Get the uquota and gquota inodes
2007 */
2008 if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
2009 if (XFS_IS_UQUOTA_ON(mp) &&
2010 mp->m_sb.sb_uquotino != NULLFSINO) {
2011 ASSERT(mp->m_sb.sb_uquotino > 0);
2012 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
2013 0, 0, &uip, 0)))
2014 return XFS_ERROR(error);
2015 }
2016 if (XFS_IS_GQUOTA_ON(mp) &&
2017 mp->m_sb.sb_gquotino != NULLFSINO) {
2018 ASSERT(mp->m_sb.sb_gquotino > 0);
2019 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
2020 0, 0, &gip, 0))) {
2021 if (uip)
2022 VN_RELE(XFS_ITOV(uip));
2023 return XFS_ERROR(error);
2024 }
2025 }
2026 } else {
2027 flags |= XFS_QMOPT_SBVERSION;
2028 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
2029 XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
2030 }
2031
2032 /*
2033 * Create the two inodes, if they don't exist already. The changes
2034 * made above will get added to a transaction and logged in one of
2035 * the qino_alloc calls below. If the device is readonly,
2036 * temporarily switch to read-write to do this.
2037 */
2038 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
2039 if ((error = xfs_qm_qino_alloc(mp, &uip,
2040 sbflags | XFS_SB_UQUOTINO,
2041 flags | XFS_QMOPT_UQUOTA)))
2042 return XFS_ERROR(error);
2043
2044 flags &= ~XFS_QMOPT_SBVERSION;
2045 }
2046 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
2047 if ((error = xfs_qm_qino_alloc(mp, &gip,
2048 sbflags | XFS_SB_GQUOTINO,
2049 flags | XFS_QMOPT_GQUOTA))) {
2050 if (uip)
2051 VN_RELE(XFS_ITOV(uip));
2052
2053 return XFS_ERROR(error);
2054 }
2055 }
2056
2057 XFS_QI_UQIP(mp) = uip;
2058 XFS_QI_GQIP(mp) = gip;
2059
2060 return (0);
2061}
2062
2063
2064/*
2065 * Traverse the freelist of dquots and attempt to reclaim a maximum of
2066 * 'howmany' dquots. This operation races with dqlookup(), and attempts to
2067 * favor the lookup function ...
2068 * XXXsup merge this with qm_reclaim_one().
2069 */
2070STATIC int
2071xfs_qm_shake_freelist(
2072 int howmany)
2073{
2074 int nreclaimed;
2075 xfs_dqhash_t *hash;
2076 xfs_dquot_t *dqp, *nextdqp;
2077 int restarts;
2078 int nflushes;
2079
2080 if (howmany <= 0)
2081 return (0);
2082
2083 nreclaimed = 0;
2084 restarts = 0;
2085 nflushes = 0;
2086
2087#ifdef QUOTADEBUG
2088 cmn_err(CE_DEBUG, "Shake free 0x%x", howmany);
2089#endif
2090 /* lock order is : hashchainlock, freelistlock, mplistlock */
2091 tryagain:
2092 xfs_qm_freelist_lock(xfs_Gqm);
2093
2094 for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
2095 ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
2096 nreclaimed < howmany); ) {
2097 xfs_dqlock(dqp);
2098
2099 /*
2100 * We are racing with dqlookup here. Naturally we don't
2101 * want to reclaim a dquot that lookup wants.
2102 */
2103 if (dqp->dq_flags & XFS_DQ_WANT) {
2104 xfs_dqunlock(dqp);
2105 xfs_qm_freelist_unlock(xfs_Gqm);
2106 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2107 return (nreclaimed);
2108 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2109 goto tryagain;
2110 }
2111
2112 /*
2113 * If the dquot is inactive, we are assured that it is
2114 * not on the mplist or the hashlist, and that makes our
2115 * life easier.
2116 */
2117 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2118 ASSERT(dqp->q_mount == NULL);
2119 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2120 ASSERT(dqp->HL_PREVP == NULL);
2121 ASSERT(dqp->MPL_PREVP == NULL);
2122 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2123 nextdqp = dqp->dq_flnext;
2124 goto off_freelist;
2125 }
2126
2127 ASSERT(dqp->MPL_PREVP);
2128 /*
2129 * Try to grab the flush lock. If this dquot is in the process of
2130 * getting flushed to disk, we don't want to reclaim it.
2131 */
2132 if (! xfs_qm_dqflock_nowait(dqp)) {
2133 xfs_dqunlock(dqp);
2134 dqp = dqp->dq_flnext;
2135 continue;
2136 }
2137
2138 /*
2139 * We have the flush lock so we know that this is not in the
2140 * process of being flushed. So, if this is dirty, flush it
2141 * DELWRI so that we don't get a freelist infested with
2142 * dirty dquots.
2143 */
2144 if (XFS_DQ_IS_DIRTY(dqp)) {
2145 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
2146 /*
2147 * We flush it delayed write, so don't bother
2148 * releasing the mplock.
2149 */
2150 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2151 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2152 dqp = dqp->dq_flnext;
2153 continue;
2154 }
2155 /*
2156 * We're trying to get the hashlock out of order. This races
2157 * with dqlookup; so, we give up and go to the next dquot if
2158 * we couldn't get the hashlock. This way, we won't starve
2159 * a dqlookup process that holds the hashlock that is
2160 * waiting for the freelist lock.
2161 */
2162 if (! xfs_qm_dqhashlock_nowait(dqp)) {
2163 xfs_dqfunlock(dqp);
2164 xfs_dqunlock(dqp);
2165 dqp = dqp->dq_flnext;
2166 continue;
2167 }
2168 /*
2169 * This races with dquot allocation code as well as dqflush_all
2170 * and reclaim code. So, if we failed to grab the mplist lock,
2171 * give up everything and start over.
2172 */
2173 hash = dqp->q_hash;
2174 ASSERT(hash);
2175 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2176 /* XXX put a sentinel so that we can come back here */
2177 xfs_dqfunlock(dqp);
2178 xfs_dqunlock(dqp);
2179 XFS_DQ_HASH_UNLOCK(hash);
2180 xfs_qm_freelist_unlock(xfs_Gqm);
2181 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2182 return (nreclaimed);
2183 goto tryagain;
2184 }
2185 xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
2186#ifdef QUOTADEBUG
2187 cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n",
2188 dqp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
2189#endif
2190 ASSERT(dqp->q_nrefs == 0);
2191 nextdqp = dqp->dq_flnext;
2192 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2193 XQM_HASHLIST_REMOVE(hash, dqp);
2194 xfs_dqfunlock(dqp);
2195 xfs_qm_mplist_unlock(dqp->q_mount);
2196 XFS_DQ_HASH_UNLOCK(hash);
2197
2198 off_freelist:
2199 XQM_FREELIST_REMOVE(dqp);
2200 xfs_dqunlock(dqp);
2201 nreclaimed++;
2202 XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims);
2203 xfs_qm_dqdestroy(dqp);
2204 dqp = nextdqp;
2205 }
2206 xfs_qm_freelist_unlock(xfs_Gqm);
2207 return (nreclaimed);
2208}
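
/*
 * The restart discipline above as a standalone sketch (pthreads,
 * hypothetical names): locks are taken strictly in the documented
 * order; an out-of-order lock is only ever trylocked, and on failure
 * everything is dropped and the scan restarts from the top, bounded by
 * a restart budget so it cannot livelock against the favored path.
 */
#include <pthread.h>

#define SKETCH_MAX_RESTARTS	4

static int scan_with_restarts(pthread_mutex_t *first, pthread_mutex_t *second)
{
	int restarts = 0;

tryagain:
	pthread_mutex_lock(first);
	if (pthread_mutex_trylock(second) != 0) {
		pthread_mutex_unlock(first);
		if (++restarts >= SKETCH_MAX_RESTARTS)
			return 0;	/* give up rather than spin forever */
		goto tryagain;
	}
	/* ... work under both locks ... */
	pthread_mutex_unlock(second);
	pthread_mutex_unlock(first);
	return 1;
}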
2209
2210
2211/*
2212 * The kmem_shake interface is invoked when memory is running low.
2213 */
2214/* ARGSUSED */
2215STATIC int
2216xfs_qm_shake(int nr_to_scan, unsigned int gfp_mask)
2217{
2218 int ndqused, nfree, n;
2219
2220 if (!kmem_shake_allow(gfp_mask))
2221 return (0);
2222 if (!xfs_Gqm)
2223 return (0);
2224
2225 nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
2226 /* incore dquots in all f/s's */
2227 ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
2228
2229 ASSERT(ndqused >= 0);
2230
2231 if (nfree <= ndqused && nfree < ndquot)
2232 return (0);
2233
2234 ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */
2235 n = nfree - ndqused - ndquot; /* # over target */
2236
2237 return xfs_qm_shake_freelist(MAX(nfree, n));
2238}
2239
2240
2241/*
2242 * Just pop the least recently used dquot off the freelist and
2243 * recycle it. The returned dquot is locked.
2244 */
2245STATIC xfs_dquot_t *
2246xfs_qm_dqreclaim_one(void)
2247{
2248 xfs_dquot_t *dqpout;
2249 xfs_dquot_t *dqp;
2250 int restarts;
2251 int nflushes;
2252
2253 restarts = 0;
2254 dqpout = NULL;
2255 nflushes = 0;
2256
2257 /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
2258 startagain:
2259 xfs_qm_freelist_lock(xfs_Gqm);
2260
2261 FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
2262 xfs_dqlock(dqp);
2263
2264 /*
2265 * We are racing with dqlookup here. Naturally we don't
2266 * want to reclaim a dquot that lookup wants. We release the
2267 * freelist lock and start over, so that lookup will grab
2268 * both the dquot and the freelistlock.
2269 */
2270 if (dqp->dq_flags & XFS_DQ_WANT) {
2271 ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
2272 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
2273 xfs_dqunlock(dqp);
2274 xfs_qm_freelist_unlock(xfs_Gqm);
2275 if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
2276 return (NULL);
2277 XQM_STATS_INC(xqmstats.xs_qm_dqwants);
2278 goto startagain;
2279 }
2280
2281 /*
2282 * If the dquot is inactive, we are assured that it is
2283 * not on the mplist or the hashlist, and that makes our
2284 * life easier.
2285 */
2286 if (dqp->dq_flags & XFS_DQ_INACTIVE) {
2287 ASSERT(dqp->q_mount == NULL);
2288 ASSERT(! XFS_DQ_IS_DIRTY(dqp));
2289 ASSERT(dqp->HL_PREVP == NULL);
2290 ASSERT(dqp->MPL_PREVP == NULL);
2291 XQM_FREELIST_REMOVE(dqp);
2292 xfs_dqunlock(dqp);
2293 dqpout = dqp;
2294 XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
2295 break;
2296 }
2297
2298 ASSERT(dqp->q_hash);
2299 ASSERT(dqp->MPL_PREVP);
2300
2301 /*
2302 * Try to grab the flush lock. If this dquot is in the process of
2303 * getting flushed to disk, we don't want to reclaim it.
2304 */
2305 if (! xfs_qm_dqflock_nowait(dqp)) {
2306 xfs_dqunlock(dqp);
2307 continue;
2308 }
2309
2310 /*
2311 * We have the flush lock so we know that this is not in the
2312 * process of being flushed. So, if this is dirty, flush it
2313 * DELWRI so that we don't get a freelist infested with
2314 * dirty dquots.
2315 */
2316 if (XFS_DQ_IS_DIRTY(dqp)) {
2317 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
2318 /*
2319 * We flush it delayed write, so don't bother
2320 * releasing the freelist lock.
2321 */
2322 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2323 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2324 continue;
2325 }
2326
2327 if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
2328 xfs_dqfunlock(dqp);
2329 xfs_dqunlock(dqp);
2330 continue;
2331 }
2332
2333 if (! xfs_qm_dqhashlock_nowait(dqp))
2334 goto mplistunlock;
2335
2336 ASSERT(dqp->q_nrefs == 0);
2337 xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
2338 XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
2339 XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
2340 XQM_FREELIST_REMOVE(dqp);
2341 dqpout = dqp;
2342 XFS_DQ_HASH_UNLOCK(dqp->q_hash);
2343 mplistunlock:
2344 xfs_qm_mplist_unlock(dqp->q_mount);
2345 xfs_dqfunlock(dqp);
2346 xfs_dqunlock(dqp);
2347 if (dqpout)
2348 break;
2349 }
2350
2351 xfs_qm_freelist_unlock(xfs_Gqm);
2352 return (dqpout);
2353}
2354
2355
2356/*------------------------------------------------------------------*/
2357
2358/*
2359 * Return a new incore dquot. Depending on the number of
2360 * dquots in the system, we either allocate a new one on the kernel heap,
2361 * or reclaim a free one.
2362 * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
2363 * to reclaim an existing one from the freelist.
2364 */
2365boolean_t
2366xfs_qm_dqalloc_incore(
2367 xfs_dquot_t **O_dqpp)
2368{
2369 xfs_dquot_t *dqp;
2370
2371 /*
2372 * Check against high water mark to see if we want to pop
2373 * a nincompoop dquot off the freelist.
2374 */
2375 if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
2376 /*
2377 * Try to recycle a dquot from the freelist.
2378 */
2379 if ((dqp = xfs_qm_dqreclaim_one())) {
2380 XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
2381 /*
2382 * Just zero the core here. The rest will get
2383 * reinitialized by the caller. XXX we shouldn't even
2384 * do this zero ...
2385 */
2386 memset(&dqp->q_core, 0, sizeof(dqp->q_core));
2387 *O_dqpp = dqp;
2388 return (B_FALSE);
2389 }
2390 XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
2391 }
2392
2393 /*
2394 * Allocate a brand new dquot on the kernel heap and return it
2395 * to the caller to initialize.
2396 */
2397 ASSERT(xfs_Gqm->qm_dqzone != NULL);
2398 *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
2399 atomic_inc(&xfs_Gqm->qm_totaldquots);
2400
2401 return (B_TRUE);
2402}
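
/*
 * Minimal userspace sketch of the allocate-or-recycle policy above
 * (hypothetical names): past a high-water mark, prefer recycling an
 * object off a free list and zeroing only its core; otherwise fall back
 * to a fresh heap allocation, mirroring the B_FALSE/B_TRUE return.
 */
#include <stdlib.h>
#include <string.h>

struct sk_obj { struct sk_obj *next; char core[64]; };

static struct sk_obj *sk_freelist;
static int sk_total, sk_highwater = 1024;

static struct sk_obj *sk_alloc(int *fresh)
{
	struct sk_obj *o;

	if (sk_total >= sk_highwater && sk_freelist) {
		o = sk_freelist;		/* recycle from the free list */
		sk_freelist = o->next;
		memset(o->core, 0, sizeof(o->core));	/* zero only the core */
		*fresh = 0;
		return o;
	}
	o = calloc(1, sizeof(*o));		/* brand new allocation */
	if (o)
		sk_total++;
	*fresh = 1;
	return o;
}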
2403
2404
2405/*
2406 * Start a transaction and write the incore superblock changes to
2407 * disk. flags parameter indicates which fields have changed.
2408 */
2409int
2410xfs_qm_write_sb_changes(
2411 xfs_mount_t *mp,
2412 __int64_t flags)
2413{
2414 xfs_trans_t *tp;
2415 int error;
2416
2417#ifdef QUOTADEBUG
2418 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname);
2419#endif
2420 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2421 if ((error = xfs_trans_reserve(tp, 0,
2422 mp->m_sb.sb_sectsize + 128, 0,
2423 0,
2424 XFS_DEFAULT_LOG_COUNT))) {
2425 xfs_trans_cancel(tp, 0);
2426 return (error);
2427 }
2428
2429 xfs_mod_sb(tp, flags);
2430 (void) xfs_trans_commit(tp, 0, NULL);
2431
2432 return (0);
2433}
2434
2435
2436/* --------------- utility functions for vnodeops ---------------- */
2437
2438
2439/*
2440 * Given an inode, a uid and gid (from cred_t), make sure that we have
2441 * allocated relevant dquot(s) on disk, and that we won't exceed inode
2442 * quotas by creating this file.
2443 * This also attaches dquot(s) to the given inode after locking it,
2444 * and returns the dquots corresponding to the uid and/or gid.
2445 *
2446 * in : inode (unlocked)
2447 * out : udquot, gdquot with references taken and unlocked
2448 */
2449int
2450xfs_qm_vop_dqalloc(
2451 xfs_mount_t *mp,
2452 xfs_inode_t *ip,
2453 uid_t uid,
2454 gid_t gid,
2455 uint flags,
2456 xfs_dquot_t **O_udqpp,
2457 xfs_dquot_t **O_gdqpp)
2458{
2459 int error;
2460 xfs_dquot_t *uq, *gq;
2461 uint lockflags;
2462
2463 if (!XFS_IS_QUOTA_ON(mp))
2464 return 0;
2465
2466 lockflags = XFS_ILOCK_EXCL;
2467 xfs_ilock(ip, lockflags);
2468
2469 if ((flags & XFS_QMOPT_INHERIT) &&
2470 XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
2471 gid = ip->i_d.di_gid;
2472
2473 /*
2474 * Attach the dquot(s) to this inode, doing a dquot allocation
2475 * if necessary. The dquot(s) will not be locked.
2476 */
2477 if (XFS_NOT_DQATTACHED(mp, ip)) {
2478 if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
2479 XFS_QMOPT_ILOCKED))) {
2480 xfs_iunlock(ip, lockflags);
2481 return (error);
2482 }
2483 }
2484
2485 uq = gq = NULL;
2486 if ((flags & XFS_QMOPT_UQUOTA) &&
2487 XFS_IS_UQUOTA_ON(mp)) {
2488 if (ip->i_d.di_uid != uid) {
2489 /*
2490 * What we need is the dquot that has this uid, and
2491 * if we send the inode to dqget, the uid of the inode
2492 * takes priority over what's sent in the uid argument.
2493 * We must unlock the inode here before calling dqget if
2494 * we're not sending the inode, because otherwise
2495 * we'll deadlock by doing trans_reserve while
2496 * holding ilock.
2497 */
2498 xfs_iunlock(ip, lockflags);
2499 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
2500 XFS_DQ_USER,
2501 XFS_QMOPT_DQALLOC |
2502 XFS_QMOPT_DOWARN,
2503 &uq))) {
2504 ASSERT(error != ENOENT);
2505 return (error);
2506 }
2507 /*
2508 * Get the ilock in the right order.
2509 */
2510 xfs_dqunlock(uq);
2511 lockflags = XFS_ILOCK_SHARED;
2512 xfs_ilock(ip, lockflags);
2513 } else {
2514 /*
2515 * Take an extra reference, because we'll return
2516	 * this to the caller.
2517 */
2518 ASSERT(ip->i_udquot);
2519 uq = ip->i_udquot;
2520 xfs_dqlock(uq);
2521 XFS_DQHOLD(uq);
2522 xfs_dqunlock(uq);
2523 }
2524 }
2525 if ((flags & XFS_QMOPT_GQUOTA) &&
2526 XFS_IS_GQUOTA_ON(mp)) {
2527 if (ip->i_d.di_gid != gid) {
2528 xfs_iunlock(ip, lockflags);
2529 if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
2530 XFS_DQ_GROUP,
2531 XFS_QMOPT_DQALLOC |
2532 XFS_QMOPT_DOWARN,
2533 &gq))) {
2534 if (uq)
2535 xfs_qm_dqrele(uq);
2536 ASSERT(error != ENOENT);
2537 return (error);
2538 }
2539 xfs_dqunlock(gq);
2540 lockflags = XFS_ILOCK_SHARED;
2541 xfs_ilock(ip, lockflags);
2542 } else {
2543 ASSERT(ip->i_gdquot);
2544 gq = ip->i_gdquot;
2545 xfs_dqlock(gq);
2546 XFS_DQHOLD(gq);
2547 xfs_dqunlock(gq);
2548 }
2549 }
2550 if (uq)
2551 xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
2552
2553 xfs_iunlock(ip, lockflags);
2554 if (O_udqpp)
2555 *O_udqpp = uq;
2556 else if (uq)
2557 xfs_qm_dqrele(uq);
2558 if (O_gdqpp)
2559 *O_gdqpp = gq;
2560 else if (gq)
2561 xfs_qm_dqrele(gq);
2562 return (0);
2563}
2564
2565/*
2566 * Actually transfer ownership, and do dquot modifications.
2567 * These were already reserved.
2568 */
2569xfs_dquot_t *
2570xfs_qm_vop_chown(
2571 xfs_trans_t *tp,
2572 xfs_inode_t *ip,
2573 xfs_dquot_t **IO_olddq,
2574 xfs_dquot_t *newdq)
2575{
2576 xfs_dquot_t *prevdq;
2577 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
2578 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
2579
2580 /* old dquot */
2581 prevdq = *IO_olddq;
2582 ASSERT(prevdq);
2583 ASSERT(prevdq != newdq);
2584
2585 xfs_trans_mod_dquot(tp, prevdq,
2586 XFS_TRANS_DQ_BCOUNT,
2587 -(ip->i_d.di_nblocks));
2588 xfs_trans_mod_dquot(tp, prevdq,
2589 XFS_TRANS_DQ_ICOUNT,
2590 -1);
2591
2592 /* the sparkling new dquot */
2593 xfs_trans_mod_dquot(tp, newdq,
2594 XFS_TRANS_DQ_BCOUNT,
2595 ip->i_d.di_nblocks);
2596 xfs_trans_mod_dquot(tp, newdq,
2597 XFS_TRANS_DQ_ICOUNT,
2598 1);
2599
2600 /*
2601 * Take an extra reference, because the inode
2602 * is going to keep this dquot pointer even
2603 * after the trans_commit.
2604 */
2605 xfs_dqlock(newdq);
2606 XFS_DQHOLD(newdq);
2607 xfs_dqunlock(newdq);
2608 *IO_olddq = newdq;
2609
2610 return (prevdq);
2611}
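
/*
 * The chown transfer above in miniature (illustrative, userspace):
 * moving an inode's usage between quota records is two symmetric
 * adjustments -- done under one transaction in the real code, so both
 * sides commit or neither does.
 */
struct sk_usage { long long blocks, inodes; };

static void sk_chown_usage(struct sk_usage *olddq, struct sk_usage *newdq,
			   long long nblocks)
{
	olddq->blocks -= nblocks;	/* debit the previous owner */
	olddq->inodes -= 1;
	newdq->blocks += nblocks;	/* credit the new owner */
	newdq->inodes += 1;
}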
2612
2613/*
2614 * Quota reservations for setattr(AT_UID|AT_GID).
2615 */
2616int
2617xfs_qm_vop_chown_reserve(
2618 xfs_trans_t *tp,
2619 xfs_inode_t *ip,
2620 xfs_dquot_t *udqp,
2621 xfs_dquot_t *gdqp,
2622 uint flags)
2623{
2624 int error;
2625 xfs_mount_t *mp;
2626 uint delblks;
2627 xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq;
2628
2629 ASSERT(XFS_ISLOCKED_INODE(ip));
2630 mp = ip->i_mount;
2631 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
2632
2633 delblks = ip->i_delayed_blks;
2634 delblksudq = delblksgdq = unresudq = unresgdq = NULL;
2635
2636 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
2637 ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) {
2638 delblksudq = udqp;
2639 /*
2640 * If there are delayed allocation blocks, then we have to
2641 * unreserve those from the old dquot, and add them to the
2642 * new dquot.
2643 */
2644 if (delblks) {
2645 ASSERT(ip->i_udquot);
2646 unresudq = ip->i_udquot;
2647 }
2648 }
2649 if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
2650 ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) {
2651 delblksgdq = gdqp;
2652 if (delblks) {
2653 ASSERT(ip->i_gdquot);
2654 unresgdq = ip->i_gdquot;
2655 }
2656 }
2657
2658 if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
2659 delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
2660 flags | XFS_QMOPT_RES_REGBLKS)))
2661 return (error);
2662
2663 /*
2664 * Do the delayed blks reservations/unreservations now. Since these
2665 * are done without the help of a transaction, if a reservation fails,
2666 * its previous reservations won't be automatically undone by trans
2667 * code. So, we have to do it manually here.
2668 */
2669 if (delblks) {
2670 /*
2671 * Do the reservations first. Unreservation can't fail.
2672 */
2673 ASSERT(delblksudq || delblksgdq);
2674 ASSERT(unresudq || unresgdq);
2675 if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
2676 delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
2677 flags | XFS_QMOPT_RES_REGBLKS)))
2678 return (error);
2679 xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
2680 unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
2681 XFS_QMOPT_RES_REGBLKS);
2682 }
2683
2684 return (0);
2685}
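
/*
 * Sketch of the delayed-block step above (hypothetical names): with no
 * transaction to unwind a failure, the order matters -- reserve against
 * the new owner first, and only if that succeeds release the old
 * owner's reservation, which by construction cannot fail.
 */
struct sk_resv { long long reserved, limit; };

static int sk_shift_delblks(struct sk_resv *newq, struct sk_resv *oldq,
			    long long delblks)
{
	if (newq->reserved + delblks > newq->limit)
		return -1;		/* over quota: fail with nothing to undo */
	newq->reserved += delblks;	/* reserve on the new dquot first */
	oldq->reserved -= delblks;	/* then unreserve the old; cannot fail */
	return 0;
}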
2686
2687int
2688xfs_qm_vop_rename_dqattach(
2689 xfs_inode_t **i_tab)
2690{
2691 xfs_inode_t *ip;
2692 int i;
2693 int error;
2694
2695 ip = i_tab[0];
2696
2697 if (! XFS_IS_QUOTA_ON(ip->i_mount))
2698 return (0);
2699
2700 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2701 error = xfs_qm_dqattach(ip, 0);
2702 if (error)
2703 return (error);
2704 }
2705 for (i = 1; (i < 4 && i_tab[i]); i++) {
2706 /*
2707 * Watch out for duplicate entries in the table.
2708 */
2709 if ((ip = i_tab[i]) != i_tab[i-1]) {
2710 if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
2711 error = xfs_qm_dqattach(ip, 0);
2712 if (error)
2713 return (error);
2714 }
2715 }
2716 }
2717 return (0);
2718}
2719
2720void
2721xfs_qm_vop_dqattach_and_dqmod_newinode(
2722 xfs_trans_t *tp,
2723 xfs_inode_t *ip,
2724 xfs_dquot_t *udqp,
2725 xfs_dquot_t *gdqp)
2726{
2727 if (!XFS_IS_QUOTA_ON(tp->t_mountp))
2728 return;
2729
2730 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
2731 ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
2732
2733 if (udqp) {
2734 xfs_dqlock(udqp);
2735 XFS_DQHOLD(udqp);
2736 xfs_dqunlock(udqp);
2737 ASSERT(ip->i_udquot == NULL);
2738 ip->i_udquot = udqp;
2739 ASSERT(ip->i_d.di_uid == INT_GET(udqp->q_core.d_id, ARCH_CONVERT));
2740 xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
2741 }
2742 if (gdqp) {
2743 xfs_dqlock(gdqp);
2744 XFS_DQHOLD(gdqp);
2745 xfs_dqunlock(gdqp);
2746 ASSERT(ip->i_gdquot == NULL);
2747 ip->i_gdquot = gdqp;
2748 ASSERT(ip->i_d.di_gid == INT_GET(gdqp->q_core.d_id, ARCH_CONVERT));
2749 xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
2750 }
2751}
2752
2753/* ------------- list stuff -----------------*/
2754void
2755xfs_qm_freelist_init(xfs_frlist_t *ql)
2756{
2757 ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
2758 mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf");
2759 ql->qh_version = 0;
2760 ql->qh_nelems = 0;
2761}
2762
2763void
2764xfs_qm_freelist_destroy(xfs_frlist_t *ql)
2765{
2766 xfs_dquot_t *dqp, *nextdqp;
2767
2768 mutex_lock(&ql->qh_lock, PINOD);
2769 for (dqp = ql->qh_next;
2770 dqp != (xfs_dquot_t *)ql; ) {
2771 xfs_dqlock(dqp);
2772 nextdqp = dqp->dq_flnext;
2773#ifdef QUOTADEBUG
2774 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp);
2775#endif
2776 XQM_FREELIST_REMOVE(dqp);
2777 xfs_dqunlock(dqp);
2778 xfs_qm_dqdestroy(dqp);
2779 dqp = nextdqp;
2780 }
2781 /*
2782	 * Don't bother unlocking.
2783 */
2784 mutex_destroy(&ql->qh_lock);
2785
2786 ASSERT(ql->qh_nelems == 0);
2787}
2788
2789void
2790xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
2791{
2792 dq->dq_flnext = ql->qh_next;
2793 dq->dq_flprev = (xfs_dquot_t *)ql;
2794 ql->qh_next = dq;
2795 dq->dq_flnext->dq_flprev = dq;
2796 xfs_Gqm->qm_dqfreelist.qh_nelems++;
2797 xfs_Gqm->qm_dqfreelist.qh_version++;
2798}
2799
2800void
2801xfs_qm_freelist_unlink(xfs_dquot_t *dq)
2802{
2803 xfs_dquot_t *next = dq->dq_flnext;
2804 xfs_dquot_t *prev = dq->dq_flprev;
2805
2806 next->dq_flprev = prev;
2807 prev->dq_flnext = next;
2808 dq->dq_flnext = dq->dq_flprev = dq;
2809 xfs_Gqm->qm_dqfreelist.qh_nelems--;
2810 xfs_Gqm->qm_dqfreelist.qh_version++;
2811}
2812
2813void
2814xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
2815{
2816 xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
2817}
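
/*
 * The freelist above is a circular doubly-linked list whose head acts
 * as a sentinel (qh_next/qh_prev line up with dq_flnext/dq_flprev), so
 * insert and unlink need no NULL checks, and append is just an insert
 * after qh_prev. A standalone sketch of the same shape:
 */
struct sk_node { struct sk_node *next, *prev; };

static void sk_list_init(struct sk_node *head)
{
	head->next = head->prev = head;	/* empty list points at itself */
}

static void sk_insert_after(struct sk_node *pos, struct sk_node *n)
{
	n->next = pos->next;
	n->prev = pos;
	pos->next->prev = n;
	pos->next = n;
}

static void sk_unlink(struct sk_node *n)
{
	n->next->prev = n->prev;
	n->prev->next = n->next;
	n->next = n->prev = n;		/* self-linked, like an unlinked dquot */
}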
2818
2819int
2820xfs_qm_dqhashlock_nowait(
2821 xfs_dquot_t *dqp)
2822{
2823 int locked;
2824
2825 locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
2826 return (locked);
2827}
2828
2829int
2830xfs_qm_freelist_lock_nowait(
2831 xfs_qm_t *xqm)
2832{
2833 int locked;
2834
2835 locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
2836 return (locked);
2837}
2838
2839int
2840xfs_qm_mplist_nowait(
2841 xfs_mount_t *mp)
2842{
2843 int locked;
2844
2845 ASSERT(mp->m_quotainfo);
2846 locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
2847 return (locked);
2848}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
new file mode 100644
index 000000000000..dcf1a7a831d8
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm.h
@@ -0,0 +1,236 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QM_H__
33#define __XFS_QM_H__
34
35#include "xfs_dquot_item.h"
36#include "xfs_dquot.h"
37#include "xfs_quota_priv.h"
38#include "xfs_qm_stats.h"
39
40struct xfs_qm;
41struct xfs_inode;
42
43extern mutex_t xfs_Gqm_lock;
44extern struct xfs_qm *xfs_Gqm;
45extern kmem_zone_t *qm_dqzone;
46extern kmem_zone_t *qm_dqtrxzone;
47
48/*
49 * Used in xfs_qm_sync (called by xfs_sync) to bound the number of times it
50 * can iterate over the mountpoint's dquot list in one call.
51 */
52#define XFS_QM_SYNC_MAX_RESTARTS 7
53
54/*
55 * Ditto, for xfs_qm_dqreclaim_one.
56 */
57#define XFS_QM_RECLAIM_MAX_RESTARTS 4
58
59/*
60 * Ideal ratio of free to in-use dquots. The quota manager makes an attempt
61 * to keep this balance.
62 */
63#define XFS_QM_DQFREE_RATIO 2
64
65/*
66 * Dquot hashtable constants/threshold values.
67 */
68#define XFS_QM_NCSIZE_THRESHOLD 5000
69#define XFS_QM_HASHSIZE_LOW 32
70#define XFS_QM_HASHSIZE_HIGH 64
71
72/*
73 * We output a cmn_err when quotachecking a quota file with more than
74 * this many fsbs.
75 */
76#define XFS_QM_BIG_QCHECK_NBLKS 500
77
78/*
79 * This defines the unit of allocation of dquots.
80 * Currently, it is just one file system block, and a 4K blk contains 30
81 * (136 * 30 = 4080) dquots. It's probably not worth trying to make
82 * this more dynamic.
83 * XXXsup However, if this number is changed, we have to make sure that we don't
84 * implicitly assume that we do allocations in chunks of a single filesystem
85 * block in the dquot/xqm code.
86 */
87#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
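
/*
 * Illustrative check of the arithmetic in the comment above: 30 dquots
 * of 136 bytes fill 4080 of a 4096-byte block, the rest is slack. A
 * sketch only, spelled as a portable C89 compile-time assertion (a
 * negative array size would fail the build).
 */
typedef char sk_dqblk_fits_in_4k[(136 * 30 <= 4096) ? 1 : -1];
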
88/*
89 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
90 * in a single transaction. We don't want to ask for too large a log reservation.
91 */
92#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
93
94typedef xfs_dqhash_t xfs_dqlist_t;
95/*
96 * The freelist head. The first two fields match the first two in the
97 * xfs_dquot_t structure (in xfs_dqmarker_t)
98 */
99typedef struct xfs_frlist {
100 struct xfs_dquot *qh_next;
101 struct xfs_dquot *qh_prev;
102 mutex_t qh_lock;
103 uint qh_version;
104 uint qh_nelems;
105} xfs_frlist_t;
106
107/*
108 * Quota Manager (global) structure. Lives only in core.
109 */
110typedef struct xfs_qm {
111 xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */
112 xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */
113 uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */
114 xfs_frlist_t qm_dqfreelist; /* freelist of dquots */
115 atomic_t qm_totaldquots; /* total incore dquots */
116 uint qm_nrefs; /* file systems with quota on */
117 int qm_dqfree_ratio;/* ratio of free to inuse dquots */
118 kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */
119 kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */
120} xfs_qm_t;
121
122/*
123 * Various quota information for individual filesystems.
124 * The mount structure keeps a pointer to this.
125 */
126typedef struct xfs_quotainfo {
127 xfs_inode_t *qi_uquotaip; /* user quota inode */
128 xfs_inode_t *qi_gquotaip; /* group quota inode */
129 lock_t qi_pinlock; /* dquot pinning mutex */
130 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
131 int qi_dqreclaims; /* a change here indicates
132 a removal in the dqlist */
133 time_t qi_btimelimit; /* limit for blks timer */
134 time_t qi_itimelimit; /* limit for inodes timer */
135 time_t qi_rtbtimelimit;/* limit for rt blks timer */
136 xfs_qwarncnt_t qi_bwarnlimit; /* limit for num warnings */
137 xfs_qwarncnt_t qi_iwarnlimit; /* limit for num warnings */
138 mutex_t qi_quotaofflock;/* to serialize quotaoff */
139 xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
140 uint qi_dqperchunk; /* # ondisk dqs in above chunk */
141 xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */
142 xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */
143 xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */
144 xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */
145 xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */
146 xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */
147} xfs_quotainfo_t;
148
149
150extern xfs_dqtrxops_t xfs_trans_dquot_ops;
151
152extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
153extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
154 xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
155extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
156extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
157
158/*
159 * We keep the usr and grp dquots separately so that locking will be easier
160 * to do at commit time. All transactions that we know of at this point
161 * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
162 */
163#define XFS_QM_TRANS_MAXDQS 2
164typedef struct xfs_dquot_acct {
165 xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
166 xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
167} xfs_dquot_acct_t;
168
169/*
170 * Users are allowed to have a usage exceeding their softlimit for
171 * a period this long.
172 */
173#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */
174#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
175#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
176
177#define XFS_QM_BWARNLIMIT 5
178#define XFS_QM_IWARNLIMIT 5
179
180#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD))
181#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock))
182#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++)
183#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
184
185extern void xfs_mount_reset_sbqflags(xfs_mount_t *);
186
187extern int xfs_qm_init_quotainfo(xfs_mount_t *);
188extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
189extern int xfs_qm_mount_quotas(xfs_mount_t *, int);
190extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint);
191extern int xfs_qm_quotacheck(xfs_mount_t *);
192extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
193extern int xfs_qm_unmount_quotas(xfs_mount_t *);
194extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
195extern int xfs_qm_sync(xfs_mount_t *, short);
196
197/* dquot stuff */
198extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **);
199extern int xfs_qm_dqattach(xfs_inode_t *, uint);
200extern void xfs_qm_dqdetach(xfs_inode_t *);
201extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint);
202extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
203
204/* vop stuff */
205extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
206 uid_t, gid_t, uint,
207 xfs_dquot_t **, xfs_dquot_t **);
208extern void xfs_qm_vop_dqattach_and_dqmod_newinode(
209 xfs_trans_t *, xfs_inode_t *,
210 xfs_dquot_t *, xfs_dquot_t *);
211extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **);
212extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
213 xfs_dquot_t **, xfs_dquot_t *);
214extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
215 xfs_dquot_t *, xfs_dquot_t *, uint);
216
217/* list stuff */
218extern void xfs_qm_freelist_init(xfs_frlist_t *);
219extern void xfs_qm_freelist_destroy(xfs_frlist_t *);
220extern void xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *);
221extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
222extern void xfs_qm_freelist_unlink(xfs_dquot_t *);
223extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *);
224extern int xfs_qm_mplist_nowait(xfs_mount_t *);
225extern int xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
226
227/* system call interface */
228extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t);
229
230#ifdef DEBUG
231extern int xfs_qm_internalqcheck(xfs_mount_t *);
232#else
233#define xfs_qm_internalqcheck(mp) (0)
234#endif
235
236#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
new file mode 100644
index 000000000000..be67d9c265f8
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -0,0 +1,410 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_clnt.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67
68#include "xfs_qm.h"
69
70#define MNTOPT_QUOTA "quota" /* disk quotas (user) */
71#define MNTOPT_NOQUOTA "noquota" /* no quotas */
72#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */
73#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */
74#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */
75#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */
76#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota, no limit enforcement */
77#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota, no limit enforcement */
78#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
79
80STATIC int
81xfs_qm_parseargs(
82 struct bhv_desc *bhv,
83 char *options,
84 struct xfs_mount_args *args,
85 int update)
86{
87 size_t length;
88 char *local_options = options;
89 char *this_char;
90 int error;
91 int referenced = update;
92
93 while ((this_char = strsep(&local_options, ",")) != NULL) {
94 length = strlen(this_char);
95 if (local_options)
96 length++;
97
98 if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
99 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
100 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
101 referenced = update;
102 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
103 !strcmp(this_char, MNTOPT_UQUOTA) ||
104 !strcmp(this_char, MNTOPT_USRQUOTA)) {
105 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
106 referenced = 1;
107 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
108 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
109 args->flags |= XFSMNT_UQUOTA;
110 args->flags &= ~XFSMNT_UQUOTAENF;
111 referenced = 1;
112 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
113 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
114 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
115 referenced = 1;
116 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
117 args->flags |= XFSMNT_GQUOTA;
118 args->flags &= ~XFSMNT_GQUOTAENF;
119 referenced = 1;
120 } else {
121 if (local_options)
122 *(local_options-1) = ',';
123 continue;
124 }
125
126 while (length--)
127 *this_char++ = ',';
128 }
129
130 PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error);
131 if (!error && !referenced)
132 bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM);
133 return error;
134}
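
/*
 * A userspace sketch of the strsep() idiom above (hypothetical option
 * name, glibc's strsep()): strsep() cuts the string in place by writing
 * NULs, so options this layer consumes are blanked out with commas,
 * while options it does not recognize get their separator written back
 * before the string is handed to the next layer.
 */
#include <string.h>

static void sk_scan_options(char *options)
{
	char *rest = options, *tok;
	size_t len;

	while ((tok = strsep(&rest, ",")) != NULL) {
		len = strlen(tok) + (rest ? 1 : 0);
		if (strcmp(tok, "sketchquota") == 0)
			memset(tok, ',', len);	/* consumed: blank it out */
		else if (rest)
			rest[-1] = ',';		/* not ours: restore the separator */
	}
}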
135
136STATIC int
137xfs_qm_showargs(
138 struct bhv_desc *bhv,
139 struct seq_file *m)
140{
141 struct vfs *vfsp = bhvtovfs(bhv);
142 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
143 int error;
144
145 if (mp->m_qflags & XFS_UQUOTA_ACCT) {
146 (mp->m_qflags & XFS_UQUOTA_ENFD) ?
147 seq_puts(m, "," MNTOPT_USRQUOTA) :
148 seq_puts(m, "," MNTOPT_UQUOTANOENF);
149 }
150
151 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
152 (mp->m_qflags & XFS_GQUOTA_ENFD) ?
153 seq_puts(m, "," MNTOPT_GRPQUOTA) :
154 seq_puts(m, "," MNTOPT_GQUOTANOENF);
155 }
156
157 if (!(mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT)))
158 seq_puts(m, "," MNTOPT_NOQUOTA);
159
160 PVFS_SHOWARGS(BHV_NEXT(bhv), m, error);
161 return error;
162}
163
164STATIC int
165xfs_qm_mount(
166 struct bhv_desc *bhv,
167 struct xfs_mount_args *args,
168 struct cred *cr)
169{
170 struct vfs *vfsp = bhvtovfs(bhv);
171 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
172 int error;
173
174 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA))
175 xfs_qm_mount_quotainit(mp, args->flags);
176 PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error);
177 return error;
178}
179
180STATIC int
181xfs_qm_syncall(
182 struct bhv_desc *bhv,
183 int flags,
184 cred_t *credp)
185{
186 struct vfs *vfsp = bhvtovfs(bhv);
187 struct xfs_mount *mp = XFS_VFSTOM(vfsp);
188 int error;
189
190 /*
191 * Get the Quota Manager to flush the dquots.
192 */
193 if (XFS_IS_QUOTA_ON(mp)) {
194 if ((error = xfs_qm_sync(mp, flags))) {
195 /*
196 * If we got an IO error, we will be shutting down.
197 * So, there's nothing more for us to do here.
198 */
199 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
200 if (XFS_FORCED_SHUTDOWN(mp)) {
201 return XFS_ERROR(error);
202 }
203 }
204 }
205 PVFS_SYNC(BHV_NEXT(bhv), flags, credp, error);
206 return error;
207}
208
209/*
210 * Clear the quotaflags in memory and in the superblock.
211 */
212void
213xfs_mount_reset_sbqflags(
214 xfs_mount_t *mp)
215{
216 xfs_trans_t *tp;
217 unsigned long s;
218
219 mp->m_qflags = 0;
220 /*
221 * It is OK to look at sb_qflags here in mount path,
222 * without SB_LOCK.
223 */
224 if (mp->m_sb.sb_qflags == 0)
225 return;
226 s = XFS_SB_LOCK(mp);
227 mp->m_sb.sb_qflags = 0;
228 XFS_SB_UNLOCK(mp, s);
229
230 /*
231 * If the fs is readonly, let the incore superblock run
232 * with quotas off, but don't flush the update out to disk.
233 */
234 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
235 return;
236#ifdef QUOTADEBUG
237 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
238#endif
239 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
240 if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
241 XFS_DEFAULT_LOG_COUNT)) {
242 xfs_trans_cancel(tp, 0);
243 xfs_fs_cmn_err(CE_ALERT, mp,
244 "xfs_mount_reset_sbqflags: Superblock update failed!");
245 return;
246 }
247 xfs_mod_sb(tp, XFS_SB_QFLAGS);
248 xfs_trans_commit(tp, 0, NULL);
249}
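/*
 * The alloc/reserve/modify/commit sequence above is the canonical XFS
 * transaction pattern used throughout the quota code; a minimal sketch
 * of the shape (error handling elided):
 *
 *	tp = xfs_trans_alloc(mp, type);
 *	if (xfs_trans_reserve(tp, ...))	-> xfs_trans_cancel(tp, 0), bail
 *	...log the changes, e.g. xfs_mod_sb(tp, XFS_SB_QFLAGS)...
 *	xfs_trans_commit(tp, 0, NULL);
 */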
250
251STATIC int
252xfs_qm_newmount(
253 xfs_mount_t *mp,
254 uint *needquotamount,
255 uint *quotaflags)
256{
257 uint quotaondisk;
258 uint uquotaondisk = 0, gquotaondisk = 0;
259
260 *quotaflags = 0;
261 *needquotamount = B_FALSE;
262
263 quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
264 mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT);
265
266 if (quotaondisk) {
267 uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
268 gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
269 }
270
271 /*
272 * If the device itself is read-only, we can't allow
273 * the user to change the state of quota on the mount -
274 * this would generate a transaction on the ro device,
275 * which would lead to an I/O error and shutdown.
276 */
277
278 if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
279 (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
280 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
281 (!gquotaondisk && XFS_IS_GQUOTA_ON(mp))) &&
282 xfs_dev_is_read_only(mp, "changing quota state")) {
283 cmn_err(CE_WARN,
284 "XFS: please mount with%s%s%s.",
285 (!quotaondisk ? "out quota" : ""),
286 (uquotaondisk ? " usrquota" : ""),
287 (gquotaondisk ? " grpquota" : ""));
288 return XFS_ERROR(EPERM);
289 }
290
291 if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
292 /*
293 * Call mount_quotas at this point only if we won't have to do
294 * a quotacheck.
295 */
296 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
297 /*
298 * If an error occurred, qm_mount_quotas code
299 * has already disabled quotas. So, just finish
300 * mounting, and get on with the boring life
301 * without disk quotas.
302 */
303 xfs_qm_mount_quotas(mp, 0);
304 } else {
305 /*
306 * Clear the quota flags, but remember them. This
307 * is so that the quota code doesn't get invoked
308 * before we're ready. This can happen when an
309 * inode goes inactive and wants to free blocks,
310 * or via xfs_log_mount_finish.
311 */
312 *needquotamount = B_TRUE;
313 *quotaflags = mp->m_qflags;
314 mp->m_qflags = 0;
315 }
316 }
317
318 return 0;
319}
320
321STATIC int
322xfs_qm_endmount(
323 xfs_mount_t *mp,
324 uint needquotamount,
325 uint quotaflags,
326 int mfsi_flags)
327{
328 if (needquotamount) {
329 ASSERT(mp->m_qflags == 0);
330 mp->m_qflags = quotaflags;
331 xfs_qm_mount_quotas(mp, mfsi_flags);
332 }
333
334#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
335 if (! (XFS_IS_QUOTA_ON(mp)))
336 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
337 else
338 xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
339#endif
340
341#ifdef QUOTADEBUG
342 if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
343 cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
344#endif
345
346 return 0;
347}
348
349STATIC void
350xfs_qm_dqrele_null(
351 xfs_dquot_t *dq)
352{
353 /*
354 * Called from XFS, where we always check first for a NULL dquot.
355 */
356 if (!dq)
357 return;
358 xfs_qm_dqrele(dq);
359}
360
361
362struct xfs_qmops xfs_qmcore_xfs = {
363 .xfs_qminit = xfs_qm_newmount,
364 .xfs_qmdone = xfs_qm_unmount_quotadestroy,
365 .xfs_qmmount = xfs_qm_endmount,
366 .xfs_qmunmount = xfs_qm_unmount_quotas,
367 .xfs_dqrele = xfs_qm_dqrele_null,
368 .xfs_dqattach = xfs_qm_dqattach,
369 .xfs_dqdetach = xfs_qm_dqdetach,
370 .xfs_dqpurgeall = xfs_qm_dqpurge_all,
371 .xfs_dqvopalloc = xfs_qm_vop_dqalloc,
372 .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode,
373 .xfs_dqvoprename = xfs_qm_vop_rename_dqattach,
374 .xfs_dqvopchown = xfs_qm_vop_chown,
375 .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve,
376 .xfs_dqtrxops = &xfs_trans_dquot_ops,
377};
378
379struct bhv_vfsops xfs_qmops = { {
380 BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM),
381 .vfs_parseargs = xfs_qm_parseargs,
382 .vfs_showargs = xfs_qm_showargs,
383 .vfs_mount = xfs_qm_mount,
384 .vfs_sync = xfs_qm_syncall,
385 .vfs_quotactl = xfs_qm_quotactl, },
386};
387
388
389void __init
390xfs_qm_init(void)
391{
392 static char message[] __initdata =
393 KERN_INFO "SGI XFS Quota Management subsystem\n";
394
395 printk(message);
396 mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock");
397 vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs);
398 xfs_qm_init_procfs();
399}
400
401void __exit
402xfs_qm_exit(void)
403{
404 vfs_bhv_clr_custom(&xfs_qmops);
405 xfs_qm_cleanup_procfs();
406 if (qm_dqzone)
407 kmem_cache_destroy(qm_dqzone);
408 if (qm_dqtrxzone)
409 kmem_cache_destroy(qm_dqtrxzone);
410}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
new file mode 100644
index 000000000000..29978e037fee
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -0,0 +1,149 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_dir2.h"
41#include "xfs_alloc.h"
42#include "xfs_dmapi.h"
43#include "xfs_quota.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66
67#include "xfs_qm.h"
68
69struct xqmstats xqmstats;
70
71STATIC int
72xfs_qm_read_xfsquota(
73 char *buffer,
74 char **start,
75 off_t offset,
76 int count,
77 int *eof,
78 void *data)
79{
80 int len;
81
82 /* maximum; incore; ratio free to inuse; freelist */
83 len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
84 ndquot,
85 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
86 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
87 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
88
89 if (offset >= len) {
90 *start = buffer;
91 *eof = 1;
92 return 0;
93 }
94 *start = buffer + offset;
95 if ((len -= offset) > count)
96 return count;
97 *eof = 1;
98
99 return len;
100}
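/*
 * The offset/count/start/eof handling above implements the legacy procfs
 * read_proc contract: the whole report is formatted into "buffer" on
 * every call, and only the slice the caller asked for is handed back.
 * In sketch form, for a report of len bytes:
 *
 *	if (offset >= len)	nothing left: set *eof, return 0
 *	*start = buffer + offset;
 *	return min(len - offset, count);  and set *eof if it all fit
 */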
101
102STATIC int
103xfs_qm_read_stats(
104 char *buffer,
105 char **start,
106 off_t offset,
107 int count,
108 int *eof,
109 void *data)
110{
111 int len;
112
113 /* quota performance statistics */
114 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n",
115 xqmstats.xs_qm_dqreclaims,
116 xqmstats.xs_qm_dqreclaim_misses,
117 xqmstats.xs_qm_dquot_dups,
118 xqmstats.xs_qm_dqcachemisses,
119 xqmstats.xs_qm_dqcachehits,
120 xqmstats.xs_qm_dqwants,
121 xqmstats.xs_qm_dqshake_reclaims,
122 xqmstats.xs_qm_dqinact_reclaims);
123
124 if (offset >= len) {
125 *start = buffer;
126 *eof = 1;
127 return 0;
128 }
129 *start = buffer + offset;
130 if ((len -= offset) > count)
131 return count;
132 *eof = 1;
133
134 return len;
135}
136
137void
138xfs_qm_init_procfs(void)
139{
140 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL);
141 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL);
142}
143
144void
145xfs_qm_cleanup_procfs(void)
146{
147 remove_proc_entry("fs/xfs/xqm", NULL);
148 remove_proc_entry("fs/xfs/xqmstat", NULL);
149}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
new file mode 100644
index 000000000000..8093c5c284ec
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (c) 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QM_STATS_H__
33#define __XFS_QM_STATS_H__
34
35
36#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
37
38/*
39 * XQM global statistics
40 */
41struct xqmstats {
42 __uint32_t xs_qm_dqreclaims;
43 __uint32_t xs_qm_dqreclaim_misses;
44 __uint32_t xs_qm_dquot_dups;
45 __uint32_t xs_qm_dqcachemisses;
46 __uint32_t xs_qm_dqcachehits;
47 __uint32_t xs_qm_dqwants;
48 __uint32_t xs_qm_dqshake_reclaims;
49 __uint32_t xs_qm_dqinact_reclaims;
50};
51
52extern struct xqmstats xqmstats;
53
54# define XQM_STATS_INC(count) ( (count)++ )
55
56extern void xfs_qm_init_procfs(void);
57extern void xfs_qm_cleanup_procfs(void);
58
59#else
60
61# define XQM_STATS_INC(count) do { } while (0)
62
63static __inline void xfs_qm_init_procfs(void) { };
64static __inline void xfs_qm_cleanup_procfs(void) { };
65
66#endif
67
68#endif /* __XFS_QM_STATS_H__ */
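/*
 * Typical call site, as a sketch (the counter name comes from struct
 * xqmstats above). Because the stats-off variant expands to an empty
 * statement, callers need no #ifdef of their own:
 *
 *	XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
 */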
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
new file mode 100644
index 000000000000..229f5b5a2d25
--- /dev/null
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -0,0 +1,1458 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_dir.h"
40#include "xfs_dir2.h"
41#include "xfs_alloc.h"
42#include "xfs_dmapi.h"
43#include "xfs_quota.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_bit.h"
57#include "xfs_rtalloc.h"
58#include "xfs_error.h"
59#include "xfs_itable.h"
60#include "xfs_rw.h"
61#include "xfs_acl.h"
62#include "xfs_cap.h"
63#include "xfs_mac.h"
64#include "xfs_attr.h"
65#include "xfs_buf_item.h"
66#include "xfs_utils.h"
67
68#include "xfs_qm.h"
69
70#ifdef DEBUG
71# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
72#else
73# define qdprintk(s, args...) do { } while (0)
74#endif
75
76STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
77STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
78 fs_disk_quota_t *);
79STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
80STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
81 fs_disk_quota_t *);
82STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint);
83STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
84STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
85STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
86 uint);
87STATIC uint xfs_qm_import_flags(uint);
88STATIC uint xfs_qm_export_flags(uint);
89STATIC uint xfs_qm_import_qtype_flags(uint);
90STATIC uint xfs_qm_export_qtype_flags(uint);
91STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
92 fs_disk_quota_t *);
93
94
95/*
96 * The main distribution switch of all XFS quotactl system calls.
97 */
98int
99xfs_qm_quotactl(
100 struct bhv_desc *bdp,
101 int cmd,
102 int id,
103 xfs_caddr_t addr)
104{
105 xfs_mount_t *mp;
106 int error;
107 struct vfs *vfsp;
108
109 vfsp = bhvtovfs(bdp);
110 mp = XFS_VFSTOM(vfsp);
111
112 if (addr == NULL && cmd != Q_SYNC)
113 return XFS_ERROR(EINVAL);
114 if (id < 0 && cmd != Q_SYNC)
115 return XFS_ERROR(EINVAL);
116
117 /*
118 * The following commands are valid even while quota is off.
119 */
120 switch (cmd) {
121 /*
122 * Truncate quota files. Quota must be off.
123 */
124 case Q_XQUOTARM:
125 if (XFS_IS_QUOTA_ON(mp) || addr == NULL)
126 return XFS_ERROR(EINVAL);
127 if (vfsp->vfs_flag & VFS_RDONLY)
128 return XFS_ERROR(EROFS);
129 return (xfs_qm_scall_trunc_qfiles(mp,
130 xfs_qm_import_qtype_flags(*(uint *)addr)));
131 /*
132 * Get quota status information.
133 */
134 case Q_XGETQSTAT:
135 return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr));
136
137 /*
138 * QUOTAON for root f/s and quota enforcement on others.
139 * Quota accounting for non-root f/s's must be turned on
140 * at mount time.
141 */
142 case Q_XQUOTAON:
143 if (addr == NULL)
144 return XFS_ERROR(EINVAL);
145 if (vfsp->vfs_flag & VFS_RDONLY)
146 return XFS_ERROR(EROFS);
147 return (xfs_qm_scall_quotaon(mp,
148 xfs_qm_import_flags(*(uint *)addr)));
149 case Q_XQUOTAOFF:
150 if (vfsp->vfs_flag & VFS_RDONLY)
151 return XFS_ERROR(EROFS);
152 break;
153
154 default:
155 break;
156 }
157
158 if (! XFS_IS_QUOTA_ON(mp))
159 return XFS_ERROR(ESRCH);
160
161 switch (cmd) {
162 case Q_XQUOTAOFF:
163 if (vfsp->vfs_flag & VFS_RDONLY)
164 return XFS_ERROR(EROFS);
165 error = xfs_qm_scall_quotaoff(mp,
166 xfs_qm_import_flags(*(uint *)addr),
167 B_FALSE);
168 break;
169
170 /*
171 * Defaults to XFS_GETUQUOTA.
172 */
173 case Q_XGETQUOTA:
174 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER,
175 (fs_disk_quota_t *)addr);
176 break;
177 /*
178 * Set limits, both hard and soft. Defaults to Q_SETUQLIM.
179 */
180 case Q_XSETQLIM:
181 if (vfsp->vfs_flag & VFS_RDONLY)
182 return XFS_ERROR(EROFS);
183 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER,
184 (fs_disk_quota_t *)addr);
185 break;
186
187 case Q_XSETGQLIM:
188 if (vfsp->vfs_flag & VFS_RDONLY)
189 return XFS_ERROR(EROFS);
190 error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
191 (fs_disk_quota_t *)addr);
192 break;
193
194
195 case Q_XGETGQUOTA:
196 error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP,
197 (fs_disk_quota_t *)addr);
198 break;
199
200 /*
201 * Quotas are entirely undefined after quotaoff in XFS quotas.
202 * For instance, there's no way to set limits once quota is off.
203 */
204
205 default:
206 error = XFS_ERROR(EINVAL);
207 break;
208 }
209
210 return (error);
211}
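/*
 * For orientation, a hypothetical userspace sketch of reaching the switch
 * above through quotactl(2). The Q_XGETQSTAT constant and fs_quota_stat_t
 * layout are assumed to come from <linux/dqblk_xfs.h>; names may vary
 * with the libc and kernel headers in use.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>

static int print_xfs_qstat(const char *dev)
{
	fs_quota_stat_t	qs;

	/* Valid even while quota is off; see the first switch above. */
	if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), dev, 0, (caddr_t)&qs))
		return -1;
	printf("version %d flags 0x%x uquota ino %llu\n",
	       qs.qs_version, qs.qs_flags,
	       (unsigned long long)qs.qs_uquota.qfs_ino);
	return 0;
}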
212
213/*
214 * Turn off quota accounting and/or enforcement for all udquots and/or
215 * gdquots. Called only at unmount time.
216 *
217 * This assumes that there are no dquots of this file system cached
218 * incore, and modifies the ondisk dquot directly. Therefore, for example,
219 * it is an error to call this twice without purging the cache.
220 */
221STATIC int
222xfs_qm_scall_quotaoff(
223 xfs_mount_t *mp,
224 uint flags,
225 boolean_t force)
226{
227 uint dqtype;
228 unsigned long s;
229 int error;
230 uint inactivate_flags;
231 xfs_qoff_logitem_t *qoffstart;
232 int nculprits;
233
234 if (!force && !capable(CAP_SYS_ADMIN))
235 return XFS_ERROR(EPERM);
236 /*
237 * No file system can have quotas enabled on disk but not in core.
238 * Note that quota utilities (like quotaoff) _expect_
239 * errno == EEXIST here.
240 */
241 if ((mp->m_qflags & flags) == 0)
242 return XFS_ERROR(EEXIST);
243 error = 0;
244
245 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
246
247 /*
248 * We don't want two quotaoffs messing each other up, so we
249 * serialize them. quotaoff isn't exactly a performance-critical
250 * thing.
251 * If quotaoff, then we must be dealing with the root filesystem.
252 */
253 ASSERT(mp->m_quotainfo);
254 if (mp->m_quotainfo)
255 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
256
257 ASSERT(mp->m_quotainfo);
258
259 /*
260 * If we're just turning off quota enforcement, change mp and go.
261 */
262 if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
263 mp->m_qflags &= ~(flags);
264
265 s = XFS_SB_LOCK(mp);
266 mp->m_sb.sb_qflags = mp->m_qflags;
267 XFS_SB_UNLOCK(mp, s);
268 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
269
270 /* XXX what to do on error? Revert to old vals incore? */
271 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
272 return (error);
273 }
274
275 dqtype = 0;
276 inactivate_flags = 0;
277 /*
278 * If accounting is off, we must turn enforcement off and clear the
279 * quota 'CHKD' certificate to make it known that we have to
280 * do a quotacheck the next time this quota is turned on.
281 */
282 if (flags & XFS_UQUOTA_ACCT) {
283 dqtype |= XFS_QMOPT_UQUOTA;
284 flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
285 inactivate_flags |= XFS_UQUOTA_ACTIVE;
286 }
287 if (flags & XFS_GQUOTA_ACCT) {
288 dqtype |= XFS_QMOPT_GQUOTA;
289 flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
290 inactivate_flags |= XFS_GQUOTA_ACTIVE;
291 }
292
293 /*
294 * Nothing to do? Don't complain. This happens when we're just
295 * turning off quota enforcement.
296 */
297 if ((mp->m_qflags & flags) == 0) {
298 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
299 return (0);
300 }
301
302 /*
303 * Write the LI_QUOTAOFF log record, and make the SB changes
304 * atomically and synchronously.
305 */
306 xfs_qm_log_quotaoff(mp, &qoffstart, flags);
307
308 /*
309 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
310 * to take care of the race between dqget and quotaoff. We don't take
311 * any special locks to reset these bits. All processes need to check
312 * these bits *after* taking inode lock(s) to see if the particular
313 * quota type is in the process of being turned off. If *ACTIVE, it is
314 * guaranteed that all dquot structures and quotainode ptrs will
315 * stay valid as long as that inode is kept locked.
316 *
317 * There is no turning back after this.
318 */
319 mp->m_qflags &= ~inactivate_flags;
320
321 /*
322 * Give back all the dquot reference(s) held by inodes.
323 * Here we go thru every single incore inode in this file system, and
324 * do a dqrele on the i_udquot/i_gdquot that it may have.
325 * Essentially, as long as somebody has an inode locked, this guarantees
326 * that quotas will not be turned off. This is handy because in a
327 * transaction once we lock the inode(s) and check for quotaon, we can
328 * depend on the quota inodes (and other things) being valid as long as
329 * we keep the lock(s).
330 */
331 xfs_qm_dqrele_all_inodes(mp, flags);
332
333 /*
334 * Next we make the changes in the quota flag in the mount struct.
335 * This isn't protected by a particular lock directly, because we
336 * don't want to take an mrlock every time we depend on quotas being on.
337 */
338 mp->m_qflags &= ~(flags);
339
340 /*
341 * Go through all the dquots of this file system and purge them,
342 * according to what was turned off. We may not be able to get rid
343 * of all dquots, because dquots can have temporary references that
344 * are not attached to inodes, e.g. xfs_setattr, xfs_create.
345 * So, if we couldn't purge all the dquots from the filesystem,
346 * we can't get rid of the incore data structures.
347 */
348 while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF)))
349 delay(10 * nculprits);
350
351 /*
352 * Transactions that had started before the ACTIVE state bit was cleared
353 * could have logged many dquots, so they'd have higher LSNs than
354 * the first QUOTAOFF log record does. If we happen to crash when
355 * the tail of the log has gone past the QUOTAOFF record, but
356 * before the last dquot modification, those dquots __will__
357 * recover, and that's not good.
358 *
359 * So, we have QUOTAOFF start and end logitems; the start
360 * logitem won't get overwritten until the end logitem appears...
361 */
362 xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
363
364 /*
365 * If quotas is completely disabled, close shop.
366 */
367 if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) {
368 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
369 xfs_qm_destroy_quotainfo(mp);
370 return (0);
371 }
372
373 /*
374 * Release our quotainode references, and vn_purge them,
375 * if we don't need them anymore.
376 */
377 if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
378 XFS_PURGE_INODE(XFS_QI_UQIP(mp));
379 XFS_QI_UQIP(mp) = NULL;
380 }
381 if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) {
382 XFS_PURGE_INODE(XFS_QI_GQIP(mp));
383 XFS_QI_GQIP(mp) = NULL;
384 }
385 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
386
387 return (error);
388}
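/*
 * To recap the ordering that xfs_qm_scall_quotaoff() above depends on:
 *
 *  1. log the QUOTAOFF start item and the sb_qflags change, synchronously;
 *  2. clear the *ACTIVE bits, closing the race against dqget;
 *  3. dqrele the dquots held by every incore inode;
 *  4. clear the accounting/enforcement bits in mp->m_qflags;
 *  5. purge incore dquots, retrying while temporary references remain;
 *  6. log the QUOTAOFF end item, which keeps the start item live in the
 *     log until all dquot modifications are safely behind it.
 */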
389
390STATIC int
391xfs_qm_scall_trunc_qfiles(
392 xfs_mount_t *mp,
393 uint flags)
394{
395 int error;
396 xfs_inode_t *qip;
397
398 if (!capable(CAP_SYS_ADMIN))
399 return XFS_ERROR(EPERM);
400 error = 0;
401 if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
402 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
403 return XFS_ERROR(EINVAL);
404 }
405
406 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
407 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
408 if (! error) {
409 (void) xfs_truncate_file(mp, qip);
410 VN_RELE(XFS_ITOV(qip));
411 }
412 }
413
414 if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) {
415 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
416 if (! error) {
417 (void) xfs_truncate_file(mp, qip);
418 VN_RELE(XFS_ITOV(qip));
419 }
420 }
421
422 return (error);
423}
424
425
426/*
427 * Switch on (a given) quota enforcement for a filesystem. This takes
428 * effect immediately.
429 * (Switching on quota accounting must be done at mount time.)
430 */
431STATIC int
432xfs_qm_scall_quotaon(
433 xfs_mount_t *mp,
434 uint flags)
435{
436 int error;
437 unsigned long s;
438 uint qf;
439 uint accflags;
440 __int64_t sbflags;
441
442 if (!capable(CAP_SYS_ADMIN))
443 return XFS_ERROR(EPERM);
444
445 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
446 /*
447 * Switching on quota accounting must be done at mount time.
448 */
449 accflags = flags & XFS_ALL_QUOTA_ACCT;
450 flags &= ~(XFS_ALL_QUOTA_ACCT);
451
452 sbflags = 0;
453
454 if (flags == 0) {
455 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
456 return XFS_ERROR(EINVAL);
457 }
458
459 /* No fs can turn on quotas with a delayed effect */
460 ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
461
462 /*
463 * Can't enforce without accounting. We check the superblock
464 * qflags here instead of m_qflags because rootfs can have
465 * quota accounting on disk without m_qflags knowing about it.
466 */
467 if (((flags & XFS_UQUOTA_ACCT) == 0 &&
468 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
469 (flags & XFS_UQUOTA_ENFD))
470 ||
471 ((flags & XFS_GQUOTA_ACCT) == 0 &&
472 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
473 (flags & XFS_GQUOTA_ENFD))) {
474 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
475 flags, mp->m_sb.sb_qflags);
476 return XFS_ERROR(EINVAL);
477 }
478 /*
479 * If everything's up-to-date incore, then don't waste time.
480 */
481 if ((mp->m_qflags & flags) == flags)
482 return XFS_ERROR(EEXIST);
483
484 /*
485 * Change sb_qflags on disk but not incore mp->m_qflags
486 * if this is the root filesystem.
487 */
488 s = XFS_SB_LOCK(mp);
489 qf = mp->m_sb.sb_qflags;
490 mp->m_sb.sb_qflags = qf | flags;
491 XFS_SB_UNLOCK(mp, s);
492
493 /*
494 * There's nothing to change if it's the same.
495 */
496 if ((qf & flags) == flags && sbflags == 0)
497 return XFS_ERROR(EEXIST);
498 sbflags |= XFS_SB_QFLAGS;
499
500 if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
501 return (error);
502 /*
503 * If we aren't trying to switch on quota enforcement, we are done.
504 */
505 if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
506 (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
507 (flags & XFS_ALL_QUOTA_ENFD) == 0)
508 return (0);
509
510 if (! XFS_IS_QUOTA_RUNNING(mp))
511 return XFS_ERROR(ESRCH);
512
513 /*
514 * Switch on quota enforcement in core.
515 */
516 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
517 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
518 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
519
520 return (0);
521}
522
523
524
525/*
526 * Return quota status information, such as uquota-off, enforcements, etc.
527 */
528STATIC int
529xfs_qm_scall_getqstat(
530 xfs_mount_t *mp,
531 fs_quota_stat_t *out)
532{
533 xfs_inode_t *uip, *gip;
534 boolean_t tempuqip, tempgqip;
535
536 uip = gip = NULL;
537 tempuqip = tempgqip = B_FALSE;
538 memset(out, 0, sizeof(fs_quota_stat_t));
539
540 out->qs_version = FS_QSTAT_VERSION;
541 if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
542 out->qs_uquota.qfs_ino = NULLFSINO;
543 out->qs_gquota.qfs_ino = NULLFSINO;
544 return (0);
545 }
546 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
547 (XFS_ALL_QUOTA_ACCT|
548 XFS_ALL_QUOTA_ENFD));
549 out->qs_pad = 0;
550 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
551 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
552
553 if (mp->m_quotainfo) {
554 uip = mp->m_quotainfo->qi_uquotaip;
555 gip = mp->m_quotainfo->qi_gquotaip;
556 }
557 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
558 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
559 0, 0, &uip, 0) == 0)
560 tempuqip = B_TRUE;
561 }
562 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
563 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
564 0, 0, &gip, 0) == 0)
565 tempgqip = B_TRUE;
566 }
567 if (uip) {
568 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
569 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
570 if (tempuqip)
571 VN_RELE(XFS_ITOV(uip));
572 }
573 if (gip) {
574 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
575 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
576 if (tempgqip)
577 VN_RELE(XFS_ITOV(gip));
578 }
579 if (mp->m_quotainfo) {
580 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
581 out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
582 out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
583 out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
584 out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
585 out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
586 }
587 return (0);
588}
589
590/*
591 * Adjust quota limits, and start/stop timers accordingly.
592 */
593STATIC int
594xfs_qm_scall_setqlim(
595 xfs_mount_t *mp,
596 xfs_dqid_t id,
597 uint type,
598 fs_disk_quota_t *newlim)
599{
600 xfs_disk_dquot_t *ddq;
601 xfs_dquot_t *dqp;
602 xfs_trans_t *tp;
603 int error;
604 xfs_qcnt_t hard, soft;
605
606 if (!capable(CAP_SYS_ADMIN))
607 return XFS_ERROR(EPERM);
608
609 if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0)
610 return (0);
611
612 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
613 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
614 0, 0, XFS_DEFAULT_LOG_COUNT))) {
615 xfs_trans_cancel(tp, 0);
616 return (error);
617 }
618
619 /*
620 * We don't want to race with a quotaoff so take the quotaoff lock.
621 * (We don't hold an inode lock, so there's nothing else to stop
622 * a quotaoff from happening). (XXX This doesn't currently happen
623 * because we take the vfslock before calling xfs_qm_sysent).
624 */
625 mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
626
627 /*
628 * Get the dquot (locked), and join it to the transaction.
629 * Allocate the dquot if this doesn't exist.
630 */
631 if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
632 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
633 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
634 ASSERT(error != ENOENT);
635 return (error);
636 }
637 xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
638 xfs_trans_dqjoin(tp, dqp);
639 ddq = &dqp->q_core;
640
641 /*
642 * Make sure that hard limits are >= soft limits before changing.
643 */
644 hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
645 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
646 INT_GET(ddq->d_blk_hardlimit, ARCH_CONVERT);
647 soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
648 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
649 INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT);
650 if (hard == 0 || hard >= soft) {
651 INT_SET(ddq->d_blk_hardlimit, ARCH_CONVERT, hard);
652 INT_SET(ddq->d_blk_softlimit, ARCH_CONVERT, soft);
653 if (id == 0) {
654 mp->m_quotainfo->qi_bhardlimit = hard;
655 mp->m_quotainfo->qi_bsoftlimit = soft;
656 }
657 } else {
658 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
659 }
660 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
661 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
662 INT_GET(ddq->d_rtb_hardlimit, ARCH_CONVERT);
663 soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
664 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
665 INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT);
666 if (hard == 0 || hard >= soft) {
667 INT_SET(ddq->d_rtb_hardlimit, ARCH_CONVERT, hard);
668 INT_SET(ddq->d_rtb_softlimit, ARCH_CONVERT, soft);
669 if (id == 0) {
670 mp->m_quotainfo->qi_rtbhardlimit = hard;
671 mp->m_quotainfo->qi_rtbsoftlimit = soft;
672 }
673 } else {
674 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
675 }
676
677 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
678 (xfs_qcnt_t) newlim->d_ino_hardlimit :
679 INT_GET(ddq->d_ino_hardlimit, ARCH_CONVERT);
680 soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
681 (xfs_qcnt_t) newlim->d_ino_softlimit :
682 INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT);
683 if (hard == 0 || hard >= soft) {
684 INT_SET(ddq->d_ino_hardlimit, ARCH_CONVERT, hard);
685 INT_SET(ddq->d_ino_softlimit, ARCH_CONVERT, soft);
686 if (id == 0) {
687 mp->m_quotainfo->qi_ihardlimit = hard;
688 mp->m_quotainfo->qi_isoftlimit = soft;
689 }
690 } else {
691 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
692 }
693
694 if (id == 0) {
695 /*
696 * Timelimits for the super user set the relative time
697 * the other users can be over quota for this file system.
698 * If it is zero, a default is used. Ditto for the default
699 * soft and hard limit values (already done, above).
700 */
701 if (newlim->d_fieldmask & FS_DQ_BTIMER) {
702 mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
703 INT_SET(ddq->d_btimer, ARCH_CONVERT, newlim->d_btimer);
704 }
705 if (newlim->d_fieldmask & FS_DQ_ITIMER) {
706 mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
707 INT_SET(ddq->d_itimer, ARCH_CONVERT, newlim->d_itimer);
708 }
709 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
710 mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
711 INT_SET(ddq->d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer);
712 }
713 } else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ {
714 /*
715 * If the user is now over quota, start the timelimit.
716 * The user will not be 'warned'.
717 * Note that we keep the timers ticking, whether enforcement
718 * is on or off. We don't really want to bother with iterating
719 * over all ondisk dquots and turning the timers on/off.
720 */
721 xfs_qm_adjust_dqtimers(mp, ddq);
722 }
723 dqp->dq_flags |= XFS_DQ_DIRTY;
724 xfs_trans_log_dquot(tp, dqp);
725
726 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
727 xfs_trans_commit(tp, 0, NULL);
728 xfs_qm_dqprint(dqp);
729 xfs_qm_dqrele(dqp);
730 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
731
732 return (0);
733}
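/*
 * Note on the validation above: a new limit pair is applied only when
 * hard == 0 (no hard limit) or hard >= soft. For example, a request with
 * d_fieldmask = FS_DQ_BHARD | FS_DQ_BSOFT, hard = 100 and soft = 200 is
 * rejected: the on-disk values are left untouched and only a qdprintk
 * diagnostic is emitted.
 */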
734
735STATIC int
736xfs_qm_scall_getquota(
737 xfs_mount_t *mp,
738 xfs_dqid_t id,
739 uint type,
740 fs_disk_quota_t *out)
741{
742 xfs_dquot_t *dqp;
743 int error;
744
745 /*
746 * Try to get the dquot. We don't want it allocated on disk, so
747 * we aren't passing the XFS_QMOPT_DQALLOC flag. If it doesn't
748 * exist, we'll get ENOENT back.
749 */
750 if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
751 return (error);
752 }
753
754 xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
755 /*
756 * If everything's NULL, this dquot doesn't quite exist as far as
757 * our utility programs are concerned.
758 */
759 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
760 xfs_qm_dqput(dqp);
761 return XFS_ERROR(ENOENT);
762 }
763 /* xfs_qm_dqprint(dqp); */
764 /*
765 * Convert the disk dquot to the exportable format
766 */
767 xfs_qm_export_dquot(mp, &dqp->q_core, out);
768 xfs_qm_dqput(dqp);
769 return (error ? XFS_ERROR(EFAULT) : 0);
770}
771
772
773STATIC int
774xfs_qm_log_quotaoff_end(
775 xfs_mount_t *mp,
776 xfs_qoff_logitem_t *startqoff,
777 uint flags)
778{
779 xfs_trans_t *tp;
780 int error;
781 xfs_qoff_logitem_t *qoffi;
782
783 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
784
785 if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
786 0, 0, XFS_DEFAULT_LOG_COUNT))) {
787 xfs_trans_cancel(tp, 0);
788 return (error);
789 }
790
791 qoffi = xfs_trans_get_qoff_item(tp, startqoff,
792 flags & XFS_ALL_QUOTA_ACCT);
793 xfs_trans_log_quotaoff_item(tp, qoffi);
794
795 /*
796 * We have to make sure that the transaction is secure on disk before we
797 * return and actually stop quota accounting. So, make it synchronous.
798 * We don't care about quotoff's performance.
799 * We don't care about quotaoff's performance.
800 xfs_trans_set_sync(tp);
801 error = xfs_trans_commit(tp, 0, NULL);
802 return (error);
803}
804
805
806STATIC int
807xfs_qm_log_quotaoff(
808 xfs_mount_t *mp,
809 xfs_qoff_logitem_t **qoffstartp,
810 uint flags)
811{
812 xfs_trans_t *tp;
813 int error;
814 unsigned long s;
815 xfs_qoff_logitem_t *qoffi=NULL;
816 uint oldsbqflag=0;
817
818 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
819 if ((error = xfs_trans_reserve(tp, 0,
820 sizeof(xfs_qoff_logitem_t) * 2 +
821 mp->m_sb.sb_sectsize + 128,
822 0,
823 0,
824 XFS_DEFAULT_LOG_COUNT))) {
825 goto error0;
826 }
827
828 qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
829 xfs_trans_log_quotaoff_item(tp, qoffi);
830
831 s = XFS_SB_LOCK(mp);
832 oldsbqflag = mp->m_sb.sb_qflags;
833 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
834 XFS_SB_UNLOCK(mp, s);
835
836 xfs_mod_sb(tp, XFS_SB_QFLAGS);
837
838 /*
839 * We have to make sure that the transaction is secure on disk before we
840 * return and actually stop quota accounting. So, make it synchronous.
841 * We don't care about quotaoff's performance.
842 */
843 xfs_trans_set_sync(tp);
844 error = xfs_trans_commit(tp, 0, NULL);
845
846error0:
847 if (error) {
848 xfs_trans_cancel(tp, 0);
849 /*
850 * No one else is modifying sb_qflags, so this is OK.
851 * We still hold the quotaofflock.
852 */
853 s = XFS_SB_LOCK(mp);
854 mp->m_sb.sb_qflags = oldsbqflag;
855 XFS_SB_UNLOCK(mp, s);
856 }
857 *qoffstartp = qoffi;
858 return (error);
859}
860
861
862/*
863 * Translate an internal style on-disk-dquot to the exportable format.
864 * The main differences are that the counters/limits are all in Basic
865 * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
866 * to be converted to the native endianness.
867 */
868STATIC void
869xfs_qm_export_dquot(
870 xfs_mount_t *mp,
871 xfs_disk_dquot_t *src,
872 struct fs_disk_quota *dst)
873{
874 memset(dst, 0, sizeof(*dst));
875 dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */
876 dst->d_flags =
877 xfs_qm_export_qtype_flags(INT_GET(src->d_flags, ARCH_CONVERT));
878 dst->d_id = INT_GET(src->d_id, ARCH_CONVERT);
879 dst->d_blk_hardlimit = (__uint64_t)
880 XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_hardlimit, ARCH_CONVERT));
881 dst->d_blk_softlimit = (__uint64_t)
882 XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_softlimit, ARCH_CONVERT));
883 dst->d_ino_hardlimit = (__uint64_t)
884 INT_GET(src->d_ino_hardlimit, ARCH_CONVERT);
885 dst->d_ino_softlimit = (__uint64_t)
886 INT_GET(src->d_ino_softlimit, ARCH_CONVERT);
887 dst->d_bcount = (__uint64_t)
888 XFS_FSB_TO_BB(mp, INT_GET(src->d_bcount, ARCH_CONVERT));
889 dst->d_icount = (__uint64_t) INT_GET(src->d_icount, ARCH_CONVERT);
890 dst->d_btimer = (__uint32_t) INT_GET(src->d_btimer, ARCH_CONVERT);
891 dst->d_itimer = (__uint32_t) INT_GET(src->d_itimer, ARCH_CONVERT);
892 dst->d_iwarns = INT_GET(src->d_iwarns, ARCH_CONVERT);
893 dst->d_bwarns = INT_GET(src->d_bwarns, ARCH_CONVERT);
894
895 dst->d_rtb_hardlimit = (__uint64_t)
896 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_hardlimit, ARCH_CONVERT));
897 dst->d_rtb_softlimit = (__uint64_t)
898 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_softlimit, ARCH_CONVERT));
899 dst->d_rtbcount = (__uint64_t)
900 XFS_FSB_TO_BB(mp, INT_GET(src->d_rtbcount, ARCH_CONVERT));
901 dst->d_rtbtimer = (__uint32_t) INT_GET(src->d_rtbtimer, ARCH_CONVERT);
902 dst->d_rtbwarns = INT_GET(src->d_rtbwarns, ARCH_CONVERT);
903
904 /*
905 * Internally, we don't reset all the timers when quota enforcement
906 * gets turned off. No need to confuse the userlevel code,
907 * so return zeroes in that case.
908 */
909 if (! XFS_IS_QUOTA_ENFORCED(mp)) {
910 dst->d_btimer = 0;
911 dst->d_itimer = 0;
912 dst->d_rtbtimer = 0;
913 }
914
915#ifdef DEBUG
916 if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
917 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
918 (dst->d_blk_softlimit > 0)) {
919 ASSERT(dst->d_btimer != 0);
920 }
921 if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
922 (dst->d_ino_softlimit > 0)) {
923 ASSERT(dst->d_itimer != 0);
924 }
925 }
926#endif
927}
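/*
 * Unit note for the conversions above: fs_disk_quota counts space in
 * 512-byte basic blocks, while the on-disk dquot counts filesystem
 * blocks, so XFS_FSB_TO_BB(mp, n) is in effect n << (sb_blocklog - 9);
 * with 4k filesystem blocks, one FSB exports as 8 BBs.
 */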
928
929STATIC uint
930xfs_qm_import_qtype_flags(
931 uint uflags)
932{
933 /*
934 * Can't be both at the same time.
935 */
936 if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
937 (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
938 ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0))
939 return (0);
940
941 return (uflags & XFS_USER_QUOTA) ?
942 XFS_DQ_USER : XFS_DQ_GROUP;
943}
944
945STATIC uint
946xfs_qm_export_qtype_flags(
947 uint flags)
948{
949 /*
950 * Can't be both at the same time.
951 */
952 ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) !=
953 (XFS_GROUP_QUOTA | XFS_USER_QUOTA));
954 ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0);
955
956 return (flags & XFS_DQ_USER) ?
957 XFS_USER_QUOTA : XFS_GROUP_QUOTA;
958}
959
960STATIC uint
961xfs_qm_import_flags(
962 uint uflags)
963{
964 uint flags = 0;
965
966 if (uflags & XFS_QUOTA_UDQ_ACCT)
967 flags |= XFS_UQUOTA_ACCT;
968 if (uflags & XFS_QUOTA_GDQ_ACCT)
969 flags |= XFS_GQUOTA_ACCT;
970 if (uflags & XFS_QUOTA_UDQ_ENFD)
971 flags |= XFS_UQUOTA_ENFD;
972 if (uflags & XFS_QUOTA_GDQ_ENFD)
973 flags |= XFS_GQUOTA_ENFD;
974 return (flags);
975}
976
977
978STATIC uint
979xfs_qm_export_flags(
980 uint flags)
981{
982 uint uflags;
983
984 uflags = 0;
985 if (flags & XFS_UQUOTA_ACCT)
986 uflags |= XFS_QUOTA_UDQ_ACCT;
987 if (flags & XFS_GQUOTA_ACCT)
988 uflags |= XFS_QUOTA_GDQ_ACCT;
989 if (flags & XFS_UQUOTA_ENFD)
990 uflags |= XFS_QUOTA_UDQ_ENFD;
991 if (flags & XFS_GQUOTA_ENFD)
992 uflags |= XFS_QUOTA_GDQ_ENFD;
993 return (uflags);
994}
995
996
997/*
998 * Go thru all the inodes in the file system, releasing their dquots.
999 * Note that the mount structure gets modified to indicate that quotas are off
1000 * AFTER this, in the case of quotaoff. This also gets called from
1001 * xfs_rootumount.
1002 */
1003void
1004xfs_qm_dqrele_all_inodes(
1005 struct xfs_mount *mp,
1006 uint flags)
1007{
1008 vmap_t vmap;
1009 xfs_inode_t *ip, *topino;
1010 uint ireclaims;
1011 vnode_t *vp;
1012 boolean_t vnode_refd;
1013
1014 ASSERT(mp->m_quotainfo);
1015
1016again:
1017 XFS_MOUNT_ILOCK(mp);
1018 ip = mp->m_inodes;
1019 if (ip == NULL) {
1020 XFS_MOUNT_IUNLOCK(mp);
1021 return;
1022 }
1023 do {
1024 /* Skip markers inserted by xfs_sync */
1025 if (ip->i_mount == NULL) {
1026 ip = ip->i_mnext;
1027 continue;
1028 }
1029 /* The quota inodes themselves never carry dquot references */
1030 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1031 ASSERT(ip->i_udquot == NULL);
1032 ASSERT(ip->i_gdquot == NULL);
1033 ip = ip->i_mnext;
1034 continue;
1035 }
1036 vp = XFS_ITOV_NULL(ip);
1037 if (!vp) {
1038 ASSERT(ip->i_udquot == NULL);
1039 ASSERT(ip->i_gdquot == NULL);
1040 ip = ip->i_mnext;
1041 continue;
1042 }
1043 vnode_refd = B_FALSE;
1044 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
1045 /*
1046 * Sample vp mapping while holding the mplock, lest
1047 * we come across a non-existent vnode.
1048 */
1049 VMAP(vp, vmap);
1050 ireclaims = mp->m_ireclaims;
1051 topino = mp->m_inodes;
1052 XFS_MOUNT_IUNLOCK(mp);
1053
1054 /* XXX restart limit ? */
1055 if ( ! (vp = vn_get(vp, &vmap)))
1056 goto again;
1057 xfs_ilock(ip, XFS_ILOCK_EXCL);
1058 vnode_refd = B_TRUE;
1059 } else {
1060 ireclaims = mp->m_ireclaims;
1061 topino = mp->m_inodes;
1062 XFS_MOUNT_IUNLOCK(mp);
1063 }
1064
1065 /*
1066 * We don't keep the mountlock across the dqrele() call,
1067 * since it can take a while.
1068 */
1069 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1070 xfs_qm_dqrele(ip->i_udquot);
1071 ip->i_udquot = NULL;
1072 }
1073 if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
1074 xfs_qm_dqrele(ip->i_gdquot);
1075 ip->i_gdquot = NULL;
1076 }
1077 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1078 /*
1079 * Wait until we've dropped the ilock and mountlock to
1080 * do the vn_rele. Or be condemned to an eternity in the
1081 * inactive code in hell.
1082 */
1083 if (vnode_refd)
1084 VN_RELE(vp);
1085 XFS_MOUNT_ILOCK(mp);
1086 /*
1087 * If an inode was inserted or removed, we gotta
1088 * start over again.
1089 */
1090 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
1091 /* XXX use a sentinel */
1092 XFS_MOUNT_IUNLOCK(mp);
1093 goto again;
1094 }
1095 ip = ip->i_mnext;
1096 } while (ip != mp->m_inodes);
1097
1098 XFS_MOUNT_IUNLOCK(mp);
1099}
1100
1101/*------------------------------------------------------------------------*/
1102#ifdef DEBUG
1103/*
1104 * This contains all the test functions for XFS disk quotas.
1105 * Currently it does a quota accounting check, i.e. it walks through
1106 * all inodes in the file system, calculates the dquot accounting fields,
1107 * and prints out any inconsistencies.
1108 */
1109xfs_dqhash_t *qmtest_udqtab;
1110xfs_dqhash_t *qmtest_gdqtab;
1111int qmtest_hashmask;
1112int qmtest_nfails;
1113mutex_t qcheck_lock;
1114
1115#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
1116 (__psunsigned_t)(id)) & \
1117 (qmtest_hashmask - 1))
1118
1119#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \
1120 (qmtest_udqtab + \
1121 DQTEST_HASHVAL(mp, id)) : \
1122 (qmtest_gdqtab + \
1123 DQTEST_HASHVAL(mp, id)))
1124
1125#define DQTEST_LIST_PRINT(l, NXT, title) \
1126{ \
1127 xfs_dqtest_t *dqp; int i = 0;\
1128 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \
1129 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
1130 dqp = (xfs_dqtest_t *)dqp->NXT) { \
1131 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
1132 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
1133 dqp->d_bcount, dqp->d_icount); } \
1134}
1135
1136typedef struct dqtest {
1137 xfs_dqmarker_t q_lists;
1138 xfs_dqhash_t *q_hash; /* the hashchain header */
1139 xfs_mount_t *q_mount; /* filesystem this relates to */
1140 xfs_dqid_t d_id; /* user id or group id */
1141 xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */
1142 xfs_qcnt_t d_icount; /* # inodes owned by the user */
1143} xfs_dqtest_t;
1144
1145STATIC void
1146xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
1147{
1148 xfs_dquot_t *d;
1149 if (((d) = (h)->qh_next))
1150 (d)->HL_PREVP = &((dqp)->HL_NEXT);
1151 (dqp)->HL_NEXT = d;
1152 (dqp)->HL_PREVP = &((h)->qh_next);
1153 (h)->qh_next = (xfs_dquot_t *)dqp;
1154 (h)->qh_version++;
1155 (h)->qh_nelems++;
1156}
1157STATIC void
1158xfs_qm_dqtest_print(
1159 xfs_dqtest_t *d)
1160{
1161 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------");
1162 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id);
1163 cmn_err(CE_DEBUG, "---- type = %s", XFS_QM_ISUDQ(d)? "USR" : "GRP");
1164 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount);
1165 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)",
1166 d->d_bcount, (int)d->d_bcount);
1167 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)",
1168 d->d_icount, (int)d->d_icount);
1169 cmn_err(CE_DEBUG, "---------------------------");
1170}
1171
1172STATIC void
1173xfs_qm_dqtest_failed(
1174 xfs_dqtest_t *d,
1175 xfs_dquot_t *dqp,
1176 char *reason,
1177 xfs_qcnt_t a,
1178 xfs_qcnt_t b,
1179 int error)
1180{
1181 qmtest_nfails++;
1182 if (error)
1183 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s",
1184 INT_GET(d->d_id, ARCH_CONVERT), error, reason);
1185 else
1186 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]",
1187 INT_GET(d->d_id, ARCH_CONVERT), reason, (int)a, (int)b);
1188 xfs_qm_dqtest_print(d);
1189 if (dqp)
1190 xfs_qm_dqprint(dqp);
1191}
1192
1193STATIC int
1194xfs_dqtest_cmp2(
1195 xfs_dqtest_t *d,
1196 xfs_dquot_t *dqp)
1197{
1198 int err = 0;
1199 if (INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) != d->d_icount) {
1200 xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
1201 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
1202 d->d_icount, 0);
1203 err++;
1204 }
1205 if (INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) != d->d_bcount) {
1206 xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
1207 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
1208 d->d_bcount, 0);
1209 err++;
1210 }
1211 if (INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) &&
1212 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) >=
1213 INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)) {
1214 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1215 cmn_err(CE_DEBUG,
1216 "%d [%s] [0x%p] BLK TIMER NOT STARTED",
1217 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
1218 err++;
1219 }
1220 }
1221 if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) &&
1222 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
1223 INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
1224 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1225 cmn_err(CE_DEBUG,
1226 "%d [%s] [0x%p] INO TIMER NOT STARTED",
1227 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
1228 err++;
1229 }
1230 }
1231#ifdef QUOTADEBUG
1232 if (!err) {
1233 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked",
1234 d->d_id, XFS_QM_ISUDQ(d) ? "USR" : "GRP", d->q_mount);
1235 }
1236#endif
1237 return (err);
1238}
1239
1240STATIC void
1241xfs_dqtest_cmp(
1242 xfs_dqtest_t *d)
1243{
1244 xfs_dquot_t *dqp;
1245 int error;
1246
1247 /* xfs_qm_dqtest_print(d); */
1248 if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
1249 &dqp))) {
1250 xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
1251 return;
1252 }
1253 xfs_dqtest_cmp2(d, dqp);
1254 xfs_qm_dqput(dqp);
1255}
1256
1257STATIC int
1258xfs_qm_internalqcheck_dqget(
1259 xfs_mount_t *mp,
1260 xfs_dqid_t id,
1261 uint type,
1262 xfs_dqtest_t **O_dq)
1263{
1264 xfs_dqtest_t *d;
1265 xfs_dqhash_t *h;
1266
1267 h = DQTEST_HASH(mp, id, type);
1268 for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
1269 d = (xfs_dqtest_t *) d->HL_NEXT) {
1270 /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
1271 if (d->d_id == id && mp == d->q_mount) {
1272 *O_dq = d;
1273 return (0);
1274 }
1275 }
1276 d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
1277 d->dq_flags = type;
1278 d->d_id = id;
1279 d->q_mount = mp;
1280 d->q_hash = h;
1281 xfs_qm_hashinsert(h, d);
1282 *O_dq = d;
1283 return (0);
1284}
1285
1286STATIC void
1287xfs_qm_internalqcheck_get_dquots(
1288 xfs_mount_t *mp,
1289 xfs_dqid_t uid,
1290 xfs_dqid_t gid,
1291 xfs_dqtest_t **ud,
1292 xfs_dqtest_t **gd)
1293{
1294 if (XFS_IS_UQUOTA_ON(mp))
1295 xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
1296 if (XFS_IS_GQUOTA_ON(mp))
1297 xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
1298}
1299
1300
1301STATIC void
1302xfs_qm_internalqcheck_dqadjust(
1303 xfs_inode_t *ip,
1304 xfs_dqtest_t *d)
1305{
1306 d->d_icount++;
1307 d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
1308}
1309
1310STATIC int
1311xfs_qm_internalqcheck_adjust(
1312 xfs_mount_t *mp, /* mount point for filesystem */
1313 xfs_ino_t ino, /* inode number to get data for */
1314 void __user *buffer, /* not used */
1315 int ubsize, /* not used */
1316 void *private_data, /* not used */
1317 xfs_daddr_t bno, /* starting block of inode cluster */
1318 int *ubused, /* not used */
1319 void *dip, /* not used */
1320 int *res) /* bulkstat result code */
1321{
1322 xfs_inode_t *ip;
1323 xfs_dqtest_t *ud, *gd;
1324 uint lock_flags;
1325 boolean_t ipreleased;
1326 int error;
1327
1328 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1329
1330 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1331 *res = BULKSTAT_RV_NOTHING;
1332 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
1333 (unsigned long long) ino,
1334 (unsigned long long) mp->m_sb.sb_uquotino,
1335 (unsigned long long) mp->m_sb.sb_gquotino);
1336 return XFS_ERROR(EINVAL);
1337 }
1338 ipreleased = B_FALSE;
1339 again:
1340 lock_flags = XFS_ILOCK_SHARED;
1341 if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) {
1342 *res = BULKSTAT_RV_NOTHING;
1343 return (error);
1344 }
1345
1346 if (ip->i_d.di_mode == 0) {
1347 xfs_iput_new(ip, lock_flags);
1348 *res = BULKSTAT_RV_NOTHING;
1349 return XFS_ERROR(ENOENT);
1350 }
1351
1352 /*
1353 * This inode can have blocks after eof which can get released
1354 * when we send it to inactive. Since we don't check the dquot
1355 * until after all our calculations are done, we must get rid
1356 * of those now.
1357 */
1358 if (! ipreleased) {
1359 xfs_iput(ip, lock_flags);
1360 ipreleased = B_TRUE;
1361 goto again;
1362 }
1363 xfs_qm_internalqcheck_get_dquots(mp,
1364 (xfs_dqid_t) ip->i_d.di_uid,
1365 (xfs_dqid_t) ip->i_d.di_gid,
1366 &ud, &gd);
1367 if (XFS_IS_UQUOTA_ON(mp)) {
1368 ASSERT(ud);
1369 xfs_qm_internalqcheck_dqadjust(ip, ud);
1370 }
1371 if (XFS_IS_GQUOTA_ON(mp)) {
1372 ASSERT(gd);
1373 xfs_qm_internalqcheck_dqadjust(ip, gd);
1374 }
1375 xfs_iput(ip, lock_flags);
1376 *res = BULKSTAT_RV_DIDONE;
1377 return (0);
1378}
1379
1380
1381/* PRIVATE, debugging */
1382int
1383xfs_qm_internalqcheck(
1384 xfs_mount_t *mp)
1385{
1386 xfs_ino_t lastino;
1387 int done, count;
1388 int i;
1389 xfs_dqtest_t *d, *e;
1390 xfs_dqhash_t *h1;
1391 int error;
1392
1393 lastino = 0;
1394 qmtest_hashmask = 32;
1395 count = 5;
1396 done = 0;
1397 qmtest_nfails = 0;
1398
1399 if (! XFS_IS_QUOTA_ON(mp))
1400 return XFS_ERROR(ESRCH);
1401
1402 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1403 XFS_bflush(mp->m_ddev_targp);
1404 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1405 XFS_bflush(mp->m_ddev_targp);
1406
1407 mutex_lock(&qcheck_lock, PINOD);
1408 /* There should be absolutely no quota activity while this
1409  * is going on. */
1410 qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
1411 sizeof(xfs_dqhash_t), KM_SLEEP);
1412 qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
1413 sizeof(xfs_dqhash_t), KM_SLEEP);
1414 do {
1415 /*
1416 * Iterate thru all the inodes in the file system,
1417 * adjusting the corresponding dquot counters
1418 */
1419 if ((error = xfs_bulkstat(mp, &lastino, &count,
1420 xfs_qm_internalqcheck_adjust, NULL,
1421 0, NULL, BULKSTAT_FG_IGET, &done))) {
1422 break;
1423 }
1424 } while (! done);
1425 if (error) {
1426 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error);
1427 }
1428 cmn_err(CE_DEBUG, "Checking results against system dquots");
1429 for (i = 0; i < qmtest_hashmask; i++) {
1430 h1 = &qmtest_udqtab[i];
1431 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
1432 xfs_dqtest_cmp(d);
1433 e = (xfs_dqtest_t *) d->HL_NEXT;
1434 kmem_free(d, sizeof(xfs_dqtest_t));
1435 d = e;
1436 }
1437 h1 = &qmtest_gdqtab[i];
1438 for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
1439 xfs_dqtest_cmp(d);
1440 e = (xfs_dqtest_t *) d->HL_NEXT;
1441 kmem_free(d, sizeof(xfs_dqtest_t));
1442 d = e;
1443 }
1444 }
1445
1446 if (qmtest_nfails) {
1447 cmn_err(CE_DEBUG, "******** quotacheck failed ********");
1448 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails);
1449 } else {
1450 cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
1451 }
1452 kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
1453 kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
1454 mutex_unlock(&qcheck_lock);
1455 return (qmtest_nfails);
1456}
1457
1458#endif /* DEBUG */
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
new file mode 100644
index 000000000000..414b6004af21
--- /dev/null
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -0,0 +1,192 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QUOTA_PRIV_H__
33#define __XFS_QUOTA_PRIV_H__
34
35/*
36 * Number of bmaps that we ask from bmapi when doing a quotacheck.
37 * We make this restriction to keep the memory usage to a minimum.
38 */
39#define XFS_DQITER_MAP_SIZE 10
40
41 /* Number of dquots that fit into a dquot block */
42#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk)
43
44#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \
45 MR_UPDATE | MR_ACCESS) != 0)
46#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \
47 MR_UPDATE) != 0)
48
49#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t))
50
51#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims)
52#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip)
53#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip)
54#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen)
55#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit)
56#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
57#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit)
58#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit)
59#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit)
60#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock)
61
62#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist)
63#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock)
64#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next)
65#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems)
66
67#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD))
68#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock)))
69#ifdef DEBUG
70struct xfs_dqhash;
71static inline int XQMISLCKD(struct xfs_dqhash *h)
72{
73 if (mutex_trylock(&h->qh_lock)) {
74 mutex_unlock(&h->qh_lock);
75 return 0;
76 }
77 return 1;
78}
79#endif
80
81#define XFS_DQ_HASH_LOCK(h) XQMLCK(h)
82#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h)
83#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h)
84
85#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp)))
86#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
87#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
88
89#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist))
90#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist))
91#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist))
92
93/*
94 * Hash into a bucket in the dquot hash table, based on <mp, id>.
95 */
96#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
97 (__psunsigned_t)(id)) & \
98 (xfs_Gqm->qm_dqhashmask - 1))
99#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \
100 (xfs_Gqm->qm_usr_dqhtable + \
101 XFS_DQ_HASHVAL(mp, id)) : \
102 (xfs_Gqm->qm_grp_dqhtable + \
103 XFS_DQ_HASHVAL(mp, id)))
104#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \
105 XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp))
106#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
107 !dqp->q_core.d_blk_hardlimit && \
108 !dqp->q_core.d_blk_softlimit && \
109 !dqp->q_core.d_rtb_hardlimit && \
110 !dqp->q_core.d_rtb_softlimit && \
111 !dqp->q_core.d_ino_hardlimit && \
112 !dqp->q_core.d_ino_softlimit && \
113 !dqp->q_core.d_bcount && \
114 !dqp->q_core.d_rtbcount && \
115 !dqp->q_core.d_icount)
116
117#define HL_PREVP dq_hashlist.ql_prevp
118#define HL_NEXT dq_hashlist.ql_next
119#define MPL_PREVP dq_mplist.ql_prevp
120#define MPL_NEXT dq_mplist.ql_next
121
122
123#define _LIST_REMOVE(h, dqp, PVP, NXT) \
124 { \
125 xfs_dquot_t *d; \
126 if (((d) = (dqp)->NXT)) \
127 (d)->PVP = (dqp)->PVP; \
128 *((dqp)->PVP) = d; \
129 (dqp)->NXT = NULL; \
130 (dqp)->PVP = NULL; \
131 (h)->qh_version++; \
132 (h)->qh_nelems--; \
133 }
134
135#define _LIST_INSERT(h, dqp, PVP, NXT) \
136 { \
137 xfs_dquot_t *d; \
138 if (((d) = (h)->qh_next)) \
139 (d)->PVP = &((dqp)->NXT); \
140 (dqp)->NXT = d; \
141 (dqp)->PVP = &((h)->qh_next); \
142 (h)->qh_next = dqp; \
143 (h)->qh_version++; \
144 (h)->qh_nelems++; \
145 }
146
147#define FOREACH_DQUOT_IN_MP(dqp, mp) \
148 for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
149
150#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \
151for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
152 (dqp) = (dqp)->dq_flnext)
153
154#define XQM_HASHLIST_INSERT(h, dqp) \
155 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
156
157#define XQM_FREELIST_INSERT(h, dqp) \
158 xfs_qm_freelist_append(h, dqp)
159
160#define XQM_MPLIST_INSERT(h, dqp) \
161 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
162
163#define XQM_HASHLIST_REMOVE(h, dqp) \
164 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
165#define XQM_FREELIST_REMOVE(dqp) \
166 xfs_qm_freelist_unlink(dqp)
167#define XQM_MPLIST_REMOVE(h, dqp) \
168 { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
169 XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
170
171#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp))
172
173#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \
174 (tp)->t_dqinfo->dqa_usrdquots : \
175 (tp)->t_dqinfo->dqa_grpdquots)
176#define XFS_IS_SUSER_DQUOT(dqp) \
177 (!((dqp)->q_core.d_id))
178
179#define XFS_PURGE_INODE(ip) \
180 { \
181 vmap_t dqvmap; \
182 vnode_t *dqvp; \
183 dqvp = XFS_ITOV(ip); \
184 VMAP(dqvp, dqvmap); \
185 VN_RELE(dqvp); \
186 }
187
188#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
189 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???"))
190#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
191
192#endif /* __XFS_QUOTA_PRIV_H__ */
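
The _LIST_INSERT/_LIST_REMOVE macros above implement an intrusive doubly linked
list whose back link is a pointer to the previous link's next field (the
HL_PREVP/MPL_PREVP members), so removal needs no special case for the list
head. A minimal userspace sketch of the same technique; the node and list
types here are hypothetical stand-ins for xfs_dquot_t and xfs_dqhash_t:

#include <assert.h>
#include <stddef.h>

struct node {
	struct node *next;
	struct node **prevp;	/* points at whoever points at us */
	int id;
};

struct list {
	struct node *head;
	int nelems;
};

static void list_insert(struct list *h, struct node *n)
{
	struct node *d = h->head;
	if (d)
		d->prevp = &n->next;
	n->next = d;
	n->prevp = &h->head;
	h->head = n;
	h->nelems++;
}

static void list_remove(struct list *h, struct node *n)
{
	struct node *d = n->next;
	if (d)
		d->prevp = n->prevp;
	*n->prevp = d;		/* works for head and interior nodes alike */
	n->next = NULL;
	n->prevp = NULL;
	h->nelems--;
}

int main(void)
{
	struct list l = { NULL, 0 };
	struct node a = { .id = 1 }, b = { .id = 2 };

	list_insert(&l, &a);
	list_insert(&l, &b);	/* list is now b -> a */
	list_remove(&l, &b);	/* removing the head needs no special case */
	assert(l.head == &a && l.nelems == 1);
	return 0;
}
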
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
new file mode 100644
index 000000000000..149b2a1fd949
--- /dev/null
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -0,0 +1,941 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_fs.h"
35#include "xfs_inum.h"
36#include "xfs_log.h"
37#include "xfs_trans.h"
38#include "xfs_sb.h"
39#include "xfs_ag.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_alloc.h"
43#include "xfs_dmapi.h"
44#include "xfs_quota.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_bmap.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_itable.h"
61#include "xfs_rw.h"
62#include "xfs_acl.h"
63#include "xfs_cap.h"
64#include "xfs_mac.h"
65#include "xfs_attr.h"
66#include "xfs_buf_item.h"
67#include "xfs_trans_priv.h"
68
69#include "xfs_qm.h"
70
71STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
72
73/*
74 * Add the locked dquot to the transaction.
75 * The dquot must be locked, and it cannot be associated with any
76 * transaction.
77 */
78void
79xfs_trans_dqjoin(
80 xfs_trans_t *tp,
81 xfs_dquot_t *dqp)
82{
83 xfs_dq_logitem_t *lp;
84
85 ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
86 ASSERT(XFS_DQ_IS_LOCKED(dqp));
87 ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
88 lp = &dqp->q_logitem;
89
90 /*
91 * Get a log_item_desc to point at the new item.
92 */
93 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
94
95 /*
96 * Initialize q_transp so we can later determine if this dquot is
97 * associated with this transaction.
98 */
99 dqp->q_transp = tp;
100}
101
102
103/*
104 * This is called to mark the dquot as needing
105 * to be logged when the transaction is committed. The dquot must
106 * already be associated with the given transaction.
107 * Note that it marks the entire transaction as dirty. In the ordinary
108 * case, this gets called via xfs_trans_commit, after the transaction
109 * is already dirty. However, there's nothing to stop this from getting
110 * called directly, as done by xfs_qm_scall_setqlim. Hence, the XFS_TRANS_DIRTY
111 * flag.
112 */
113void
114xfs_trans_log_dquot(
115 xfs_trans_t *tp,
116 xfs_dquot_t *dqp)
117{
118 xfs_log_item_desc_t *lidp;
119
120 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
121 ASSERT(XFS_DQ_IS_LOCKED(dqp));
122
123 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
124 ASSERT(lidp != NULL);
125
126 tp->t_flags |= XFS_TRANS_DIRTY;
127 lidp->lid_flags |= XFS_LID_DIRTY;
128}
129
130/*
131 * Carry forward whatever is left of the quota blk reservation to
132 * the spanking-new transaction.
133 */
134STATIC void
135xfs_trans_dup_dqinfo(
136 xfs_trans_t *otp,
137 xfs_trans_t *ntp)
138{
139 xfs_dqtrx_t *oq, *nq;
140 int i,j;
141 xfs_dqtrx_t *oqa, *nqa;
142
143 if (!otp->t_dqinfo)
144 return;
145
146 xfs_trans_alloc_dqinfo(ntp);
147 oqa = otp->t_dqinfo->dqa_usrdquots;
148 nqa = ntp->t_dqinfo->dqa_usrdquots;
149
150 /*
151 * Because the quota blk reservation is carried forward,
152 * it is also necessary to carry forward the DQ_DIRTY flag.
153 */
154 if (otp->t_flags & XFS_TRANS_DQ_DIRTY)
155 ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
156
157 for (j = 0; j < 2; j++) {
158 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
159 if (oqa[i].qt_dquot == NULL)
160 break;
161 oq = &oqa[i];
162 nq = &nqa[i];
163
164 nq->qt_dquot = oq->qt_dquot;
165 nq->qt_bcount_delta = nq->qt_icount_delta = 0;
166 nq->qt_rtbcount_delta = 0;
167
168 /*
169 * Transfer whatever is left of the reservations.
170 */
171 nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
172 oq->qt_blk_res = oq->qt_blk_res_used;
173
174 nq->qt_rtblk_res = oq->qt_rtblk_res -
175 oq->qt_rtblk_res_used;
176 oq->qt_rtblk_res = oq->qt_rtblk_res_used;
177
178 nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
179 oq->qt_ino_res = oq->qt_ino_res_used;
180
181 }
182 oqa = otp->t_dqinfo->dqa_grpdquots;
183 nqa = ntp->t_dqinfo->dqa_grpdquots;
184 }
185}
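
A worked example of the carry-forward arithmetic above, as a standalone
sketch; the two-field struct is a hypothetical stand-in for the relevant
xfs_dqtrx_t members:

#include <assert.h>

struct res { unsigned long blk_res, blk_res_used; };

int main(void)
{
	struct res oq = { 100, 30 };	/* old trans: reserved 100, used 30 */
	struct res nq = { 0, 0 };

	nq.blk_res = oq.blk_res - oq.blk_res_used;	/* new gets the 70 left */
	oq.blk_res = oq.blk_res_used;			/* old keeps only its 30 */

	assert(nq.blk_res == 70 && oq.blk_res == 30);
	assert(nq.blk_res + oq.blk_res == 100);		/* total is conserved */
	return 0;
}
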
186
187/*
188 * Wrap around mod_dquot to account for both user and group quotas.
189 */
190void
191xfs_trans_mod_dquot_byino(
192 xfs_trans_t *tp,
193 xfs_inode_t *ip,
194 uint field,
195 long delta)
196{
197 xfs_mount_t *mp;
198
199 ASSERT(tp);
200 mp = tp->t_mountp;
201
202 if (!XFS_IS_QUOTA_ON(mp) ||
203 ip->i_ino == mp->m_sb.sb_uquotino ||
204 ip->i_ino == mp->m_sb.sb_gquotino)
205 return;
206
207 if (tp->t_dqinfo == NULL)
208 xfs_trans_alloc_dqinfo(tp);
209
210 if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) {
211 (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
212 }
213 if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) {
214 (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
215 }
216}
217
218STATIC xfs_dqtrx_t *
219xfs_trans_get_dqtrx(
220 xfs_trans_t *tp,
221 xfs_dquot_t *dqp)
222{
223 int i;
224 xfs_dqtrx_t *qa;
225
226 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
227 qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
228
229 if (qa[i].qt_dquot == NULL ||
230 qa[i].qt_dquot == dqp) {
231 return (&qa[i]);
232 }
233 }
234
235 return (NULL);
236}
237
238/*
239 * Make the changes in the transaction structure.
240 * The moral equivalent to xfs_trans_mod_sb().
241 * We don't touch any fields in the dquot, so we don't care
242 * if it's locked or not (most of the time it won't be).
243 */
244void
245xfs_trans_mod_dquot(
246 xfs_trans_t *tp,
247 xfs_dquot_t *dqp,
248 uint field,
249 long delta)
250{
251 xfs_dqtrx_t *qtrx;
252
253 ASSERT(tp);
254 qtrx = NULL;
255
256 if (tp->t_dqinfo == NULL)
257 xfs_trans_alloc_dqinfo(tp);
258 /*
259 * Find either the first free slot or the slot that belongs
260 * to this dquot.
261 */
262 qtrx = xfs_trans_get_dqtrx(tp, dqp);
263 ASSERT(qtrx);
264 if (qtrx->qt_dquot == NULL)
265 qtrx->qt_dquot = dqp;
266
267 switch (field) {
268
269 /*
270 * regular disk blk reservation
271 */
272 case XFS_TRANS_DQ_RES_BLKS:
273 qtrx->qt_blk_res += (ulong)delta;
274 break;
275
276 /*
277 * inode reservation
278 */
279 case XFS_TRANS_DQ_RES_INOS:
280 qtrx->qt_ino_res += (ulong)delta;
281 break;
282
283 /*
284 * disk blocks used.
285 */
286 case XFS_TRANS_DQ_BCOUNT:
287 if (qtrx->qt_blk_res && delta > 0) {
288 qtrx->qt_blk_res_used += (ulong)delta;
289 ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
290 }
291 qtrx->qt_bcount_delta += delta;
292 break;
293
294 case XFS_TRANS_DQ_DELBCOUNT:
295 qtrx->qt_delbcnt_delta += delta;
296 break;
297
298 /*
299 * Inode Count
300 */
301 case XFS_TRANS_DQ_ICOUNT:
302 if (qtrx->qt_ino_res && delta > 0) {
303 qtrx->qt_ino_res_used += (ulong)delta;
304 ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
305 }
306 qtrx->qt_icount_delta += delta;
307 break;
308
309 /*
310 * rtblk reservation
311 */
312 case XFS_TRANS_DQ_RES_RTBLKS:
313 qtrx->qt_rtblk_res += (ulong)delta;
314 break;
315
316 /*
317 * rtblk count
318 */
319 case XFS_TRANS_DQ_RTBCOUNT:
320 if (qtrx->qt_rtblk_res && delta > 0) {
321 qtrx->qt_rtblk_res_used += (ulong)delta;
322 ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
323 }
324 qtrx->qt_rtbcount_delta += delta;
325 break;
326
327 case XFS_TRANS_DQ_DELRTBCOUNT:
328 qtrx->qt_delrtb_delta += delta;
329 break;
330
331 default:
332 ASSERT(0);
333 }
334 tp->t_flags |= XFS_TRANS_DQ_DIRTY;
335}
336
337
338/*
339 * Given an array of dqtrx structures, lock all the associated dquots
340 * and join them to the transaction, provided they have been modified.
341 * We know that at most two dquots of one type (usr OR grp) can be
342 * involved in a transaction, and at most three for usr and grp combined.
343 * So we don't attempt to make this very generic.
344 */
345STATIC void
346xfs_trans_dqlockedjoin(
347 xfs_trans_t *tp,
348 xfs_dqtrx_t *q)
349{
350 ASSERT(q[0].qt_dquot != NULL);
351 if (q[1].qt_dquot == NULL) {
352 xfs_dqlock(q[0].qt_dquot);
353 xfs_trans_dqjoin(tp, q[0].qt_dquot);
354 } else {
355 ASSERT(XFS_QM_TRANS_MAXDQS == 2);
356 xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
357 xfs_trans_dqjoin(tp, q[0].qt_dquot);
358 xfs_trans_dqjoin(tp, q[1].qt_dquot);
359 }
360}
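
xfs_dqlock2() presumably takes the pair of dquot locks in one stable order so
that two transactions joining the same two dquots cannot deadlock against each
other. A generic userspace sketch of that lock-ordering idiom using pthread
mutexes ordered by address; the helper name lock_pair is made up for
illustration:

#include <pthread.h>
#include <stdint.h>

/* Always lock the lower-addressed mutex first, so any two threads
 * locking the same pair agree on the order and cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a > (uintptr_t)b) {
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m2, &m1);	/* same order no matter how args are passed */
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);
	return 0;
}
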
361
362
363/*
364 * Called by xfs_trans_commit() and similar in spirit to
365 * xfs_trans_apply_sb_deltas().
366 * Go through all the dquots belonging to this transaction and modify the
367 * INCORE dquot to reflect the actual usages.
368 * Unreserve just the reservations done by this transaction.
369 * The dquots are left locked at exit.
370 */
371void
372xfs_trans_apply_dquot_deltas(
373 xfs_trans_t *tp)
374{
375 int i, j;
376 xfs_dquot_t *dqp;
377 xfs_dqtrx_t *qtrx, *qa;
378 xfs_disk_dquot_t *d;
379 long totalbdelta;
380 long totalrtbdelta;
381
382 if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY))
383 return;
384
385 ASSERT(tp->t_dqinfo);
386 qa = tp->t_dqinfo->dqa_usrdquots;
387 for (j = 0; j < 2; j++) {
388 if (qa[0].qt_dquot == NULL) {
389 qa = tp->t_dqinfo->dqa_grpdquots;
390 continue;
391 }
392
393 /*
394 * Lock all of the dquots and join them to the transaction.
395 */
396 xfs_trans_dqlockedjoin(tp, qa);
397
398 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
399 qtrx = &qa[i];
400 /*
401 * The array of dquots is filled
402 * sequentially, not sparsely.
403 */
404 if ((dqp = qtrx->qt_dquot) == NULL)
405 break;
406
407 ASSERT(XFS_DQ_IS_LOCKED(dqp));
408 ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
409
410 /*
411 * adjust the actual number of blocks used
412 */
413 d = &dqp->q_core;
414
415 /*
416 * The issue here is that sometimes we don't make a blkquota
417 * reservation intentionally to be fair to users
418 * (when the amount is small). On the other hand,
419 * delayed allocs do make reservations, but that's
420 * outside of a transaction, so we have no
421 * idea how much was really reserved.
422 * So, here we've accumulated delayed allocation blks and
423 * non-delay blks. The assumption is that the
424 * delayed ones are always reserved (outside of a
425 * transaction), and the others may or may not have
426 * quota reservations.
427 */
428 totalbdelta = qtrx->qt_bcount_delta +
429 qtrx->qt_delbcnt_delta;
430 totalrtbdelta = qtrx->qt_rtbcount_delta +
431 qtrx->qt_delrtb_delta;
432#ifdef QUOTADEBUG
433 if (totalbdelta < 0)
434 ASSERT(INT_GET(d->d_bcount, ARCH_CONVERT) >=
435 (xfs_qcnt_t) -totalbdelta);
436
437 if (totalrtbdelta < 0)
438 ASSERT(INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
439 (xfs_qcnt_t) -totalrtbdelta);
440
441 if (qtrx->qt_icount_delta < 0)
442 ASSERT(INT_GET(d->d_icount, ARCH_CONVERT) >=
443 (xfs_qcnt_t) -qtrx->qt_icount_delta);
444#endif
445 if (totalbdelta)
446 INT_MOD(d->d_bcount, ARCH_CONVERT, (xfs_qcnt_t)totalbdelta);
447
448 if (qtrx->qt_icount_delta)
449 INT_MOD(d->d_icount, ARCH_CONVERT, (xfs_qcnt_t)qtrx->qt_icount_delta);
450
451 if (totalrtbdelta)
452 INT_MOD(d->d_rtbcount, ARCH_CONVERT, (xfs_qcnt_t)totalrtbdelta);
453
454 /*
455 * Get any default limits in use.
456 * Start/reset the timer(s) if needed.
457 */
458 if (d->d_id) {
459 xfs_qm_adjust_dqlimits(tp->t_mountp, d);
460 xfs_qm_adjust_dqtimers(tp->t_mountp, d);
461 }
462
463 dqp->dq_flags |= XFS_DQ_DIRTY;
464 /*
465 * add this to the list of items to get logged
466 */
467 xfs_trans_log_dquot(tp, dqp);
468 /*
469 * Take off what's left of the original reservation.
470 * In case of delayed allocations, there's no
471 * reservation that a transaction structure knows of.
472 */
473 if (qtrx->qt_blk_res != 0) {
474 if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
475 if (qtrx->qt_blk_res >
476 qtrx->qt_blk_res_used)
477 dqp->q_res_bcount -= (xfs_qcnt_t)
478 (qtrx->qt_blk_res -
479 qtrx->qt_blk_res_used);
480 else
481 dqp->q_res_bcount -= (xfs_qcnt_t)
482 (qtrx->qt_blk_res_used -
483 qtrx->qt_blk_res);
484 }
485 } else {
486 /*
487 * These blks were never reserved, either inside
488 * a transaction or outside one (in a delayed
489 * allocation). Also, this isn't always a
490 * negative number since we sometimes
491 * deliberately skip quota reservations.
492 */
493 if (qtrx->qt_bcount_delta) {
494 dqp->q_res_bcount +=
495 (xfs_qcnt_t)qtrx->qt_bcount_delta;
496 }
497 }
498 /*
499 * Adjust the RT reservation.
500 */
501 if (qtrx->qt_rtblk_res != 0) {
502 if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
503 if (qtrx->qt_rtblk_res >
504 qtrx->qt_rtblk_res_used)
505 dqp->q_res_rtbcount -= (xfs_qcnt_t)
506 (qtrx->qt_rtblk_res -
507 qtrx->qt_rtblk_res_used);
508 else
509 dqp->q_res_rtbcount -= (xfs_qcnt_t)
510 (qtrx->qt_rtblk_res_used -
511 qtrx->qt_rtblk_res);
512 }
513 } else {
514 if (qtrx->qt_rtbcount_delta)
515 dqp->q_res_rtbcount +=
516 (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
517 }
518
519 /*
520 * Adjust the inode reservation.
521 */
522 if (qtrx->qt_ino_res != 0) {
523 ASSERT(qtrx->qt_ino_res >=
524 qtrx->qt_ino_res_used);
525 if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
526 dqp->q_res_icount -= (xfs_qcnt_t)
527 (qtrx->qt_ino_res -
528 qtrx->qt_ino_res_used);
529 } else {
530 if (qtrx->qt_icount_delta)
531 dqp->q_res_icount +=
532 (xfs_qcnt_t)qtrx->qt_icount_delta;
533 }
534
535
536#ifdef QUOTADEBUG
537 if (qtrx->qt_rtblk_res != 0)
538 cmn_err(CE_DEBUG, "RT res %d for 0x%p\n",
539 (int) qtrx->qt_rtblk_res, dqp);
540#endif
541 ASSERT(dqp->q_res_bcount >=
542 INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
543 ASSERT(dqp->q_res_icount >=
544 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
545 ASSERT(dqp->q_res_rtbcount >=
546 INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
547 }
548 /*
549 * Do the group quotas next
550 */
551 qa = tp->t_dqinfo->dqa_grpdquots;
552 }
553}
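
A worked example of the reservation unwind above: suppose a transaction
reserved 10 blocks (qt_blk_res) and actually allocated 4 of them
(qt_blk_res_used == qt_bcount_delta == 4). Commit adds 4 to d_bcount and
takes the 6 unused blocks back off q_res_bcount, preserving the invariant
that q_res_bcount equals d_bcount plus any outstanding reservation. A
standalone sketch with hypothetical field names:

#include <assert.h>

int main(void)
{
	unsigned long d_bcount = 100;		/* on-disk usage before commit */
	unsigned long q_res_bcount = 110;	/* usage + this trans's 10-block resv */
	unsigned long blk_res = 10, blk_res_used = 4, bcount_delta = 4;

	d_bcount += bcount_delta;		/* actual usage goes up by 4 */
	q_res_bcount -= blk_res - blk_res_used;	/* return the 6 unused blocks */

	assert(d_bcount == 104 && q_res_bcount == 104);
	assert(q_res_bcount >= d_bcount);	/* the ASSERTs above check this */
	return 0;
}
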
554
555/*
556 * Release the reservations, and adjust the dquots accordingly.
557 * This is called only when the transaction is being aborted. If by
558 * any chance we have done dquot modifications incore (i.e. deltas) already,
559 * we simply throw those away, since that's the expected behavior
560 * when a transaction is curtailed without a commit.
561 */
562STATIC void
563xfs_trans_unreserve_and_mod_dquots(
564 xfs_trans_t *tp)
565{
566 int i, j;
567 xfs_dquot_t *dqp;
568 xfs_dqtrx_t *qtrx, *qa;
569 boolean_t locked;
570
571 if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
572 return;
573
574 qa = tp->t_dqinfo->dqa_usrdquots;
575
576 for (j = 0; j < 2; j++) {
577 for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
578 qtrx = &qa[i];
579 /*
580 * We assume that the array of dquots is filled
581 * sequentially, not sparsely.
582 */
583 if ((dqp = qtrx->qt_dquot) == NULL)
584 break;
585 /*
586 * Unreserve the original reservation. We don't care
587 * about the number of blocks used field, or deltas.
588 * Also we don't bother to zero the fields.
589 */
590 locked = B_FALSE;
591 if (qtrx->qt_blk_res) {
592 xfs_dqlock(dqp);
593 locked = B_TRUE;
594 dqp->q_res_bcount -=
595 (xfs_qcnt_t)qtrx->qt_blk_res;
596 }
597 if (qtrx->qt_ino_res) {
598 if (!locked) {
599 xfs_dqlock(dqp);
600 locked = B_TRUE;
601 }
602 dqp->q_res_icount -=
603 (xfs_qcnt_t)qtrx->qt_ino_res;
604 }
605
606 if (qtrx->qt_rtblk_res) {
607 if (!locked) {
608 xfs_dqlock(dqp);
609 locked = B_TRUE;
610 }
611 dqp->q_res_rtbcount -=
612 (xfs_qcnt_t)qtrx->qt_rtblk_res;
613 }
614 if (locked)
615 xfs_dqunlock(dqp);
616
617 }
618 qa = tp->t_dqinfo->dqa_grpdquots;
619 }
620}
621
622/*
623 * This reserves disk blocks and inodes against a dquot.
624 * Flags indicate if the dquot is to be locked here and also
625 * if the blk reservation is for RT or regular blocks.
626 * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
627 * Returns EDQUOT if quota is exceeded.
628 */
629STATIC int
630xfs_trans_dqresv(
631 xfs_trans_t *tp,
632 xfs_mount_t *mp,
633 xfs_dquot_t *dqp,
634 long nblks,
635 long ninos,
636 uint flags)
637{
638 int error;
639 xfs_qcnt_t hardlimit;
640 xfs_qcnt_t softlimit;
641 time_t btimer;
642 xfs_qcnt_t *resbcountp;
643 xfs_quotainfo_t *q = mp->m_quotainfo;
644
645 if (! (flags & XFS_QMOPT_DQLOCK)) {
646 xfs_dqlock(dqp);
647 }
648 ASSERT(XFS_DQ_IS_LOCKED(dqp));
649 if (flags & XFS_TRANS_DQ_RES_BLKS) {
650 hardlimit = INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT);
651 if (!hardlimit)
652 hardlimit = q->qi_bhardlimit;
653 softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT);
654 if (!softlimit)
655 softlimit = q->qi_bsoftlimit;
656 btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT);
657 resbcountp = &dqp->q_res_bcount;
658 } else {
659 ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
660 hardlimit = INT_GET(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT);
661 if (!hardlimit)
662 hardlimit = q->qi_rtbhardlimit;
663 softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT);
664 if (!softlimit)
665 softlimit = q->qi_rtbsoftlimit;
666 btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT);
667 resbcountp = &dqp->q_res_rtbcount;
668 }
669 error = 0;
670
671 if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
672 dqp->q_core.d_id &&
673 XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) {
674#ifdef QUOTADEBUG
675 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld"
676 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit);
677#endif
678 if (nblks > 0) {
679 /*
680 * dquot is locked already. See if we'd go over the
681 * hardlimit or exceed the timelimit if we allocate
682 * nblks.
683 */
684 if (hardlimit > 0ULL &&
685 (hardlimit <= nblks + *resbcountp)) {
686 error = EDQUOT;
687 goto error_return;
688 }
689
690 if (softlimit > 0ULL &&
691 (softlimit <= nblks + *resbcountp)) {
692 /*
693 * If the timer has expired or the warning
694 * limit has been reached, return EDQUOT.
695 */
696 if ((btimer != 0 && get_seconds() > btimer) ||
697 (dqp->q_core.d_bwarns &&
698 INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >=
699 XFS_QI_BWARNLIMIT(dqp->q_mount))) {
700 error = EDQUOT;
701 goto error_return;
702 }
703 }
704 }
705 if (ninos > 0) {
706 hardlimit = INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT);
707 if (!hardlimit)
708 hardlimit = q->qi_ihardlimit;
709 softlimit = INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT);
710 if (!softlimit)
711 softlimit = q->qi_isoftlimit;
712 if (hardlimit > 0ULL &&
713 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= hardlimit) {
714 error = EDQUOT;
715 goto error_return;
716 } else if (softlimit > 0ULL &&
717 INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= softlimit) {
718 /*
719 * If the timer has expired or the warning
720 * limit has been reached, return EDQUOT.
721 */
722 if ((dqp->q_core.d_itimer &&
723 get_seconds() > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) ||
724 (dqp->q_core.d_iwarns &&
725 INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >=
726 XFS_QI_IWARNLIMIT(dqp->q_mount))) {
727 error = EDQUOT;
728 goto error_return;
729 }
730 }
731 }
732 }
733
734 /*
735 * Change the reservation, but not the actual usage.
736 * Note that q_res_bcount = q_core.d_bcount + resv
737 */
738 (*resbcountp) += (xfs_qcnt_t)nblks;
739 if (ninos != 0)
740 dqp->q_res_icount += (xfs_qcnt_t)ninos;
741
742 /*
743 * note the reservation amt in the trans struct too,
744 * so that the transaction knows how much was reserved by
745 * it against this particular dquot.
746 * We don't do this when we are reserving for a delayed allocation,
747 * because we don't have the luxury of a transaction envelope then.
748 */
749 if (tp) {
750 ASSERT(tp->t_dqinfo);
751 ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
752 if (nblks != 0)
753 xfs_trans_mod_dquot(tp, dqp,
754 flags & XFS_QMOPT_RESBLK_MASK,
755 nblks);
756 if (ninos != 0)
757 xfs_trans_mod_dquot(tp, dqp,
758 XFS_TRANS_DQ_RES_INOS,
759 ninos);
760 }
761 ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
762 ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
763 ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
764
765error_return:
766 if (! (flags & XFS_QMOPT_DQLOCK)) {
767 xfs_dqunlock(dqp);
768 }
769 return (error);
770}
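
The enforcement rule above is: fail outright at the hard limit, but fail at
the soft limit only once the grace timer has expired or too many warnings
have been issued. A standalone sketch of just that decision, assuming
nblks > 0 as in the path above; all names are illustrative:

#include <time.h>

static int would_exceed(unsigned long long resv, long nblks,
			unsigned long long hard, unsigned long long soft,
			time_t timer, int warns, int warnlimit)
{
	/* a zero limit means "no limit of this kind" */
	if (hard && resv + nblks >= hard)
		return 1;			/* EDQUOT */
	if (soft && resv + nblks >= soft) {
		if ((timer && time(NULL) > timer) ||
		    (warns && warns >= warnlimit))
			return 1;		/* EDQUOT: grace period is over */
	}
	return 0;				/* reservation allowed */
}
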
771
772
773/*
774 * Given a dquot(s), make disk block and/or inode reservations against them.
775 * The fact that this does the reservation against both the usr and
776 * grp quotas is important, because this follows a both-or-nothing
777 * approach.
778 *
779 * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
780 * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
781 * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
782 * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
783 * dquots are unlocked on return, if they were not locked by caller.
784 */
785int
786xfs_trans_reserve_quota_bydquots(
787 xfs_trans_t *tp,
788 xfs_mount_t *mp,
789 xfs_dquot_t *udqp,
790 xfs_dquot_t *gdqp,
791 long nblks,
792 long ninos,
793 uint flags)
794{
795 int resvd;
796
797 if (! XFS_IS_QUOTA_ON(mp))
798 return (0);
799
800 if (tp && tp->t_dqinfo == NULL)
801 xfs_trans_alloc_dqinfo(tp);
802
803 ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
804 resvd = 0;
805
806 if (udqp) {
807 if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags))
808 return (EDQUOT);
809 resvd = 1;
810 }
811
812 if (gdqp) {
813 if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) {
814 /*
815 * can't do it, so backout previous reservation
816 */
817 if (resvd) {
818 flags |= XFS_QMOPT_FORCE_RES;
819 xfs_trans_dqresv(tp, mp, udqp,
820 -nblks, -ninos, flags);
821 }
822 return (EDQUOT);
823 }
824 }
825
826 /*
827 * Didn't change anything critical, so there's no need to log.
828 */
829 return (0);
830}
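
The function above is both-or-nothing: if the group reservation fails after
the user one succeeded, the user reservation is backed out by re-reserving
negative counts with XFS_QMOPT_FORCE_RES, so the undo itself cannot fail. A
self-contained sketch of that pattern; all names here are illustrative:

#include <assert.h>

#define FORCE 1

struct resv { long limit, held; };

/* Reserve nblks against r; with FORCE the limit check is skipped,
 * which is what makes the undo path infallible. */
static int do_reserve(struct resv *r, long nblks, int flags)
{
	if (!(flags & FORCE) && r->held + nblks > r->limit)
		return -1;			/* over quota */
	r->held += nblks;
	return 0;
}

static int reserve_both(struct resv *usr, struct resv *grp, long nblks)
{
	if (do_reserve(usr, nblks, 0))
		return -1;
	if (do_reserve(grp, nblks, 0)) {
		do_reserve(usr, -nblks, FORCE);	/* back out, cannot fail */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct resv usr = { 100, 0 }, grp = { 5, 0 };

	assert(reserve_both(&usr, &grp, 10) != 0);	/* grp is over quota */
	assert(usr.held == 0 && grp.held == 0);		/* fully backed out */
	return 0;
}
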
831
832
833/*
834 * Lock the dquot and change the reservation if we can.
835 * This doesn't change the actual usage, just the reservation.
836 * The inode sent in is locked.
837 *
838 * Returns 0 on success, EDQUOT or other errors otherwise
839 */
840STATIC int
841xfs_trans_reserve_quota_nblks(
842 xfs_trans_t *tp,
843 xfs_mount_t *mp,
844 xfs_inode_t *ip,
845 long nblks,
846 long ninos,
847 uint type)
848{
849 int error;
850
851 if (!XFS_IS_QUOTA_ON(mp))
852 return (0);
853
854 ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
855 ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
856
857 ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
858 ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
859 ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS ||
860 (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS);
861
862 /*
863 * Reserve nblks against these dquots, with trans as the mediator.
864 */
865 error = xfs_trans_reserve_quota_bydquots(tp, mp,
866 ip->i_udquot, ip->i_gdquot,
867 nblks, ninos,
868 type);
869 return (error);
870}
871
872/*
873 * This routine is called to allocate a quotaoff log item.
874 */
875xfs_qoff_logitem_t *
876xfs_trans_get_qoff_item(
877 xfs_trans_t *tp,
878 xfs_qoff_logitem_t *startqoff,
879 uint flags)
880{
881 xfs_qoff_logitem_t *q;
882
883 ASSERT(tp != NULL);
884
885 q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
886 ASSERT(q != NULL);
887
888 /*
889 * Get a log_item_desc to point at the new item.
890 */
891 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
892
893 return (q);
894}
895
896
897/*
898 * This is called to mark the quotaoff logitem as needing
899 * to be logged when the transaction is committed. The logitem must
900 * already be associated with the given transaction.
901 */
902void
903xfs_trans_log_quotaoff_item(
904 xfs_trans_t *tp,
905 xfs_qoff_logitem_t *qlp)
906{
907 xfs_log_item_desc_t *lidp;
908
909 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
910 ASSERT(lidp != NULL);
911
912 tp->t_flags |= XFS_TRANS_DIRTY;
913 lidp->lid_flags |= XFS_LID_DIRTY;
914}
915
916STATIC void
917xfs_trans_alloc_dqinfo(
918 xfs_trans_t *tp)
919{
920 (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
921}
922
923STATIC void
924xfs_trans_free_dqinfo(
925 xfs_trans_t *tp)
926{
927 if (!tp->t_dqinfo)
928 return;
929 kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
930 (tp)->t_dqinfo = NULL;
931}
932
933xfs_dqtrxops_t xfs_trans_dquot_ops = {
934 .qo_dup_dqinfo = xfs_trans_dup_dqinfo,
935 .qo_free_dqinfo = xfs_trans_free_dqinfo,
936 .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino,
937 .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas,
938 .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks,
939 .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots,
940 .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots,
941};
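
The xfs_dqtrxops_t table above is an ops vector: the transaction core calls
the quota hooks through function pointers, so it never references quota
symbols directly, and a missing table can mean quota support is absent. A
standalone sketch of the pattern; the names here are illustrative:

#include <stdio.h>

struct dqtrxops {
	void (*apply_deltas)(void *tp);
	void (*free_info)(void *tp);
};

static void my_apply(void *tp) { printf("applying deltas for %p\n", tp); }
static void my_free(void *tp)  { printf("freeing dqinfo for %p\n", tp); }

static const struct dqtrxops quota_ops = {
	.apply_deltas = my_apply,
	.free_info = my_free,
};

/* The core treats a NULL table (or NULL slot) as "feature not present". */
static void trans_commit(void *tp, const struct dqtrxops *ops)
{
	if (ops && ops->apply_deltas)
		ops->apply_deltas(tp);
	if (ops && ops->free_info)
		ops->free_info(tp);
}

int main(void)
{
	int tp;
	trans_commit(&tp, &quota_ops);	/* with quota */
	trans_commit(&tp, NULL);	/* without quota: no-ops */
	return 0;
}
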
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
new file mode 100644
index 000000000000..7d6e1f37df10
--- /dev/null
+++ b/fs/xfs/support/debug.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "debug.h"
34
35#include <asm/page.h>
36#include <linux/sched.h>
37#include <linux/kernel.h>
38
39int doass = 1;
40static char message[256]; /* keep it off the stack */
41static DEFINE_SPINLOCK(xfs_err_lock);
42
43/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */
44#define XFS_MAX_ERR_LEVEL 7
45#define XFS_ERR_MASK ((1 << 3) - 1)
46static char *err_level[XFS_MAX_ERR_LEVEL+1] =
47 {KERN_EMERG, KERN_ALERT, KERN_CRIT,
48 KERN_ERR, KERN_WARNING, KERN_NOTICE,
49 KERN_INFO, KERN_DEBUG};
50
51void
52assfail(char *a, char *f, int l)
53{
54 printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l);
55 BUG();
56}
57
58#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM))
59
60unsigned long
61random(void)
62{
63 static unsigned long RandomValue = 1;
64 /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
65 register long rv = RandomValue;
66 register long lo;
67 register long hi;
68
69 hi = rv / 127773;
70 lo = rv % 127773;
71 rv = 16807 * lo - 2836 * hi;
72 if( rv <= 0 ) rv += 2147483647;
73 return( RandomValue = rv );
74}
75
76int
77get_thread_id(void)
78{
79 return current->pid;
80}
81
82#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */
83
84void
85cmn_err(register int level, char *fmt, ...)
86{
87 char *fp = fmt;
88 int len;
89 ulong flags;
90 va_list ap;
91
92 level &= XFS_ERR_MASK;
93 if (level > XFS_MAX_ERR_LEVEL)
94 level = XFS_MAX_ERR_LEVEL;
95 spin_lock_irqsave(&xfs_err_lock,flags);
96 va_start(ap, fmt);
 97 if (*fmt == '!') fp++;	/* skip leading '!', an IRIX cmn_err marker */
98 len = vsprintf(message, fp, ap);
99 if (message[len-1] != '\n')
100 strcat(message, "\n");
101 printk("%s%s", err_level[level], message);
102 va_end(ap);
103 spin_unlock_irqrestore(&xfs_err_lock,flags);
104
105 if (level == CE_PANIC)
106 BUG();
107}
108
109
110void
111icmn_err(register int level, char *fmt, va_list ap)
112{
113 ulong flags;
114 int len;
115
116 level &= XFS_ERR_MASK;
117 if(level > XFS_MAX_ERR_LEVEL)
118 level = XFS_MAX_ERR_LEVEL;
119 spin_lock_irqsave(&xfs_err_lock,flags);
120 len = vsprintf(message, fmt, ap);
121 if (message[len-1] != '\n')
122 strcat(message, "\n");
123 spin_unlock_irqrestore(&xfs_err_lock,flags);
124 printk("%s%s", err_level[level], message);
125 if (level == CE_PANIC)
126 BUG();
127}
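
cmn_err() above maps IRIX-style CE_* levels onto Linux KERN_* prefixes with a
table lookup, masking and clamping the level first. A standalone sketch of
the same translation; the literal "<n>" strings stand in for the KERN_*
macros and the helper name is made up:

#include <stdio.h>

#define MAX_LEVEL 7
#define LEVEL_MASK ((1 << 3) - 1)

static const char *err_level[MAX_LEVEL + 1] = {
	"<0>", "<1>", "<2>", "<3>", "<4>", "<5>", "<6>", "<7>"
};

/* Mask the level into range, clamp it, then prefix the message with
 * the string for that level, just as the function above does. */
static void demo_err(int level, const char *msg)
{
	level &= LEVEL_MASK;
	if (level > MAX_LEVEL)
		level = MAX_LEVEL;
	printf("%s%s\n", err_level[level], msg);
}

int main(void)
{
	demo_err(7, "a CE_DEBUG message");	/* prints "<7>a CE_DEBUG message" */
	return 0;
}
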
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
new file mode 100644
index 000000000000..40b0f4c54d9e
--- /dev/null
+++ b/fs/xfs/support/debug.h
@@ -0,0 +1,72 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_DEBUG_H__
33#define __XFS_SUPPORT_DEBUG_H__
34
35#include <stdarg.h>
36
37#define CE_DEBUG 7 /* debug */
38#define CE_CONT 6 /* continuation */
39#define CE_NOTE 5 /* notice */
40#define CE_WARN 4 /* warning */
41#define CE_ALERT 1 /* alert */
42#define CE_PANIC 0 /* panic */
43
44extern void icmn_err(int, char *, va_list);
45/* PRINTFLIKE2 */
46extern void cmn_err(int, char *, ...);
47
48#ifndef STATIC
49# define STATIC static
50#endif
51
52#ifdef DEBUG
53# ifdef lint
54# define ASSERT(EX) ((void)0) /* avoid "constant in conditional" babble */
55# else
56# define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__))
57# endif /* lint */
58#else
59# define ASSERT(x) ((void)0)
60#endif
61
62extern int doass; /* dynamically turn off asserts */
63extern void assfail(char *, char *, int);
64#ifdef DEBUG
65extern unsigned long random(void);
66extern int get_thread_id(void);
67#endif
68
69#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__))
70#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */
71
72#endif /* __XFS_SUPPORT_DEBUG_H__ */
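
The ASSERT macro above is compiled in only for DEBUG builds and can also be
switched off at run time by clearing doass. A standalone sketch of the same
two-level gating; build with -DDEBUG to activate the checks:

#include <stdio.h>
#include <stdlib.h>

static int doass = 1;	/* run-time switch, like the kernel's doass */

static void assfail(const char *a, const char *f, int l)
{
	fprintf(stderr, "assertion failed: %s, file: %s, line: %d\n", a, f, l);
	abort();
}

#ifdef DEBUG
#define ASSERT(EX) ((!doass || (EX)) ? (void)0 : assfail(#EX, __FILE__, __LINE__))
#else
#define ASSERT(EX) ((void)0)	/* compiles away entirely */
#endif

int main(void)
{
	ASSERT(1 + 1 == 2);	/* passes (or vanishes without -DDEBUG) */
	doass = 0;
	ASSERT(0);		/* skipped: asserts disabled at run time */
	return 0;
}
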
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
new file mode 100644
index 000000000000..3dae14c8c55a
--- /dev/null
+++ b/fs/xfs/support/ktrace.c
@@ -0,0 +1,346 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35static kmem_zone_t *ktrace_hdr_zone;
36static kmem_zone_t *ktrace_ent_zone;
37static int ktrace_zentries;
38
39void
40ktrace_init(int zentries)
41{
42 ktrace_zentries = zentries;
43
44 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
45 "ktrace_hdr");
46 ASSERT(ktrace_hdr_zone);
47
48 ktrace_ent_zone = kmem_zone_init(ktrace_zentries
49 * sizeof(ktrace_entry_t),
50 "ktrace_ent");
51 ASSERT(ktrace_ent_zone);
52}
53
54void
55ktrace_uninit(void)
56{
57 kmem_cache_destroy(ktrace_hdr_zone);
58 kmem_cache_destroy(ktrace_ent_zone);
59}
60
61/*
62 * ktrace_alloc()
63 *
64 * Allocate a ktrace header and enough buffering for the given
65 * number of entries.
66 */
67ktrace_t *
68ktrace_alloc(int nentries, int sleep)
69{
70 ktrace_t *ktp;
71 ktrace_entry_t *ktep;
72
73 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
74
75 if (ktp == (ktrace_t*)NULL) {
76 /*
77 * KM_SLEEP callers don't expect failure.
78 */
79 if (sleep & KM_SLEEP)
80 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
81
82 return NULL;
83 }
84
85 /*
86 * Buffers of the default size (ktrace_zentries entries) come from the zone.
87 */
88 if (nentries == ktrace_zentries) {
89 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
90 sleep);
91 } else {
92 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
93 sleep);
94 }
95
96 if (ktep == NULL) {
97 /*
98 * KM_SLEEP callers don't expect failure.
99 */
100 if (sleep & KM_SLEEP)
101 panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
102
103 kmem_free(ktp, sizeof(*ktp));
104
105 return NULL;
106 }
107
108 spinlock_init(&(ktp->kt_lock), "kt_lock");
109
110 ktp->kt_entries = ktep;
111 ktp->kt_nentries = nentries;
112 ktp->kt_index = 0;
113 ktp->kt_rollover = 0;
114 return ktp;
115}
116
117
118/*
119 * ktrace_free()
120 *
121 * Free up the ktrace header and buffer. It is up to the caller
122 * to ensure that no-one is referencing it.
123 */
124void
125ktrace_free(ktrace_t *ktp)
126{
127 int entries_size;
128
129 if (ktp == (ktrace_t *)NULL)
130 return;
131
132 spinlock_destroy(&ktp->kt_lock);
133
134 /*
135 * Zone-sized buffers (ktrace_zentries entries) go back to the zone.
136 */
137 if (ktp->kt_nentries == ktrace_zentries) {
138 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
139 } else {
140 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
141
142 kmem_free(ktp->kt_entries, entries_size);
143 }
144
145 kmem_zone_free(ktrace_hdr_zone, ktp);
146}
147
148
149/*
150 * Enter the given values into the "next" entry in the trace buffer.
151 * kt_index is always the index of the next entry to be filled.
152 */
153void
154ktrace_enter(
155 ktrace_t *ktp,
156 void *val0,
157 void *val1,
158 void *val2,
159 void *val3,
160 void *val4,
161 void *val5,
162 void *val6,
163 void *val7,
164 void *val8,
165 void *val9,
166 void *val10,
167 void *val11,
168 void *val12,
169 void *val13,
170 void *val14,
171 void *val15)
172{
173 static lock_t wrap_lock = SPIN_LOCK_UNLOCKED;
174 unsigned long flags;
175 int index;
176 ktrace_entry_t *ktep;
177
178 ASSERT(ktp != NULL);
179
180 /*
181 * Grab an entry by pushing the index up to the next one.
182 */
183 spin_lock_irqsave(&wrap_lock, flags);
184 index = ktp->kt_index;
185 if (++ktp->kt_index == ktp->kt_nentries)
186 ktp->kt_index = 0;
187 spin_unlock_irqrestore(&wrap_lock, flags);
188
189 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
190 ktp->kt_rollover = 1;
191
192 ASSERT((index >= 0) && (index < ktp->kt_nentries));
193
194 ktep = &(ktp->kt_entries[index]);
195
196 ktep->val[0] = val0;
197 ktep->val[1] = val1;
198 ktep->val[2] = val2;
199 ktep->val[3] = val3;
200 ktep->val[4] = val4;
201 ktep->val[5] = val5;
202 ktep->val[6] = val6;
203 ktep->val[7] = val7;
204 ktep->val[8] = val8;
205 ktep->val[9] = val9;
206 ktep->val[10] = val10;
207 ktep->val[11] = val11;
208 ktep->val[12] = val12;
209 ktep->val[13] = val13;
210 ktep->val[14] = val14;
211 ktep->val[15] = val15;
212}
213
214/*
215 * Return the number of entries in the trace buffer.
216 */
217int
218ktrace_nentries(
219 ktrace_t *ktp)
220{
221 if (ktp == NULL) {
222 return 0;
223 }
224
225 return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
226}
227
228/*
229 * ktrace_first()
230 *
231 * This is used to find the start of the trace buffer.
232 * In conjunction with ktrace_next() it can be used to
233 * iterate through the entire trace buffer. This code does
234 * not do any locking because it is assumed that it is called
235 * from the debugger.
236 *
237 * The caller must pass in a pointer to a ktrace_snap
238 * structure in which we will keep some state used to
239 * iterate through the buffer. This state must not be touched
240 * by any code outside of this module.
241 */
242ktrace_entry_t *
243ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
244{
245 ktrace_entry_t *ktep;
246 int index;
247 int nentries;
248
249 if (ktp->kt_rollover)
250 index = ktp->kt_index;
251 else
252 index = 0;
253
254 ktsp->ks_start = index;
255 ktep = &(ktp->kt_entries[index]);
256
257 nentries = ktrace_nentries(ktp);
258 index++;
259 if (index < nentries) {
260 ktsp->ks_index = index;
261 } else {
262 ktsp->ks_index = 0;
263 if (index > nentries)
264 ktep = NULL;
265 }
266 return ktep;
267}
268
269/*
270 * ktrace_next()
271 *
272 * This is used to iterate through the entries of the given
273 * trace buffer. The caller must pass in the ktrace_snap_t
274 * structure initialized by ktrace_first(). The return value
275 * will be either a pointer to the next ktrace_entry or NULL
276 * if all of the entries have been traversed.
277 */
278ktrace_entry_t *
279ktrace_next(
280 ktrace_t *ktp,
281 ktrace_snap_t *ktsp)
282{
283 int index;
284 ktrace_entry_t *ktep;
285
286 index = ktsp->ks_index;
287 if (index == ktsp->ks_start) {
288 ktep = NULL;
289 } else {
290 ktep = &ktp->kt_entries[index];
291 }
292
293 index++;
294 if (index == ktrace_nentries(ktp)) {
295 ktsp->ks_index = 0;
296 } else {
297 ktsp->ks_index = index;
298 }
299
300 return ktep;
301}
302
303/*
304 * ktrace_skip()
305 *
306 * Skip the next "count" entries and return the entry after that.
307 * Return NULL if this causes us to iterate past the beginning again.
308 */
309ktrace_entry_t *
310ktrace_skip(
311 ktrace_t *ktp,
312 int count,
313 ktrace_snap_t *ktsp)
314{
315 int index;
316 int new_index;
317 ktrace_entry_t *ktep;
318 int nentries = ktrace_nentries(ktp);
319
320 index = ktsp->ks_index;
321 new_index = index + count;
322 while (new_index >= nentries) {
323 new_index -= nentries;
324 }
325 if (index == ktsp->ks_start) {
326 /*
327 * We've iterated around to the start, so we're done.
328 */
329 ktep = NULL;
330 } else if ((new_index < index) && (index < ktsp->ks_index)) {
331 /*
332 * We've skipped past the start again, so we're done.
333 */
334 ktep = NULL;
335 ktsp->ks_index = ktsp->ks_start;
336 } else {
337 ktep = &(ktp->kt_entries[new_index]);
338 new_index++;
339 if (new_index == nentries) {
340 ktsp->ks_index = 0;
341 } else {
342 ktsp->ks_index = new_index;
343 }
344 }
345 return ktep;
346}
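
ktrace_first()/ktrace_next() above traverse a rollover ring: once the buffer
has wrapped, the oldest entry sits at kt_index; otherwise traversal starts at
0, and it stops when the snapshot index comes back around to where it began.
A minimal standalone model of that traversal; the names are illustrative:

#include <stdio.h>

#define N 4

int main(void)
{
	int buf[N];
	int index = 0, rollover = 0;

	for (int v = 1; v <= 6; v++) {		/* write 6 entries into 4 slots */
		buf[index] = v;
		if (++index == N) {
			index = 0;
			rollover = 1;
		}
	}

	/* If we rolled over, the oldest entry is at `index`; else at 0. */
	int start = rollover ? index : 0;
	int nentries = rollover ? N : index;
	int i = start;
	for (int seen = 0; seen < nentries; seen++) {
		printf("%d ", buf[i]);		/* prints: 3 4 5 6 */
		if (++i == nentries)
			i = 0;
	}
	printf("\n");
	return 0;
}
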
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
new file mode 100644
index 000000000000..92d1a1a5d04b
--- /dev/null
+++ b/fs/xfs/support/ktrace.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_KTRACE_H__
33#define __XFS_SUPPORT_KTRACE_H__
34
35#include <spin.h>
36
37/*
38 * Trace buffer entry structure.
39 */
40typedef struct ktrace_entry {
41 void *val[16];
42} ktrace_entry_t;
43
44/*
45 * Trace buffer header structure.
46 */
47typedef struct ktrace {
48 lock_t kt_lock; /* mutex to guard counters */
49 int kt_nentries; /* number of entries in trace buf */
50 int kt_index; /* current index in entries */
51 int kt_rollover;
52 ktrace_entry_t *kt_entries; /* buffer of entries */
53} ktrace_t;
54
55/*
56 * Trace buffer snapshot structure.
57 */
58typedef struct ktrace_snap {
59 int ks_start; /* kt_index at time of snap */
60 int ks_index; /* current index */
61} ktrace_snap_t;
62
63
64#ifdef CONFIG_XFS_TRACE
65
66extern void ktrace_init(int zentries);
67extern void ktrace_uninit(void);
68
69extern ktrace_t *ktrace_alloc(int, int);
70extern void ktrace_free(ktrace_t *);
71
72extern void ktrace_enter(
73 ktrace_t *,
74 void *,
75 void *,
76 void *,
77 void *,
78 void *,
79 void *,
80 void *,
81 void *,
82 void *,
83 void *,
84 void *,
85 void *,
86 void *,
87 void *,
88 void *,
89 void *);
90
91extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *);
92extern int ktrace_nentries(ktrace_t *);
93extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *);
94extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
95
96#else
97#define ktrace_init(x) do { } while (0)
98#define ktrace_uninit() do { } while (0)
99#endif /* CONFIG_XFS_TRACE */
100
101#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c
new file mode 100644
index 000000000000..15b5194f16b2
--- /dev/null
+++ b/fs/xfs/support/move.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35/* Read from kernel buffer at src to user/kernel buffer defined
36 * by the uio structure. Advance the pointer in the uio struct
37 * as we go.
38 */
39int
40uio_read(caddr_t src, size_t len, struct uio *uio)
41{
42 size_t count;
43
44 if (!len || !uio->uio_resid)
45 return 0;
46
47 count = uio->uio_iov->iov_len;
48 if (!count)
49 return 0;
50 if (count > len)
51 count = len;
52
53 if (uio->uio_segflg == UIO_USERSPACE) {
54 if (copy_to_user(uio->uio_iov->iov_base, src, count))
55 return EFAULT;
56 } else {
57 ASSERT(uio->uio_segflg == UIO_SYSSPACE);
58 memcpy(uio->uio_iov->iov_base, src, count);
59 }
60
61 uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count);
62 uio->uio_iov->iov_len -= count;
63 uio->uio_offset += count;
64 uio->uio_resid -= count;
65 return 0;
66}
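
uio_read() above advances the iovec base, its remaining length, the file
offset, and the residual count in lockstep. A userspace sketch of that
bookkeeping for the UIO_SYSSPACE (memcpy) case; the trimmed-down struct
names are hypothetical:

#include <assert.h>
#include <string.h>

struct miovec { char *base; size_t len; };
struct muio { struct miovec *iov; long offset; size_t resid; };

/* Copy up to len bytes into the current iovec and advance base,
 * iov_len, offset and resid together, as uio_read() does above. */
static size_t muio_read(const char *src, size_t len, struct muio *uio)
{
	size_t count = uio->iov->len;

	if (!len || !uio->resid || !count)
		return 0;
	if (count > len)
		count = len;

	memcpy(uio->iov->base, src, count);
	uio->iov->base += count;
	uio->iov->len -= count;
	uio->offset += count;
	uio->resid -= count;
	return count;
}

int main(void)
{
	char dst[8];
	struct miovec iov = { dst, sizeof(dst) };
	struct muio uio = { &iov, 0, sizeof(dst) };

	assert(muio_read("abcd", 4, &uio) == 4);
	assert(uio.offset == 4 && uio.resid == 4 && iov.len == 4);
	assert(memcmp(dst, "abcd", 4) == 0);
	return 0;
}
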
diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h
new file mode 100644
index 000000000000..3d406dc1c89e
--- /dev/null
+++ b/fs/xfs/support/move.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 * Portions Copyright (c) 1982, 1986, 1993, 1994
33 * The Regents of the University of California. All rights reserved.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. Neither the name of the University nor the names of its contributors
44 * may be used to endorse or promote products derived from this software
45 * without specific prior written permission.
46 *
47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 * SUCH DAMAGE.
58 */
59#ifndef __XFS_SUPPORT_MOVE_H__
60#define __XFS_SUPPORT_MOVE_H__
61
62#include <linux/uio.h>
63#include <asm/uaccess.h>
64
65/* Segment flag values. */
66enum uio_seg {
67 UIO_USERSPACE, /* from user data space */
68 UIO_SYSSPACE, /* from system space */
69};
70
71struct uio {
72 struct iovec *uio_iov; /* pointer to array of iovecs */
73 int uio_iovcnt; /* number of iovecs in array */
74 xfs_off_t uio_offset; /* offset in file this uio corresponds to */
75 int uio_resid; /* residual i/o count */
76 enum uio_seg uio_segflg; /* see above */
77};
78
79typedef struct uio uio_t;
80typedef struct iovec iovec_t;
81
82extern int uio_read (caddr_t, size_t, uio_t *);
83
84#endif /* __XFS_SUPPORT_MOVE_H__ */
diff --git a/fs/xfs/support/qsort.c b/fs/xfs/support/qsort.c
new file mode 100644
index 000000000000..1ec824140cf7
--- /dev/null
+++ b/fs/xfs/support/qsort.c
@@ -0,0 +1,155 @@
1/*
2 * Copyright (c) 1992, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <linux/kernel.h>
31#include <linux/string.h>
32
33/*
34 * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
35 */
36#define swapcode(TYPE, parmi, parmj, n) { \
37 long i = (n) / sizeof (TYPE); \
38 register TYPE *pi = (TYPE *) (parmi); \
39 register TYPE *pj = (TYPE *) (parmj); \
40 do { \
41 register TYPE t = *pi; \
42 *pi++ = *pj; \
43 *pj++ = t; \
44 } while (--i > 0); \
45}
46
47#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
48 es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
49
50static __inline void
51swapfunc(char *a, char *b, int n, int swaptype)
52{
53 if (swaptype <= 1)
54 swapcode(long, a, b, n)
55 else
56 swapcode(char, a, b, n)
57}
58
59#define swap(a, b) \
60 if (swaptype == 0) { \
61 long t = *(long *)(a); \
62 *(long *)(a) = *(long *)(b); \
63 *(long *)(b) = t; \
64 } else \
65 swapfunc(a, b, es, swaptype)
66
67#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype)
68
69static __inline char *
70med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *))
71{
72 return cmp(a, b) < 0 ?
73 (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a ))
74 :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c ));
75}
76
77void
78qsort(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *))
79{
80 char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
81 int d, r, swaptype, swap_cnt;
82 register char *a = aa;
83
84loop: SWAPINIT(a, es);
85 swap_cnt = 0;
86 if (n < 7) {
87 for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es)
88 for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
89 pl -= es)
90 swap(pl, pl - es);
91 return;
92 }
93 pm = (char *)a + (n / 2) * es;
94 if (n > 7) {
95 pl = (char *)a;
96 pn = (char *)a + (n - 1) * es;
97 if (n > 40) {
98 d = (n / 8) * es;
99 pl = med3(pl, pl + d, pl + 2 * d, cmp);
100 pm = med3(pm - d, pm, pm + d, cmp);
101 pn = med3(pn - 2 * d, pn - d, pn, cmp);
102 }
103 pm = med3(pl, pm, pn, cmp);
104 }
105 swap(a, pm);
106 pa = pb = (char *)a + es;
107
108 pc = pd = (char *)a + (n - 1) * es;
109 for (;;) {
110 while (pb <= pc && (r = cmp(pb, a)) <= 0) {
111 if (r == 0) {
112 swap_cnt = 1;
113 swap(pa, pb);
114 pa += es;
115 }
116 pb += es;
117 }
118 while (pb <= pc && (r = cmp(pc, a)) >= 0) {
119 if (r == 0) {
120 swap_cnt = 1;
121 swap(pc, pd);
122 pd -= es;
123 }
124 pc -= es;
125 }
126 if (pb > pc)
127 break;
128 swap(pb, pc);
129 swap_cnt = 1;
130 pb += es;
131 pc -= es;
132 }
133 if (swap_cnt == 0) { /* Switch to insertion sort */
134 for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
135 for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
136 pl -= es)
137 swap(pl, pl - es);
138 return;
139 }
140
141 pn = (char *)a + n * es;
142 r = min(pa - (char *)a, pb - pa);
143 vecswap(a, pb - r, r);
144 r = min((long)(pd - pc), (long)(pn - pd - es));
145 vecswap(pb, pn - r, r);
146 if ((r = pb - pa) > es)
147 qsort(a, r / es, es, cmp);
148 if ((r = pd - pc) > es) {
149 /* Iterate rather than recurse to save stack space */
150 a = pn - r;
151 n = r / es;
152 goto loop;
153 }
154/* qsort(pn - r, r / es, es, cmp);*/
155}
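/*
 * [Editor's sketch, not part of the original file] Minimal usage of
 * the qsort() above with a caller-supplied comparison that returns
 * negative/zero/positive; the record type and helpers are hypothetical.
 */
#if 0
struct demo_rec {
	int	key;
};

static int
demo_cmp(const void *va, const void *vb)
{
	const struct demo_rec *a = va, *b = vb;

	return a->key - b->key;		/* ascending by key */
}

static void
demo_sort(struct demo_rec *recs, size_t nrecs)
{
	qsort(recs, nrecs, sizeof(recs[0]), demo_cmp);
}
#endif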
diff --git a/fs/xfs/support/qsort.h b/fs/xfs/support/qsort.h
new file mode 100644
index 000000000000..94263106d716
--- /dev/null
+++ b/fs/xfs/support/qsort.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#ifndef QSORT_H
34#define QSORT_H
35
36extern void qsort (void *const pbase,
37 size_t total_elems,
38 size_t size,
39 int (*cmp)(const void *, const void *));
40
41#endif
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
new file mode 100644
index 000000000000..81f40cfcb267
--- /dev/null
+++ b/fs/xfs/support/uuid.c
@@ -0,0 +1,151 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include <xfs.h>
34
35static mutex_t uuid_monitor;
36static int uuid_table_size;
37static uuid_t *uuid_table;
38
39void
40uuid_init(void)
41{
42 mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor");
43}
44
45/*
46 * uuid_getnodeuniq - obtain the node unique fields of a UUID.
47 *
48 * This is not in any way a standard or condoned UUID function;
49 * it is just something that's needed for user-level file handles.
50 */
51void
52uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
53{
54 char *uu = (char *)uuid;
55
56 /* on IRIX, this function assumes big-endian fields within
57 * the uuid, so we use INT_GET to get the same result on
58 * little-endian systems
59 */
60
61 fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) +
62 INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT);
63 fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT);
64}
65
66void
67uuid_create_nil(uuid_t *uuid)
68{
69 memset(uuid, 0, sizeof(*uuid));
70}
71
72int
73uuid_is_nil(uuid_t *uuid)
74{
75 int i;
76 char *cp = (char *)uuid;
77
78 if (uuid == NULL)
79 return 0;
80 /* implied check of version number here... */
81 for (i = 0; i < sizeof *uuid; i++)
82 if (*cp++) return 0; /* not nil */
83 return 1; /* is nil */
84}
85
86int
87uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
88{
89 return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
90}
91
92/*
93 * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
94 * 64-bit words. NOTE: This function cannot be changed EVER. Although
95 * brain-dead, some applications depend on this 64-bit value remaining
96 * persistent. Specifically, DMI vendors store the value as a persistent
97 * filehandle.
98 */
99__uint64_t
100uuid_hash64(uuid_t *uuid)
101{
102 __uint64_t *sp = (__uint64_t *)uuid;
103
104 return sp[0] + sp[1];
105}
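/*
 * [Editor's note] A worked example of the sum above, with hypothetical
 * uuid contents read as two host-endian 64-bit words:
 *
 *	sp[0] = 0x0123456789abcdef
 *	sp[1] = 0xfedcba9876543210
 *	uuid_hash64() = sp[0] + sp[1] = 0xffffffffffffffff
 *
 * The result depends on host endianness, since the uuid bytes are
 * reinterpreted in place.
 */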
106
107int
108uuid_table_insert(uuid_t *uuid)
109{
110 int i, hole;
111
112 mutex_lock(&uuid_monitor, PVFS);
113 for (i = 0, hole = -1; i < uuid_table_size; i++) {
114 if (uuid_is_nil(&uuid_table[i])) {
115 hole = i;
116 continue;
117 }
118 if (uuid_equal(uuid, &uuid_table[i])) {
119 mutex_unlock(&uuid_monitor);
120 return 0;
121 }
122 }
123 if (hole < 0) {
124 uuid_table = kmem_realloc(uuid_table,
125 (uuid_table_size + 1) * sizeof(*uuid_table),
126 uuid_table_size * sizeof(*uuid_table),
127 KM_SLEEP);
128 hole = uuid_table_size++;
129 }
130 uuid_table[hole] = *uuid;
131 mutex_unlock(&uuid_monitor);
132 return 1;
133}
134
135void
136uuid_table_remove(uuid_t *uuid)
137{
138 int i;
139
140 mutex_lock(&uuid_monitor, PVFS);
141 for (i = 0; i < uuid_table_size; i++) {
142 if (uuid_is_nil(&uuid_table[i]))
143 continue;
144 if (!uuid_equal(uuid, &uuid_table[i]))
145 continue;
146 uuid_create_nil(&uuid_table[i]);
147 break;
148 }
149 ASSERT(i < uuid_table_size);
150 mutex_unlock(&uuid_monitor);
151}
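/*
 * [Editor's sketch, not part of the original file] Typical use of the
 * table above to reject a duplicate filesystem uuid; the caller and
 * its error handling are hypothetical.
 */
#if 0
static int
example_register_fs_uuid(uuid_t *uuid)
{
	if (uuid_is_nil(uuid))
		return 0;		/* nothing to track */
	if (!uuid_table_insert(uuid))
		return EINVAL;		/* duplicate: already mounted */
	/* ... and on unmount: uuid_table_remove(uuid); */
	return 0;
}
#endif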
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
new file mode 100644
index 000000000000..5220ea58ba2b
--- /dev/null
+++ b/fs/xfs/support/uuid.h
@@ -0,0 +1,48 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SUPPORT_UUID_H__
33#define __XFS_SUPPORT_UUID_H__
34
35typedef struct {
36 unsigned char __u_bits[16];
37} uuid_t;
38
39void uuid_init(void);
40void uuid_create_nil(uuid_t *uuid);
41int uuid_is_nil(uuid_t *uuid);
42int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
43void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
44__uint64_t uuid_hash64(uuid_t *uuid);
45int uuid_table_insert(uuid_t *uuid);
46void uuid_table_remove(uuid_t *uuid);
47
48#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
new file mode 100644
index 000000000000..7e276dcaf4dc
--- /dev/null
+++ b/fs/xfs/xfs.h
@@ -0,0 +1,40 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_H__
33#define __XFS_H__
34
35#include <linux-2.6/xfs_linux.h>
36
37#include <xfs_fs.h>
38#include <xfs_macros.h>
39
40#endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
new file mode 100644
index 000000000000..8d01dce8c532
--- /dev/null
+++ b/fs/xfs/xfs_acl.c
@@ -0,0 +1,937 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_inum.h"
36#include "xfs_dir.h"
37#include "xfs_dir2.h"
38#include "xfs_alloc_btree.h"
39#include "xfs_bmap_btree.h"
40#include "xfs_ialloc_btree.h"
41#include "xfs_btree.h"
42#include "xfs_attr_sf.h"
43#include "xfs_dir_sf.h"
44#include "xfs_dir2_sf.h"
45#include "xfs_dinode.h"
46#include "xfs_inode.h"
47#include "xfs_acl.h"
48#include "xfs_mac.h"
49#include "xfs_attr.h"
50
51#include <linux/posix_acl_xattr.h>
52
53STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *);
54STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
55STATIC void xfs_acl_get_endian(xfs_acl_t *);
56STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
57STATIC int xfs_acl_invalid(xfs_acl_t *);
58STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
59STATIC void xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
60STATIC void xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
61STATIC int xfs_acl_allow_set(vnode_t *, int);
62
63kmem_zone_t *xfs_acl_zone;
64
65
66/*
67 * Test for existence of access ACL attribute as efficiently as possible.
68 */
69int
70xfs_acl_vhasacl_access(
71 vnode_t *vp)
72{
73 int error;
74
75 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
76 return (error == 0);
77}
78
79/*
80 * Test for existence of default ACL attribute as efficiently as possible.
81 */
82int
83xfs_acl_vhasacl_default(
84 vnode_t *vp)
85{
86 int error;
87
88 if (vp->v_type != VDIR)
89 return 0;
90 xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
91 return (error == 0);
92}
93
94/*
95 * Convert from extended attribute representation to in-memory for XFS.
96 */
97STATIC int
98posix_acl_xattr_to_xfs(
99 posix_acl_xattr_header *src,
100 size_t size,
101 xfs_acl_t *dest)
102{
103 posix_acl_xattr_entry *src_entry;
104 xfs_acl_entry_t *dest_entry;
105 int n;
106
107 if (!src || !dest)
108 return EINVAL;
109
110 if (size < sizeof(posix_acl_xattr_header))
111 return EINVAL;
112
113 if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
114 return EOPNOTSUPP;
115
116 memset(dest, 0, sizeof(xfs_acl_t));
117 dest->acl_cnt = posix_acl_xattr_count(size);
118 if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
119 return EINVAL;
120
121 /*
122 * acl_set_file(3) may request that we set default ACLs with
123 * zero length -- defend (gracefully) against that here.
124 */
125 if (!dest->acl_cnt)
126 return 0;
127
128 src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
129 dest_entry = &dest->acl_entry[0];
130
131 for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
132 dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
133 if (_ACL_PERM_INVALID(dest_entry->ae_perm))
134 return EINVAL;
135 dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag);
136 switch(dest_entry->ae_tag) {
137 case ACL_USER:
138 case ACL_GROUP:
139 dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
140 break;
141 case ACL_USER_OBJ:
142 case ACL_GROUP_OBJ:
143 case ACL_MASK:
144 case ACL_OTHER:
145 dest_entry->ae_id = ACL_UNDEFINED_ID;
146 break;
147 default:
148 return EINVAL;
149 }
150 }
151 if (xfs_acl_invalid(dest))
152 return EINVAL;
153
154 return 0;
155}
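/*
 * [Editor's note] The source buffer decoded above is the standard
 * little-endian POSIX ACL xattr layout.  For example, an ACL of
 * "u::rw-,g::r--,o::r--" arrives roughly as (values representative,
 * not taken from the original source):
 *
 *	header:  a_version = POSIX_ACL_XATTR_VERSION	(le32)
 *	entry 0: e_tag = ACL_USER_OBJ,  e_perm = 6 (rw-), e_id unused
 *	entry 1: e_tag = ACL_GROUP_OBJ, e_perm = 4 (r--), e_id unused
 *	entry 2: e_tag = ACL_OTHER,     e_perm = 4 (r--), e_id unused
 */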
156
157/*
158 * Comparison function called from qsort().
159 * Primary key is ae_tag, secondary key is ae_id.
160 */
161STATIC int
162xfs_acl_entry_compare(
163 const void *va,
164 const void *vb)
165{
166 xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
167 *b = (xfs_acl_entry_t *)vb;
168
169 if (a->ae_tag == b->ae_tag)
170 return (a->ae_id - b->ae_id);
171 return (a->ae_tag - b->ae_tag);
172}
173
174/*
175 * Convert from in-memory XFS to extended attribute representation.
176 */
177STATIC int
178posix_acl_xfs_to_xattr(
179 xfs_acl_t *src,
180 posix_acl_xattr_header *dest,
181 size_t size)
182{
183 int n;
184 size_t new_size = posix_acl_xattr_size(src->acl_cnt);
185 posix_acl_xattr_entry *dest_entry;
186 xfs_acl_entry_t *src_entry;
187
188 if (size < new_size)
189 return -ERANGE;
190
191 /* Need to sort src XFS ACL by <ae_tag,ae_id> */
192 qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
193 xfs_acl_entry_compare);
194
195 dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
196 dest_entry = &dest->a_entries[0];
197 src_entry = &src->acl_entry[0];
198 for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
199 dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
200 if (_ACL_PERM_INVALID(src_entry->ae_perm))
201 return -EINVAL;
202 dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag);
203 switch (src_entry->ae_tag) {
204 case ACL_USER:
205 case ACL_GROUP:
206 dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
207 break;
208 case ACL_USER_OBJ:
209 case ACL_GROUP_OBJ:
210 case ACL_MASK:
211 case ACL_OTHER:
212 dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
213 break;
214 default:
215 return -EINVAL;
216 }
217 }
218 return new_size;
219}
220
221int
222xfs_acl_vget(
223 vnode_t *vp,
224 void *acl,
225 size_t size,
226 int kind)
227{
228 int error;
229 xfs_acl_t *xfs_acl = NULL;
230 posix_acl_xattr_header *ext_acl = acl;
231 int flags = 0;
232
233 VN_HOLD(vp);
234 if(size) {
235 if (!(_ACL_ALLOC(xfs_acl))) {
236 error = ENOMEM;
237 goto out;
238 }
239 memset(xfs_acl, 0, sizeof(xfs_acl_t));
240 } else
241 flags = ATTR_KERNOVAL;
242
243 xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
244 if (error)
245 goto out;
246
247 if (!size) {
248 error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
249 } else {
250 if (xfs_acl_invalid(xfs_acl)) {
251 error = EINVAL;
252 goto out;
253 }
254 if (kind == _ACL_TYPE_ACCESS) {
255 vattr_t va;
256
257 va.va_mask = XFS_AT_MODE;
258 VOP_GETATTR(vp, &va, 0, sys_cred, error);
259 if (error)
260 goto out;
261 xfs_acl_sync_mode(va.va_mode, xfs_acl);
262 }
263 error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
264 }
265out:
266 VN_RELE(vp);
267 if(xfs_acl)
268 _ACL_FREE(xfs_acl);
269 return -error;
270}
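/*
 * [Editor's note] As with getxattr(2), passing size == 0 above acts
 * as a size probe: the function returns the worst-case buffer size
 * for XFS_ACL_MAX_ENTRIES entries (posix_acl_xattr_size() of that
 * count), and the caller retries with a buffer at least that large.
 */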
271
272int
273xfs_acl_vremove(
274 vnode_t *vp,
275 int kind)
276{
277 int error;
278
279 VN_HOLD(vp);
280 error = xfs_acl_allow_set(vp, kind);
281 if (!error) {
282 VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
283 SGI_ACL_DEFAULT: SGI_ACL_FILE,
284 ATTR_ROOT, sys_cred, error);
285 if (error == ENOATTR)
286 error = 0; /* 'scool */
287 }
288 VN_RELE(vp);
289 return -error;
290}
291
292int
293xfs_acl_vset(
294 vnode_t *vp,
295 void *acl,
296 size_t size,
297 int kind)
298{
299 posix_acl_xattr_header *ext_acl = acl;
300 xfs_acl_t *xfs_acl;
301 int error;
302 int basicperms = 0; /* more than std unix perms? */
303
304 if (!acl)
305 return -EINVAL;
306
307 if (!(_ACL_ALLOC(xfs_acl)))
308 return -ENOMEM;
309
310 error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
311 if (error) {
312 _ACL_FREE(xfs_acl);
313 return -error;
314 }
315 if (!xfs_acl->acl_cnt) {
316 _ACL_FREE(xfs_acl);
317 return 0;
318 }
319
320 VN_HOLD(vp);
321 error = xfs_acl_allow_set(vp, kind);
322 if (error)
323 goto out;
324
325 /* Incoming ACL exists, set file mode based on its value */
326 if (kind == _ACL_TYPE_ACCESS)
327 xfs_acl_setmode(vp, xfs_acl, &basicperms);
328
329 /*
330 * If we have more than std unix permissions, set up the actual attr.
331 * Otherwise, delete any existing attr. This prevents us from
332 * having actual attrs for permissions that can be stored in the
333 * standard permission bits.
334 */
335 if (!basicperms) {
336 xfs_acl_set_attr(vp, xfs_acl, kind, &error);
337 } else {
338 xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
339 }
340
341out:
342 VN_RELE(vp);
343 _ACL_FREE(xfs_acl);
344 return -error;
345}
346
347int
348xfs_acl_iaccess(
349 xfs_inode_t *ip,
350 mode_t mode,
351 cred_t *cr)
352{
353 xfs_acl_t *acl;
354 int rval;
355
356 if (!(_ACL_ALLOC(acl)))
357 return -1;
358
359 /* If the file has no ACL return -1. */
360 rval = sizeof(xfs_acl_t);
361 if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE,
362 (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) {
363 _ACL_FREE(acl);
364 return -1;
365 }
366 xfs_acl_get_endian(acl);
367
368 /* If the file has an empty ACL return -1. */
369 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
370 _ACL_FREE(acl);
371 return -1;
372 }
373
374 /* Synchronize ACL with mode bits */
375 xfs_acl_sync_mode(ip->i_d.di_mode, acl);
376
377 rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
378 _ACL_FREE(acl);
379 return rval;
380}
381
382STATIC int
383xfs_acl_allow_set(
384 vnode_t *vp,
385 int kind)
386{
387 vattr_t va;
388 int error;
389
390 if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
391 return EPERM;
392 if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
393 return ENOTDIR;
394 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
395 return EROFS;
396 va.va_mask = XFS_AT_UID;
397 VOP_GETATTR(vp, &va, 0, NULL, error);
398 if (error)
399 return error;
400 if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
401 return EPERM;
402 return error;
403}
404
405/*
406 * The access control process to determine the access permission:
407 * if uid == file owner id, use the file owner bits.
408 * if gid == file owner group id, use the file group bits.
409 * otherwise, scan the ACL for a matching user or group entry and use
410 * its permission, considering every matching group entry until the
411 * acl entries are exhausted. The final permission produced by the
412 * matching entry or entries is then ANDed with the mask permission.
413 * if not owner, owning group, or matching entry in the ACL, use the
414 * file other bits.
415 */
416STATIC int
417xfs_acl_capability_check(
418 mode_t mode,
419 cred_t *cr)
420{
421 if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
422 return EACCES;
423 if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
424 return EACCES;
425 if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
426 return EACCES;
427
428 return 0;
429}
430
431/*
432 * Note: cr is only used here for the capability check if the ACL test fails.
433 * It is not used to find out the credential's uid or groups etc., as was
434 * done in IRIX. It is assumed that the uid and groups for the current
435 * thread are taken from "current" instead of the cr parameter.
436 */
437STATIC int
438xfs_acl_access(
439 uid_t fuid,
440 gid_t fgid,
441 xfs_acl_t *fap,
442 mode_t md,
443 cred_t *cr)
444{
445 xfs_acl_entry_t matched;
446 int i, allows;
447 int maskallows = -1; /* true, but not 1, either */
448 int seen_userobj = 0;
449
450 matched.ae_tag = 0; /* Invalid type */
451 md >>= 6; /* Normalize the bits for comparison */
452
453 for (i = 0; i < fap->acl_cnt; i++) {
454 /*
455 * Break out if we've got a user_obj entry or
456 * a user entry and the mask (and have processed USER_OBJ)
457 */
458 if (matched.ae_tag == ACL_USER_OBJ)
459 break;
460 if (matched.ae_tag == ACL_USER) {
461 if (maskallows != -1 && seen_userobj)
462 break;
463 if (fap->acl_entry[i].ae_tag != ACL_MASK &&
464 fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
465 continue;
466 }
467 /* True if this entry allows the requested access */
468 allows = ((fap->acl_entry[i].ae_perm & md) == md);
469
470 switch (fap->acl_entry[i].ae_tag) {
471 case ACL_USER_OBJ:
472 seen_userobj = 1;
473 if (fuid != current->fsuid)
474 continue;
475 matched.ae_tag = ACL_USER_OBJ;
476 matched.ae_perm = allows;
477 break;
478 case ACL_USER:
479 if (fap->acl_entry[i].ae_id != current->fsuid)
480 continue;
481 matched.ae_tag = ACL_USER;
482 matched.ae_perm = allows;
483 break;
484 case ACL_GROUP_OBJ:
485 if ((matched.ae_tag == ACL_GROUP_OBJ ||
486 matched.ae_tag == ACL_GROUP) && !allows)
487 continue;
488 if (!in_group_p(fgid))
489 continue;
490 matched.ae_tag = ACL_GROUP_OBJ;
491 matched.ae_perm = allows;
492 break;
493 case ACL_GROUP:
494 if ((matched.ae_tag == ACL_GROUP_OBJ ||
495 matched.ae_tag == ACL_GROUP) && !allows)
496 continue;
497 if (!in_group_p(fap->acl_entry[i].ae_id))
498 continue;
499 matched.ae_tag = ACL_GROUP;
500 matched.ae_perm = allows;
501 break;
502 case ACL_MASK:
503 maskallows = allows;
504 break;
505 case ACL_OTHER:
506 if (matched.ae_tag != 0)
507 continue;
508 matched.ae_tag = ACL_OTHER;
509 matched.ae_perm = allows;
510 break;
511 }
512 }
513 /*
514 * First possibility is that no matched entry allows access.
515 * The capability to override DAC may exist, so check for it.
516 */
517 switch (matched.ae_tag) {
518 case ACL_OTHER:
519 case ACL_USER_OBJ:
520 if (matched.ae_perm)
521 return 0;
522 break;
523 case ACL_USER:
524 case ACL_GROUP_OBJ:
525 case ACL_GROUP:
526 if (maskallows && matched.ae_perm)
527 return 0;
528 break;
529 case 0:
530 break;
531 }
532
533 return xfs_acl_capability_check(md, cr);
534}
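/*
 * [Editor's note] A worked example of the matching loop above, with
 * hypothetical identities.  File ACL (entries shown in tag order):
 *
 *	u::rw-  u:500:r--  g::rw-  m::r--  o::---
 *
 * A process with fsuid 500 (not the file owner) requesting read:
 * ACL_USER_OBJ is seen but does not match; the ACL_USER entry for
 * uid 500 matches and allows read; the scan continues until ACL_MASK
 * sets maskallows; then (maskallows && matched.ae_perm) grants access.
 * The same process requesting write fails the ACL_USER entry's
 * permission check, so the only remaining avenue is
 * xfs_acl_capability_check().
 */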
535
536/*
537 * ACL validity checker.
538 * This acl validation routine checks that each ACL entry read in makes sense.
539 */
540STATIC int
541xfs_acl_invalid(
542 xfs_acl_t *aclp)
543{
544 xfs_acl_entry_t *entry, *e;
545 int user = 0, group = 0, other = 0, mask = 0;
546 int mask_required = 0;
547 int i, j;
548
549 if (!aclp)
550 goto acl_invalid;
551
552 if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
553 goto acl_invalid;
554
555 for (i = 0; i < aclp->acl_cnt; i++) {
556 entry = &aclp->acl_entry[i];
557 switch (entry->ae_tag) {
558 case ACL_USER_OBJ:
559 if (user++)
560 goto acl_invalid;
561 break;
562 case ACL_GROUP_OBJ:
563 if (group++)
564 goto acl_invalid;
565 break;
566 case ACL_OTHER:
567 if (other++)
568 goto acl_invalid;
569 break;
570 case ACL_USER:
571 case ACL_GROUP:
572 for (j = i + 1; j < aclp->acl_cnt; j++) {
573 e = &aclp->acl_entry[j];
574 if (e->ae_id == entry->ae_id &&
575 e->ae_tag == entry->ae_tag)
576 goto acl_invalid;
577 }
578 mask_required++;
579 break;
580 case ACL_MASK:
581 if (mask++)
582 goto acl_invalid;
583 break;
584 default:
585 goto acl_invalid;
586 }
587 }
588 if (!user || !group || !other || (mask_required && !mask))
589 goto acl_invalid;
590 else
591 return 0;
592acl_invalid:
593 return EINVAL;
594}
595
596/*
597 * Do ACL endian conversion.
598 */
599STATIC void
600xfs_acl_get_endian(
601 xfs_acl_t *aclp)
602{
603 xfs_acl_entry_t *ace, *end;
604
605 INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
606 end = &aclp->acl_entry[0]+aclp->acl_cnt;
607 for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
608 INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
609 INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
610 INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
611 }
612}
613
614/*
615 * Get the ACL from the EA and do endian conversion.
616 */
617STATIC void
618xfs_acl_get_attr(
619 vnode_t *vp,
620 xfs_acl_t *aclp,
621 int kind,
622 int flags,
623 int *error)
624{
625 int len = sizeof(xfs_acl_t);
626
627 ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
628 flags |= ATTR_ROOT;
629 VOP_ATTR_GET(vp,
630 kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
631 (char *)aclp, &len, flags, sys_cred, *error);
632 if (*error || (flags & ATTR_KERNOVAL))
633 return;
634 xfs_acl_get_endian(aclp);
635}
636
637/*
638 * Set the EA with the ACL and do endian conversion.
639 */
640STATIC void
641xfs_acl_set_attr(
642 vnode_t *vp,
643 xfs_acl_t *aclp,
644 int kind,
645 int *error)
646{
647 xfs_acl_entry_t *ace, *newace, *end;
648 xfs_acl_t *newacl;
649 int len;
650
651 if (!(_ACL_ALLOC(newacl))) {
652 *error = ENOMEM;
653 return;
654 }
655
656 len = sizeof(xfs_acl_t) -
657 (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
658 end = &aclp->acl_entry[0]+aclp->acl_cnt;
659 for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
660 ace < end;
661 ace++, newace++) {
662 INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
663 INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
664 INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
665 }
666 INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
667 VOP_ATTR_SET(vp,
668 kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
669 (char *)newacl, len, ATTR_ROOT, sys_cred, *error);
670 _ACL_FREE(newacl);
671}
672
673int
674xfs_acl_vtoacl(
675 vnode_t *vp,
676 xfs_acl_t *access_acl,
677 xfs_acl_t *default_acl)
678{
679 vattr_t va;
680 int error = 0;
681
682 if (access_acl) {
683 /*
684 * Get the Access ACL and the mode. If either cannot
685 * be obtained for some reason, invalidate the access ACL.
686 */
687 xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
688 if (!error) {
689 /* Got the ACL, need the mode... */
690 va.va_mask = XFS_AT_MODE;
691 VOP_GETATTR(vp, &va, 0, sys_cred, error);
692 }
693
694 if (error)
695 access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
696 else /* We have a good ACL and the file mode, synchronize. */
697 xfs_acl_sync_mode(va.va_mode, access_acl);
698 }
699
700 if (default_acl) {
701 xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
702 if (error)
703 default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
704 }
705 return error;
706}
707
708/*
709 * This function retrieves the parent directory's acl, processes it
710 * and lets the child inherit the acl(s) that it should.
711 */
712int
713xfs_acl_inherit(
714 vnode_t *vp,
715 vattr_t *vap,
716 xfs_acl_t *pdaclp)
717{
718 xfs_acl_t *cacl;
719 int error = 0;
720 int basicperms = 0;
721
722 /*
723 * If the parent does not have a default ACL, or it's an
724 * invalid ACL, we're done.
725 */
726 if (!vp)
727 return 0;
728 if (!pdaclp || xfs_acl_invalid(pdaclp))
729 return 0;
730
731 /*
732 * Copy the default ACL of the containing directory to
733 * the access ACL of the new file and use the mode that
734 * was passed in to set up the correct initial values for
735 * the u::,g::[m::], and o:: entries. This is what makes
736 * umask() "work" with ACLs.
737 */
738
739 if (!(_ACL_ALLOC(cacl)))
740 return ENOMEM;
741
742 memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
743 xfs_acl_filter_mode(vap->va_mode, cacl);
744 xfs_acl_setmode(vp, cacl, &basicperms);
745
746 /*
747 * Set the Default and Access ACL on the file. The mode is already
748 * set on the file, so we don't need to worry about that.
749 *
750 * If the new file is a directory, its default ACL is a copy of
751 * the containing directory's default ACL.
752 */
753 if (vp->v_type == VDIR)
754 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
755 if (!error && !basicperms)
756 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
757 _ACL_FREE(cacl);
758 return error;
759}
760
761/*
762 * Set up the correct mode on the file based on the supplied ACL. This
763 * makes sure that the mode on the file reflects the state of the
764 * u::,g::[m::], and o:: entries in the ACL. Since the mode is where
765 * the ACL is going to get the permissions for these entries, we must
766 * synchronize the mode whenever we set the ACL on a file.
767 */
768STATIC int
769xfs_acl_setmode(
770 vnode_t *vp,
771 xfs_acl_t *acl,
772 int *basicperms)
773{
774 vattr_t va;
775 xfs_acl_entry_t *ap;
776 xfs_acl_entry_t *gap = NULL;
777 int i, error, nomask = 1;
778
779 *basicperms = 1;
780
781 if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
782 return 0;
783
784 /*
785 * Copy the u::, g::, o::, and m:: bits from the ACL into the
786 * mode. The m:: bits take precedence over the g:: bits.
787 */
788 va.va_mask = XFS_AT_MODE;
789 VOP_GETATTR(vp, &va, 0, sys_cred, error);
790 if (error)
791 return error;
792
793 va.va_mask = XFS_AT_MODE;
794 va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
795 ap = acl->acl_entry;
796 for (i = 0; i < acl->acl_cnt; ++i) {
797 switch (ap->ae_tag) {
798 case ACL_USER_OBJ:
799 va.va_mode |= ap->ae_perm << 6;
800 break;
801 case ACL_GROUP_OBJ:
802 gap = ap;
803 break;
804 case ACL_MASK: /* more than just standard modes */
805 nomask = 0;
806 va.va_mode |= ap->ae_perm << 3;
807 *basicperms = 0;
808 break;
809 case ACL_OTHER:
810 va.va_mode |= ap->ae_perm;
811 break;
812 default: /* more than just standard modes */
813 *basicperms = 0;
814 break;
815 }
816 ap++;
817 }
818
819 /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
820 if (gap && nomask)
821 va.va_mode |= gap->ae_perm << 3;
822
823 VOP_SETATTR(vp, &va, 0, sys_cred, error);
824 return error;
825}
826
827/*
828 * The permissions for the special ACL entries (u::, g::[m::], o::) are
829 * actually stored in the file mode (if there is both a group and a mask,
830 * the group is stored in the ACL entry and the mask is stored on the file).
831 * This allows the mode to remain automatically in sync with the ACL without
832 * the need for a call-back to the ACL system at every point where the mode
833 * could change. This function takes the permissions from the specified mode
834 * and places it in the supplied ACL.
835 *
836 * This implementation draws its validity from the fact that, when the ACL
837 * was assigned, the mode was copied from the ACL.
838 * If the mode did not change, therefore, the mode remains exactly what was
839 * taken from the special ACL entries at assignment.
840 * If a subsequent chmod() was done, the POSIX spec says that the change in
841 * mode must cause an update to the ACL seen at user level and used for
842 * access checks. Before and after a mode change, therefore, the file mode
843 * most accurately reflects what the special ACL entries should permit/deny.
844 *
845 * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
846 * the existing mode bits will override whatever is in the
847 * ACL. Similarly, if there is a pre-existing ACL that was
848 * never in sync with its mode (owing to a bug in 6.5 and
849 * before), it will now magically (or mystically) be
850 * synchronized. This could cause slight astonishment, but
851 * it is better than inconsistent permissions.
852 *
853 * The supplied ACL is a template that may contain any combination
854 * of special entries. These are treated as place holders when we fill
855 * out the ACL. This routine does not add or remove special entries, it
856 * simply unites each special entry with its associated set of permissions.
857 */
858STATIC void
859xfs_acl_sync_mode(
860 mode_t mode,
861 xfs_acl_t *acl)
862{
863 int i, nomask = 1;
864 xfs_acl_entry_t *ap;
865 xfs_acl_entry_t *gap = NULL;
866
867 /*
868 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
869 * be set instead of the GROUP entry, if there is a MASK.
870 */
871 for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
872 switch (ap->ae_tag) {
873 case ACL_USER_OBJ:
874 ap->ae_perm = (mode >> 6) & 0x7;
875 break;
876 case ACL_GROUP_OBJ:
877 gap = ap;
878 break;
879 case ACL_MASK:
880 nomask = 0;
881 ap->ae_perm = (mode >> 3) & 0x7;
882 break;
883 case ACL_OTHER:
884 ap->ae_perm = mode & 0x7;
885 break;
886 default:
887 break;
888 }
889 }
890 /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
891 if (gap && nomask)
892 gap->ae_perm = (mode >> 3) & 0x7;
893}
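/*
 * [Editor's note] For example, syncing mode 0640 into an ACL that
 * holds u::, g::, m:: and o:: entries gives:
 *
 *	u:: ae_perm = (0640 >> 6) & 7 = 6	(rw-)
 *	m:: ae_perm = (0640 >> 3) & 7 = 4	(r--)
 *	o:: ae_perm =  0640       & 7 = 0	(---)
 *
 * and g:: is left untouched because a mask is present (nomask == 0).
 */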
894
895/*
896 * When inheriting an Access ACL from a directory Default ACL,
897 * the ACL bits are set to the intersection of the ACL default
898 * permission bits and the file permission bits in mode. Permissions
899 * absent from the file mode must not be granted by the inherited
900 * ACL either. This is what makes umask() work with ACLs.
901 */
902STATIC void
903xfs_acl_filter_mode(
904 mode_t mode,
905 xfs_acl_t *acl)
906{
907 int i, nomask = 1;
908 xfs_acl_entry_t *ap;
909 xfs_acl_entry_t *gap = NULL;
910
911 /*
912 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
913 * be merged with GROUP entry, if there is a MASK.
914 */
915 for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
916 switch (ap->ae_tag) {
917 case ACL_USER_OBJ:
918 ap->ae_perm &= (mode >> 6) & 0x7;
919 break;
920 case ACL_GROUP_OBJ:
921 gap = ap;
922 break;
923 case ACL_MASK:
924 nomask = 0;
925 ap->ae_perm &= (mode >> 3) & 0x7;
926 break;
927 case ACL_OTHER:
928 ap->ae_perm &= mode & 0x7;
929 break;
930 default:
931 break;
932 }
933 }
934 /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
935 if (gap && nomask)
936 gap->ae_perm &= (mode >> 3) & 0x7;
937}
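/*
 * [Editor's note] For example, inheriting a default ACL of
 * u::rwx,g::rwx,o::rwx (no mask) into a file created with mode 0644
 * intersects down to:
 *
 *	u:: 7 & 6 = 6	(rw-)
 *	g:: 7 & 4 = 4	(r--, applied via the ACL_GROUP_OBJ entry,
 *			 since there is no mask)
 *	o:: 7 & 4 = 4	(r--)
 */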
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
new file mode 100644
index 000000000000..0363eb46d357
--- /dev/null
+++ b/fs/xfs/xfs_acl.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2001-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ACL_H__
33#define __XFS_ACL_H__
34
35/*
36 * Access Control Lists
37 */
38typedef __uint16_t xfs_acl_perm_t;
39typedef __int32_t xfs_acl_type_t;
40typedef __int32_t xfs_acl_tag_t;
41typedef __int32_t xfs_acl_id_t;
42
43#define XFS_ACL_MAX_ENTRIES 25
44#define XFS_ACL_NOT_PRESENT (-1)
45
46typedef struct xfs_acl_entry {
47 xfs_acl_tag_t ae_tag;
48 xfs_acl_id_t ae_id;
49 xfs_acl_perm_t ae_perm;
50} xfs_acl_entry_t;
51
52typedef struct xfs_acl {
53 __int32_t acl_cnt;
54 xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
55} xfs_acl_t;
56
57/* On-disk XFS extended attribute names */
58#define SGI_ACL_FILE "SGI_ACL_FILE"
59#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
60#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
61#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
62
63
64#ifdef CONFIG_XFS_POSIX_ACL
65
66struct vattr;
67struct vnode;
68struct xfs_inode;
69
70extern struct kmem_zone *xfs_acl_zone;
71#define xfs_acl_zone_init(zone, name) \
72 (zone) = kmem_zone_init(sizeof(xfs_acl_t), name)
73#define xfs_acl_zone_destroy(zone) kmem_cache_destroy(zone)
74
75extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
76extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
77extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
78extern int xfs_acl_vhasacl_access(struct vnode *);
79extern int xfs_acl_vhasacl_default(struct vnode *);
80extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
81extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
82extern int xfs_acl_vremove(struct vnode *vp, int);
83
84#define _ACL_TYPE_ACCESS 1
85#define _ACL_TYPE_DEFAULT 2
86#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
87
88#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d))
89#define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0)
90#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
91#define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access
92#define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default
93#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
94
95#define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
96#define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
97
98#else
99#define xfs_acl_zone_init(zone,name)
100#define xfs_acl_zone_destroy(zone)
101#define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP)
102#define xfs_acl_vget(v,p,sz,t) (-EOPNOTSUPP)
103#define xfs_acl_vremove(v,t) (-EOPNOTSUPP)
104#define xfs_acl_vhasacl_access(v) (0)
105#define xfs_acl_vhasacl_default(v) (0)
106#define _ACL_ALLOC(a) (1) /* successfully allocate nothing */
107#define _ACL_FREE(a) ((void)0)
108#define _ACL_INHERIT(c,v,d) (0)
109#define _ACL_GET_ACCESS(pv,pa) (0)
110#define _ACL_GET_DEFAULT(pv,pd) (0)
111#define _ACL_ACCESS_EXISTS (NULL)
112#define _ACL_DEFAULT_EXISTS (NULL)
113#define _ACL_XFS_IACCESS(i,m,c) (-1)
114#endif
115
116#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
new file mode 100644
index 000000000000..96b70f7fba39
--- /dev/null
+++ b/fs/xfs/xfs_ag.h
@@ -0,0 +1,345 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_AG_H__
33#define __XFS_AG_H__
34
35/*
36 * Allocation group header
37 * This is divided into three structures, placed in sequential 512-byte
38 * buffers after a copy of the superblock (also in a 512-byte buffer).
39 */
40
41struct xfs_buf;
42struct xfs_mount;
43struct xfs_trans;
44
45#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */
46#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */
47#define XFS_AGF_VERSION 1
48#define XFS_AGI_VERSION 1
49#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION)
50int xfs_agf_good_version(unsigned v);
51#define XFS_AGF_GOOD_VERSION(v) xfs_agf_good_version(v)
52#else
53#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION)
54#endif
55#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION)
56int xfs_agi_good_version(unsigned v);
57#define XFS_AGI_GOOD_VERSION(v) xfs_agi_good_version(v)
58#else
59#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION)
60#endif
61
62/*
63 * Btree number 0 is bno, 1 is cnt. This value gives the size of the
64 * arrays below.
65 */
66#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1)
67
68/*
69 * The second word of agf_levels in the first a.g. overlaps the EFS
70 * superblock's magic number. Since the magic numbers valid for EFS
71 * are > 64k, our value cannot be mistaken for an EFS superblock's.
72 */
73
74typedef struct xfs_agf
75{
76 /*
77 * Common allocation group header information
78 */
79 __uint32_t agf_magicnum; /* magic number == XFS_AGF_MAGIC */
80 __uint32_t agf_versionnum; /* header version == XFS_AGF_VERSION */
81 xfs_agnumber_t agf_seqno; /* sequence # starting from 0 */
82 xfs_agblock_t agf_length; /* size in blocks of a.g. */
83 /*
84 * Freespace information
85 */
86 xfs_agblock_t agf_roots[XFS_BTNUM_AGF]; /* root blocks */
87 __uint32_t agf_spare0; /* spare field */
88 __uint32_t agf_levels[XFS_BTNUM_AGF]; /* btree levels */
89 __uint32_t agf_spare1; /* spare field */
90 __uint32_t agf_flfirst; /* first freelist block's index */
91 __uint32_t agf_fllast; /* last freelist block's index */
92 __uint32_t agf_flcount; /* count of blocks in freelist */
93 xfs_extlen_t agf_freeblks; /* total free blocks */
94 xfs_extlen_t agf_longest; /* longest free space */
95} xfs_agf_t;
96
97#define XFS_AGF_MAGICNUM 0x00000001
98#define XFS_AGF_VERSIONNUM 0x00000002
99#define XFS_AGF_SEQNO 0x00000004
100#define XFS_AGF_LENGTH 0x00000008
101#define XFS_AGF_ROOTS 0x00000010
102#define XFS_AGF_LEVELS 0x00000020
103#define XFS_AGF_FLFIRST 0x00000040
104#define XFS_AGF_FLLAST 0x00000080
105#define XFS_AGF_FLCOUNT 0x00000100
106#define XFS_AGF_FREEBLKS 0x00000200
107#define XFS_AGF_LONGEST 0x00000400
108#define XFS_AGF_NUM_BITS 11
109#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
110
111/* disk block (xfs_daddr_t) in the AG */
112#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK)
114xfs_agblock_t xfs_agf_block(struct xfs_mount *mp);
115#define XFS_AGF_BLOCK(mp) xfs_agf_block(mp)
116#else
117#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
118#endif
119
120/*
121 * Size of the unlinked inode hash table in the agi.
122 */
123#define XFS_AGI_UNLINKED_BUCKETS 64
124
125typedef struct xfs_agi
126{
127 /*
128 * Common allocation group header information
129 */
130 __uint32_t agi_magicnum; /* magic number == XFS_AGI_MAGIC */
131 __uint32_t agi_versionnum; /* header version == XFS_AGI_VERSION */
132 xfs_agnumber_t agi_seqno; /* sequence # starting from 0 */
133 xfs_agblock_t agi_length; /* size in blocks of a.g. */
134 /*
135 * Inode information
136 * Inodes are mapped by interpreting the inode number, so no
137 * mapping data is needed here.
138 */
139 xfs_agino_t agi_count; /* count of allocated inodes */
140 xfs_agblock_t agi_root; /* root of inode btree */
141 __uint32_t agi_level; /* levels in inode btree */
142 xfs_agino_t agi_freecount; /* number of free inodes */
143 xfs_agino_t agi_newino; /* new inode just allocated */
144 xfs_agino_t agi_dirino; /* last directory inode chunk */
145 /*
146 * Hash table of inodes which have been unlinked but are
147 * still being referenced.
148 */
149 xfs_agino_t agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
150} xfs_agi_t;
151
152#define XFS_AGI_MAGICNUM 0x00000001
153#define XFS_AGI_VERSIONNUM 0x00000002
154#define XFS_AGI_SEQNO 0x00000004
155#define XFS_AGI_LENGTH 0x00000008
156#define XFS_AGI_COUNT 0x00000010
157#define XFS_AGI_ROOT 0x00000020
158#define XFS_AGI_LEVEL 0x00000040
159#define XFS_AGI_FREECOUNT 0x00000080
160#define XFS_AGI_NEWINO 0x00000100
161#define XFS_AGI_DIRINO 0x00000200
162#define XFS_AGI_UNLINKED 0x00000400
163#define XFS_AGI_NUM_BITS 11
164#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
165
166/* disk block (xfs_daddr_t) in the AG */
167#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
168#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK)
169xfs_agblock_t xfs_agi_block(struct xfs_mount *mp);
170#define XFS_AGI_BLOCK(mp) xfs_agi_block(mp)
171#else
172#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
173#endif
174
175/*
176 * The third a.g. block contains the a.g. freelist, an array
177 * of block pointers to blocks owned by the allocation btree code.
178 */
179#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log))
180#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK)
181xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp);
182#define XFS_AGFL_BLOCK(mp) xfs_agfl_block(mp)
183#else
184#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp))
185#endif
186#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t))
187
188typedef struct xfs_agfl {
189 xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */
190} xfs_agfl_t;
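/*
 * [Editor's note] With 512-byte sectors (m_sectbb_log == 0), the four
 * headers described above occupy consecutive disk blocks at the start
 * of each allocation group:
 *
 *	daddr 0: superblock (XFS_SB_DADDR)
 *	daddr 1: AGF  (XFS_AGF_DADDR)
 *	daddr 2: AGI  (XFS_AGI_DADDR)
 *	daddr 3: AGFL (XFS_AGFL_DADDR)
 */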
191
192/*
193 * Busy block/extent entry. Used in perag to mark blocks that have been freed
194 * but whose transactions aren't committed to disk yet.
195 */
196typedef struct xfs_perag_busy {
197 xfs_agblock_t busy_start;
198 xfs_extlen_t busy_length;
199 struct xfs_trans *busy_tp; /* transaction that did the free */
200} xfs_perag_busy_t;
201
202/*
203 * Per-ag incore structure, copies of information in agf and agi,
204 * to improve the performance of allocation group selection.
205 *
206 * pick sizes which fit in allocation buckets well
207 */
208#if (BITS_PER_LONG == 32)
209#define XFS_PAGB_NUM_SLOTS 84
210#elif (BITS_PER_LONG == 64)
211#define XFS_PAGB_NUM_SLOTS 128
212#endif
213
214typedef struct xfs_perag
215{
216 char pagf_init; /* this agf's entry is initialized */
217 char pagi_init; /* this agi's entry is initialized */
218 char pagf_metadata; /* the agf is preferred to be metadata */
219 char pagi_inodeok; /* The agi is ok for inodes */
220 __uint8_t pagf_levels[XFS_BTNUM_AGF];
221 /* # of levels in bno & cnt btree */
222 __uint32_t pagf_flcount; /* count of blocks in freelist */
223 xfs_extlen_t pagf_freeblks; /* total free blocks */
224 xfs_extlen_t pagf_longest; /* longest free space */
225 xfs_agino_t pagi_freecount; /* number of free inodes */
226#ifdef __KERNEL__
227 lock_t pagb_lock; /* lock for pagb_list */
228#endif
229 int pagb_count; /* pagb slots in use */
230 xfs_perag_busy_t *pagb_list; /* unstable blocks */
231} xfs_perag_t;
232
233#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS)
234int xfs_ag_maxlevels(struct xfs_mount *mp);
235#define XFS_AG_MAXLEVELS(mp) xfs_ag_maxlevels(mp)
236#else
237#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
238#endif
239#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST)
240int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp);
241#define XFS_MIN_FREELIST(a,mp) xfs_min_freelist(a,mp)
242#else
243#define XFS_MIN_FREELIST(a,mp) \
244 XFS_MIN_FREELIST_RAW( \
245 INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \
246 INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp)
247#endif
248#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG)
249int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp);
250#define XFS_MIN_FREELIST_PAG(pag,mp) xfs_min_freelist_pag(pag,mp)
251#else
252#define XFS_MIN_FREELIST_PAG(pag,mp) \
253 XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
254 (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)
255#endif
256#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW)
257int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp);
258#define XFS_MIN_FREELIST_RAW(bl,cl,mp) xfs_min_freelist_raw(bl,cl,mp)
259#else
260#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
261 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + \
262 MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
263#endif
264
265#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB)
266xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno,
267 xfs_agblock_t agbno);
268#define XFS_AGB_TO_FSB(mp,agno,agbno) xfs_agb_to_fsb(mp,agno,agbno)
269#else
270#define XFS_AGB_TO_FSB(mp,agno,agbno) \
271 (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO)
274xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
275#define XFS_FSB_TO_AGNO(mp,fsbno) xfs_fsb_to_agno(mp,fsbno)
276#else
277#define XFS_FSB_TO_AGNO(mp,fsbno) \
278 ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
279#endif
280#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO)
281xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
282#define XFS_FSB_TO_AGBNO(mp,fsbno) xfs_fsb_to_agbno(mp,fsbno)
283#else
284#define XFS_FSB_TO_AGBNO(mp,fsbno) \
285 ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
286#endif
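/*
 * [Editor's note] A worked example of the packing above: with
 * sb_agblklog == 16 (a hypothetical value), a filesystem block number
 * is (agno << 16) | agbno, so:
 *
 *	XFS_AGB_TO_FSB(mp, 3, 0x0042)    == 0x00030042
 *	XFS_FSB_TO_AGNO(mp, 0x00030042)  == 3
 *	XFS_FSB_TO_AGBNO(mp, 0x00030042) == 0x0042
 */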
287
288#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR)
289xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
290 xfs_agblock_t agbno);
291#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno)
292#else
293#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
294 ((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \
295 (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))))
296#endif
297/*
298 * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h
299 * to avoid header file ordering change
300 */
301
302#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR)
303xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
304 xfs_daddr_t d);
305#define XFS_AG_DADDR(mp,agno,d) xfs_ag_daddr(mp,agno,d)
306#else
307#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
308#endif
309
310#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF)
311xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp);
312#define XFS_BUF_TO_AGF(bp) xfs_buf_to_agf(bp)
313#else
314#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
315#endif
316#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI)
317xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp);
318#define XFS_BUF_TO_AGI(bp) xfs_buf_to_agi(bp)
319#else
320#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
321#endif
322#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL)
323xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp);
324#define XFS_BUF_TO_AGFL(bp) xfs_buf_to_agfl(bp)
325#else
326#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp))
327#endif
328
329/*
330 * For checking for bad ranges of xfs_daddr_t's, covering multiple
331 * allocation groups or a single xfs_daddr_t that's a superblock copy.
332 */
333#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR)
334void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len);
335#define XFS_AG_CHECK_DADDR(mp,d,len) xfs_ag_check_daddr(mp,d,len)
336#else
337#define XFS_AG_CHECK_DADDR(mp,d,len) \
338 ((len) == 1 ? \
339 ASSERT((d) == XFS_SB_DADDR || \
340 XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
341 ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
342 XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
343#endif
344
345#endif /* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
new file mode 100644
index 000000000000..36603db10fe9
--- /dev/null
+++ b/fs/xfs/xfs_alloc.c
@@ -0,0 +1,2623 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free space allocation for XFS.
35 */
36#include "xfs.h"
37#include "xfs_macros.h"
38#include "xfs_types.h"
39#include "xfs_inum.h"
40#include "xfs_log.h"
41#include "xfs_trans.h"
42#include "xfs_sb.h"
43#include "xfs_ag.h"
44#include "xfs_dir.h"
45#include "xfs_dmapi.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_alloc.h"
53#include "xfs_bit.h"
54#include "xfs_error.h"
55
56
57#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
58
59#define XFSA_FIXUP_BNO_OK 1
60#define XFSA_FIXUP_CNT_OK 2
61
62int
63xfs_alloc_search_busy(xfs_trans_t *tp,
64 xfs_agnumber_t agno,
65 xfs_agblock_t bno,
66 xfs_extlen_t len);
67
68#if defined(XFS_ALLOC_TRACE)
69ktrace_t *xfs_alloc_trace_buf;
70
71#define TRACE_ALLOC(s,a) \
72 xfs_alloc_trace_alloc(fname, s, a, __LINE__)
73#define TRACE_FREE(s,a,b,x,f) \
74 xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
75#define TRACE_MODAGF(s,a,f) \
76 xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
77#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \
78 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
79#define TRACE_UNBUSY(fname,s,ag,sl,tp) \
80 xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
81#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \
82 xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
83#else
84#define TRACE_ALLOC(s,a)
85#define TRACE_FREE(s,a,b,x,f)
86#define TRACE_MODAGF(s,a,f)
87#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
88#define TRACE_UNBUSY(fname,s,ag,sl,tp)
89#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
90#endif /* XFS_ALLOC_TRACE */
91
92/*
93 * Prototypes for per-ag allocation routines
94 */
95
96STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
97STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
98STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
99STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
100 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
101
102/*
103 * Internal functions.
104 */
105
106/*
107 * Compute aligned version of the found extent.
108 * Takes alignment and min length into account.
109 */
110STATIC int /* success (>= minlen) */
111xfs_alloc_compute_aligned(
112 xfs_agblock_t foundbno, /* starting block in found extent */
113 xfs_extlen_t foundlen, /* length in found extent */
114 xfs_extlen_t alignment, /* alignment for allocation */
115 xfs_extlen_t minlen, /* minimum length for allocation */
116 xfs_agblock_t *resbno, /* result block number */
117 xfs_extlen_t *reslen) /* result length */
118{
119 xfs_agblock_t bno;
120 xfs_extlen_t diff;
121 xfs_extlen_t len;
122
123 if (alignment > 1 && foundlen >= minlen) {
124 bno = roundup(foundbno, alignment);
125 diff = bno - foundbno;
126 len = diff >= foundlen ? 0 : foundlen - diff;
127 } else {
128 bno = foundbno;
129 len = foundlen;
130 }
131 *resbno = bno;
132 *reslen = len;
133 return len >= minlen;
134}
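/*
 * Worked example (values invented for illustration): with
 * foundbno = 37, foundlen = 20, alignment = 8, minlen = 10:
 *	bno  = roundup(37, 8) = 40
 *	diff = 40 - 37        = 3
 *	len  = 20 - 3         = 17
 * so *resbno = 40, *reslen = 17, and the return value is 1 (17 >= 10).
 */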
135
136/*
137 * Compute best start block and diff for "near" allocations.
138 * freelen >= wantlen already checked by caller.
139 */
140STATIC xfs_extlen_t /* difference value (absolute) */
141xfs_alloc_compute_diff(
142 xfs_agblock_t wantbno, /* target starting block */
143 xfs_extlen_t wantlen, /* target length */
144 xfs_extlen_t alignment, /* target alignment */
145 xfs_agblock_t freebno, /* freespace's starting block */
146 xfs_extlen_t freelen, /* freespace's length */
147 xfs_agblock_t *newbnop) /* result: best start block from free */
148{
149 xfs_agblock_t freeend; /* end of freespace extent */
150 xfs_agblock_t newbno1; /* return block number */
151 xfs_agblock_t newbno2; /* other new block number */
152 xfs_extlen_t newlen1=0; /* length with newbno1 */
153 xfs_extlen_t newlen2=0; /* length with newbno2 */
154 xfs_agblock_t wantend; /* end of target extent */
155
156 ASSERT(freelen >= wantlen);
157 freeend = freebno + freelen;
158 wantend = wantbno + wantlen;
159 if (freebno >= wantbno) {
160 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
161 newbno1 = NULLAGBLOCK;
162 } else if (freeend >= wantend && alignment > 1) {
163 newbno1 = roundup(wantbno, alignment);
164 newbno2 = newbno1 - alignment;
165 if (newbno1 >= freeend)
166 newbno1 = NULLAGBLOCK;
167 else
168 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
169 if (newbno2 < freebno)
170 newbno2 = NULLAGBLOCK;
171 else
172 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
173 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
174 if (newlen1 < newlen2 ||
175 (newlen1 == newlen2 &&
176 XFS_ABSDIFF(newbno1, wantbno) >
177 XFS_ABSDIFF(newbno2, wantbno)))
178 newbno1 = newbno2;
179 } else if (newbno2 != NULLAGBLOCK)
180 newbno1 = newbno2;
181 } else if (freeend >= wantend) {
182 newbno1 = wantbno;
183 } else if (alignment > 1) {
184 newbno1 = roundup(freeend - wantlen, alignment);
185 if (newbno1 > freeend - wantlen &&
186 newbno1 - alignment >= freebno)
187 newbno1 -= alignment;
188 else if (newbno1 >= freeend)
189 newbno1 = NULLAGBLOCK;
190 } else
191 newbno1 = freeend - wantlen;
192 *newbnop = newbno1;
193 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
194}
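/*
 * Worked example (values invented for illustration): wantbno = 100,
 * wantlen = 16, alignment = 4, freebno = 90, freelen = 40, giving
 * freeend = 130 and wantend = 116.  Since freebno < wantbno,
 * freeend >= wantend and alignment > 1:
 *	newbno1 = roundup(100, 4) = 100, newlen1 = min(16, 30) = 16
 *	newbno2 = 100 - 4         = 96,  newlen2 = min(16, 34) = 16
 * The lengths tie and XFS_ABSDIFF(100, 100) <= XFS_ABSDIFF(96, 100),
 * so *newbnop = 100 and the returned diff is 0, a perfect fit.
 */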
195
196/*
197 * Fix up the length, based on mod and prod.
198 * len should be k * prod + mod for some k.
199 * If len is too small it is returned unchanged.
200 * If len hits maxlen it is left alone.
201 */
202STATIC void
203xfs_alloc_fix_len(
204 xfs_alloc_arg_t *args) /* allocation argument structure */
205{
206 xfs_extlen_t k;
207 xfs_extlen_t rlen;
208
209 ASSERT(args->mod < args->prod);
210 rlen = args->len;
211 ASSERT(rlen >= args->minlen);
212 ASSERT(rlen <= args->maxlen);
213 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
214 (args->mod == 0 && rlen < args->prod))
215 return;
216 k = rlen % args->prod;
217 if (k == args->mod)
218 return;
219 if (k > args->mod) {
220 if ((int)(rlen = rlen - k + args->mod) < (int)args->minlen)
221 return;
222 } else {
223 if ((int)(rlen = rlen - args->prod + (args->mod - k)) <
224 (int)args->minlen)
225 return;
226 }
227 ASSERT(rlen >= args->minlen);
228 ASSERT(rlen <= args->maxlen);
229 args->len = rlen;
230}
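/*
 * Worked example (values invented for illustration): prod = 4,
 * mod = 1, minlen = 5, maxlen = 20, len = 14.  Then k = 14 % 4 = 2,
 * and since k > mod, rlen becomes 14 - 2 + 1 = 13, which still
 * satisfies minlen and now has the required form: 13 = 3 * 4 + 1.
 */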
231
232/*
233 * Fix up length if there is too little space left in the a.g.
234 * Return 1 if ok, 0 if too little, should give up.
235 */
236STATIC int
237xfs_alloc_fix_minleft(
238 xfs_alloc_arg_t *args) /* allocation argument structure */
239{
240 xfs_agf_t *agf; /* a.g. freelist header */
241 int diff; /* free space difference */
242
243 if (args->minleft == 0)
244 return 1;
245 agf = XFS_BUF_TO_AGF(args->agbp);
246 diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT)
247 + INT_GET(agf->agf_flcount, ARCH_CONVERT)
248 - args->len - args->minleft;
249 if (diff >= 0)
250 return 1;
251 args->len += diff; /* shrink the allocated space */
252 if (args->len >= args->minlen)
253 return 1;
254 args->agbno = NULLAGBLOCK;
255 return 0;
256}
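/*
 * Worked example (values invented for illustration): with
 * agf_freeblks = 50, agf_flcount = 6, len = 40, minleft = 20,
 * diff = 50 + 6 - 40 - 20 = -4, so len shrinks to 36.  If minlen
 * were larger than 36 the allocation would be abandoned instead,
 * with agbno = NULLAGBLOCK and a 0 return.
 */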
257
258/*
259 * Update the two btrees, logically removing from freespace the extent
260 * starting at rbno, rlen blocks. The extent is contained within the
261 * actual (current) free extent fbno for flen blocks.
262 * Flags are passed in indicating whether the cursors are set to the
263 * relevant records.
264 */
265STATIC int /* error code */
266xfs_alloc_fixup_trees(
267 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
268 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
269 xfs_agblock_t fbno, /* starting block of free extent */
270 xfs_extlen_t flen, /* length of free extent */
271 xfs_agblock_t rbno, /* starting block of returned extent */
272 xfs_extlen_t rlen, /* length of returned extent */
273 int flags) /* flags, XFSA_FIXUP_... */
274{
275 int error; /* error code */
276 int i; /* operation results */
277 xfs_agblock_t nfbno1; /* first new free startblock */
278 xfs_agblock_t nfbno2; /* second new free startblock */
279 xfs_extlen_t nflen1=0; /* first new free length */
280 xfs_extlen_t nflen2=0; /* second new free length */
281
282 /*
283 * Look up the record in the by-size tree if necessary.
284 */
285 if (flags & XFSA_FIXUP_CNT_OK) {
286#ifdef DEBUG
287 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
288 return error;
289 XFS_WANT_CORRUPTED_RETURN(
290 i == 1 && nfbno1 == fbno && nflen1 == flen);
291#endif
292 } else {
293 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
294 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 }
297 /*
298 * Look up the record in the by-block tree if necessary.
299 */
300 if (flags & XFSA_FIXUP_BNO_OK) {
301#ifdef DEBUG
302 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
303 return error;
304 XFS_WANT_CORRUPTED_RETURN(
305 i == 1 && nfbno1 == fbno && nflen1 == flen);
306#endif
307 } else {
308 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
309 return error;
310 XFS_WANT_CORRUPTED_RETURN(i == 1);
311 }
312#ifdef DEBUG
313 {
314 xfs_alloc_block_t *bnoblock;
315 xfs_alloc_block_t *cntblock;
316
317 if (bno_cur->bc_nlevels == 1 &&
318 cnt_cur->bc_nlevels == 1) {
319 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
320 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
321 XFS_WANT_CORRUPTED_RETURN(
322 INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT));
323 }
324 }
325#endif
326 /*
327 * Deal with all four cases: the allocated record is contained
328 * within the freespace record, so we can have new freespace
329 * at either (or both) end, or no freespace remaining.
330 */
331 if (rbno == fbno && rlen == flen)
332 nfbno1 = nfbno2 = NULLAGBLOCK;
333 else if (rbno == fbno) {
334 nfbno1 = rbno + rlen;
335 nflen1 = flen - rlen;
336 nfbno2 = NULLAGBLOCK;
337 } else if (rbno + rlen == fbno + flen) {
338 nfbno1 = fbno;
339 nflen1 = flen - rlen;
340 nfbno2 = NULLAGBLOCK;
341 } else {
342 nfbno1 = fbno;
343 nflen1 = rbno - fbno;
344 nfbno2 = rbno + rlen;
345 nflen2 = (fbno + flen) - nfbno2;
346 }
347 /*
348 * Delete the entry from the by-size btree.
349 */
350 if ((error = xfs_alloc_delete(cnt_cur, &i)))
351 return error;
352 XFS_WANT_CORRUPTED_RETURN(i == 1);
353 /*
354 * Add new by-size btree entry(s).
355 */
356 if (nfbno1 != NULLAGBLOCK) {
357 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
358 return error;
359 XFS_WANT_CORRUPTED_RETURN(i == 0);
360 if ((error = xfs_alloc_insert(cnt_cur, &i)))
361 return error;
362 XFS_WANT_CORRUPTED_RETURN(i == 1);
363 }
364 if (nfbno2 != NULLAGBLOCK) {
365 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
366 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 0);
368 if ((error = xfs_alloc_insert(cnt_cur, &i)))
369 return error;
370 XFS_WANT_CORRUPTED_RETURN(i == 1);
371 }
372 /*
373 * Fix up the by-block btree entry(s).
374 */
375 if (nfbno1 == NULLAGBLOCK) {
376 /*
377 * No remaining freespace, just delete the by-block tree entry.
378 */
379 if ((error = xfs_alloc_delete(bno_cur, &i)))
380 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 1);
382 } else {
383 /*
384 * Update the by-block entry to start later or be shorter.
385 */
386 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
387 return error;
388 }
389 if (nfbno2 != NULLAGBLOCK) {
390 /*
391 * 2 resulting free entries, need to add one.
392 */
393 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
394 return error;
395 XFS_WANT_CORRUPTED_RETURN(i == 0);
396 if ((error = xfs_alloc_insert(bno_cur, &i)))
397 return error;
398 XFS_WANT_CORRUPTED_RETURN(i == 1);
399 }
400 return 0;
401}
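/*
 * Worked example of the four cases above (values invented for
 * illustration), with the free record fbno = 100, flen = 50:
 *	rbno = 100, rlen = 50: record consumed, no new freespace
 *	rbno = 100, rlen = 20: one record left, [120, 30]
 *	rbno = 130, rlen = 20: one record left, [100, 30]
 *	rbno = 110, rlen = 20: two records left, [100, 10] and [130, 20]
 */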
402
403/*
404 * Read in the allocation group free block array.
405 */
406STATIC int /* error */
407xfs_alloc_read_agfl(
408 xfs_mount_t *mp, /* mount point structure */
409 xfs_trans_t *tp, /* transaction pointer */
410 xfs_agnumber_t agno, /* allocation group number */
411 xfs_buf_t **bpp) /* buffer for the ag free block array */
412{
413 xfs_buf_t *bp; /* return value */
414 int error;
415
416 ASSERT(agno != NULLAGNUMBER);
417 error = xfs_trans_read_buf(
418 mp, tp, mp->m_ddev_targp,
419 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
420 XFS_FSS_TO_BB(mp, 1), 0, &bp);
421 if (error)
422 return error;
423 ASSERT(bp);
424 ASSERT(!XFS_BUF_GETERROR(bp));
425 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
426 *bpp = bp;
427 return 0;
428}
429
430#if defined(XFS_ALLOC_TRACE)
431/*
432 * Add an allocation trace entry for an alloc call.
433 */
434STATIC void
435xfs_alloc_trace_alloc(
436 char *name, /* function tag string */
437 char *str, /* additional string */
438 xfs_alloc_arg_t *args, /* allocation argument structure */
439 int line) /* source line number */
440{
441 ktrace_enter(xfs_alloc_trace_buf,
442 (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
443 (void *)name,
444 (void *)str,
445 (void *)args->mp,
446 (void *)(__psunsigned_t)args->agno,
447 (void *)(__psunsigned_t)args->agbno,
448 (void *)(__psunsigned_t)args->minlen,
449 (void *)(__psunsigned_t)args->maxlen,
450 (void *)(__psunsigned_t)args->mod,
451 (void *)(__psunsigned_t)args->prod,
452 (void *)(__psunsigned_t)args->minleft,
453 (void *)(__psunsigned_t)args->total,
454 (void *)(__psunsigned_t)args->alignment,
455 (void *)(__psunsigned_t)args->len,
456 (void *)((((__psint_t)args->type) << 16) |
457 (__psint_t)args->otype),
458 (void *)(__psint_t)((args->wasdel << 3) |
459 (args->wasfromfl << 2) |
460 (args->isfl << 1) |
461 (args->userdata << 0)));
462}
463
464/*
465 * Add an allocation trace entry for a free call.
466 */
467STATIC void
468xfs_alloc_trace_free(
469 char *name, /* function tag string */
470 char *str, /* additional string */
471 xfs_mount_t *mp, /* file system mount point */
472 xfs_agnumber_t agno, /* allocation group number */
473 xfs_agblock_t agbno, /* a.g. relative block number */
474 xfs_extlen_t len, /* length of extent */
475 int isfl, /* set if is freelist allocation/free */
476 int line) /* source line number */
477{
478 ktrace_enter(xfs_alloc_trace_buf,
479 (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
480 (void *)name,
481 (void *)str,
482 (void *)mp,
483 (void *)(__psunsigned_t)agno,
484 (void *)(__psunsigned_t)agbno,
485 (void *)(__psunsigned_t)len,
486 (void *)(__psint_t)isfl,
487 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
488}
489
490/*
491 * Add an allocation trace entry for modifying an agf.
492 */
493STATIC void
494xfs_alloc_trace_modagf(
495 char *name, /* function tag string */
496 char *str, /* additional string */
497 xfs_mount_t *mp, /* file system mount point */
498 xfs_agf_t *agf, /* new agf value */
499 int flags, /* logging flags for agf */
500 int line) /* source line number */
501{
502 ktrace_enter(xfs_alloc_trace_buf,
503 (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
504 (void *)name,
505 (void *)str,
506 (void *)mp,
507 (void *)(__psint_t)flags,
508 (void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT),
509 (void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT),
510 (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO],
511 ARCH_CONVERT),
512 (void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT],
513 ARCH_CONVERT),
514 (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO],
515 ARCH_CONVERT),
516 (void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT],
517 ARCH_CONVERT),
518 (void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT),
519 (void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT),
520 (void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT),
521 (void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT),
522 (void *)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT));
523}
524
525STATIC void
526xfs_alloc_trace_busy(
527 char *name, /* function tag string */
528 char *str, /* additional string */
529 xfs_mount_t *mp, /* file system mount point */
530 xfs_agnumber_t agno, /* allocation group number */
531 xfs_agblock_t agbno, /* a.g. relative block number */
532 xfs_extlen_t len, /* length of extent */
533 int slot, /* perag Busy slot */
534 xfs_trans_t *tp,
535 int trtype, /* type: add, delete, search */
536 int line) /* source line number */
537{
538 ktrace_enter(xfs_alloc_trace_buf,
539 (void *)(__psint_t)(trtype | (line << 16)),
540 (void *)name,
541 (void *)str,
542 (void *)mp,
543 (void *)(__psunsigned_t)agno,
544 (void *)(__psunsigned_t)agbno,
545 (void *)(__psunsigned_t)len,
546 (void *)(__psint_t)slot,
547 (void *)tp,
548 NULL, NULL, NULL, NULL, NULL, NULL, NULL);
549}
550#endif /* XFS_ALLOC_TRACE */
551
552/*
553 * Allocation group level functions.
554 */
555
556/*
557 * Allocate a variable extent in the allocation group agno.
558 * Type and bno are used to determine where in the allocation group the
559 * extent will start.
560 * Extent's length (returned in *len) will be between minlen and maxlen,
561 * and of the form k * prod + mod unless there's nothing that large.
562 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
563 */
564STATIC int /* error */
565xfs_alloc_ag_vextent(
566 xfs_alloc_arg_t *args) /* argument structure for allocation */
567{
568 int error=0;
569#ifdef XFS_ALLOC_TRACE
570 static char fname[] = "xfs_alloc_ag_vextent";
571#endif
572
573 ASSERT(args->minlen > 0);
574 ASSERT(args->maxlen > 0);
575 ASSERT(args->minlen <= args->maxlen);
576 ASSERT(args->mod < args->prod);
577 ASSERT(args->alignment > 0);
578 /*
579 * Branch to correct routine based on the type.
580 */
581 args->wasfromfl = 0;
582 switch (args->type) {
583 case XFS_ALLOCTYPE_THIS_AG:
584 error = xfs_alloc_ag_vextent_size(args);
585 break;
586 case XFS_ALLOCTYPE_NEAR_BNO:
587 error = xfs_alloc_ag_vextent_near(args);
588 break;
589 case XFS_ALLOCTYPE_THIS_BNO:
590 error = xfs_alloc_ag_vextent_exact(args);
591 break;
592 default:
593 ASSERT(0);
594 /* NOTREACHED */
595 }
596 if (error)
597 return error;
598 /*
599 * If the allocation worked, need to change the agf structure
600 * (and log it), and the superblock.
601 */
602 if (args->agbno != NULLAGBLOCK) {
603 xfs_agf_t *agf; /* allocation group freelist header */
604#ifdef XFS_ALLOC_TRACE
605 xfs_mount_t *mp = args->mp;
606#endif
607 long slen = (long)args->len;
608
609 ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
610 ASSERT(!(args->wasfromfl) || !args->isfl);
611 ASSERT(args->agbno % args->alignment == 0);
612 if (!(args->wasfromfl)) {
613
614 agf = XFS_BUF_TO_AGF(args->agbp);
615 INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len));
616 xfs_trans_agblocks_delta(args->tp,
617 -((long)(args->len)));
618 args->pag->pagf_freeblks -= args->len;
619 ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT)
620 <= INT_GET(agf->agf_length, ARCH_CONVERT));
621 TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
622 xfs_alloc_log_agf(args->tp, args->agbp,
623 XFS_AGF_FREEBLKS);
624 /* search the busylist for these blocks */
625 xfs_alloc_search_busy(args->tp, args->agno,
626 args->agbno, args->len);
627 }
628 if (!args->isfl)
629 xfs_trans_mod_sb(args->tp,
630 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
631 XFS_TRANS_SB_FDBLOCKS, -slen);
632 XFS_STATS_INC(xs_allocx);
633 XFS_STATS_ADD(xs_allocb, args->len);
634 }
635 return 0;
636}
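/*
 * Caller sketch (hypothetical values, for illustration only): a
 * "near" allocation of 8-64 blocks close to block 1000 of this AG
 * would fill in the argument structure roughly as
 *	args.type = XFS_ALLOCTYPE_NEAR_BNO;
 *	args.agbno = 1000;
 *	args.minlen = 8;
 *	args.maxlen = 64;
 *	args.mod = 0;  args.prod = 1;  args.alignment = 1;
 * (tp, mp, agbp, agno and pag already set up), then test
 * args.agbno != NULLAGBLOCK after a 0 return.
 */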
637
638/*
639 * Allocate a variable extent at exactly agno/bno.
640 * Extent's length (returned in *len) will be between minlen and maxlen,
641 * and of the form k * prod + mod unless there's nothing that large.
642 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
643 */
644STATIC int /* error */
645xfs_alloc_ag_vextent_exact(
646 xfs_alloc_arg_t *args) /* allocation argument structure */
647{
648 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
649 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
650 xfs_agblock_t end; /* end of allocated extent */
651 int error;
652 xfs_agblock_t fbno; /* start block of found extent */
653 xfs_agblock_t fend; /* end block of found extent */
654 xfs_extlen_t flen; /* length of found extent */
655#ifdef XFS_ALLOC_TRACE
656 static char fname[] = "xfs_alloc_ag_vextent_exact";
657#endif
658 int i; /* success/failure of operation */
659 xfs_agblock_t maxend; /* end of maximal extent */
660 xfs_agblock_t minend; /* end of minimal extent */
661 xfs_extlen_t rlen; /* length of returned extent */
662
663 ASSERT(args->alignment == 1);
664 /*
665 * Allocate/initialize a cursor for the by-number freespace btree.
666 */
667 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
668 args->agno, XFS_BTNUM_BNO, NULL, 0);
669 /*
670 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
671 * Look for the closest free block <= bno, it must contain bno
672 * if any free block does.
673 */
674 if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
675 goto error0;
676 if (!i) {
677 /*
678 * Didn't find it, return null.
679 */
680 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
681 args->agbno = NULLAGBLOCK;
682 return 0;
683 }
684 /*
685 * Grab the freespace record.
686 */
687 if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
688 goto error0;
689 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
690 ASSERT(fbno <= args->agbno);
691 minend = args->agbno + args->minlen;
692 maxend = args->agbno + args->maxlen;
693 fend = fbno + flen;
694 /*
695 * Give up if the freespace isn't long enough for the minimum request.
696 */
697 if (fend < minend) {
698 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
699 args->agbno = NULLAGBLOCK;
700 return 0;
701 }
702 /*
703 * End of extent will be smaller of the freespace end and the
704 * maximal requested end.
705 */
706 end = XFS_AGBLOCK_MIN(fend, maxend);
707 /*
708 * Fix the length according to mod and prod if given.
709 */
710 args->len = end - args->agbno;
711 xfs_alloc_fix_len(args);
712 if (!xfs_alloc_fix_minleft(args)) {
713 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
714 return 0;
715 }
716 rlen = args->len;
717 ASSERT(args->agbno + rlen <= fend);
718 end = args->agbno + rlen;
719 /*
720 * We are allocating rlen blocks: [agbno .. end)
721 * Allocate/initialize a cursor for the by-size btree.
722 */
723 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
724 args->agno, XFS_BTNUM_CNT, NULL, 0);
725 ASSERT(args->agbno + args->len <=
726 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
727 ARCH_CONVERT));
728 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
729 args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
730 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
731 goto error0;
732 }
733 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
734 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
735 TRACE_ALLOC("normal", args);
736 args->wasfromfl = 0;
737 return 0;
738
739error0:
740 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
741 TRACE_ALLOC("error", args);
742 return error;
743}
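/*
 * Worked example (values invented for illustration): asking for
 * agbno = 500, minlen = 4, maxlen = 16 and finding the free record
 * [490, 40]: fend = 530, minend = 504, maxend = 516.  Since
 * fend >= minend the request fits, and end = min(530, 516) = 516
 * gives len = 16.  Had the record been [490, 10], fend = 500 would
 * fall short of minend and NULLAGBLOCK would be returned.
 */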
744
745/*
746 * Allocate a variable extent near bno in the allocation group agno.
747 * Extent's length (returned in len) will be between minlen and maxlen,
748 * and of the form k * prod + mod unless there's nothing that large.
749 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
750 */
751STATIC int /* error */
752xfs_alloc_ag_vextent_near(
753 xfs_alloc_arg_t *args) /* allocation argument structure */
754{
755 xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */
756 xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */
757 xfs_btree_cur_t *cnt_cur; /* cursor for count btree */
758#ifdef XFS_ALLOC_TRACE
759 static char fname[] = "xfs_alloc_ag_vextent_near";
760#endif
761 xfs_agblock_t gtbno; /* start bno of right side entry */
762 xfs_agblock_t gtbnoa; /* aligned ... */
763 xfs_extlen_t gtdiff; /* difference to right side entry */
764 xfs_extlen_t gtlen; /* length of right side entry */
765 xfs_extlen_t gtlena; /* aligned ... */
766 xfs_agblock_t gtnew; /* useful start bno of right side */
767 int error; /* error code */
768 int i; /* result code, temporary */
769 int j; /* result code, temporary */
770 xfs_agblock_t ltbno; /* start bno of left side entry */
771 xfs_agblock_t ltbnoa; /* aligned ... */
772 xfs_extlen_t ltdiff; /* difference to left side entry */
773 /*REFERENCED*/
774 xfs_agblock_t ltend; /* end bno of left side entry */
775 xfs_extlen_t ltlen; /* length of left side entry */
776 xfs_extlen_t ltlena; /* aligned ... */
777 xfs_agblock_t ltnew; /* useful start bno of left side */
778 xfs_extlen_t rlen; /* length of returned extent */
779#if defined(DEBUG) && defined(__KERNEL__)
780 /*
781 * Randomly don't execute the first algorithm.
782 */
783 int dofirst; /* set to do first algorithm */
784
785 dofirst = random() & 1;
786#endif
787 /*
788 * Get a cursor for the by-size btree.
789 */
790 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
791 args->agno, XFS_BTNUM_CNT, NULL, 0);
792 ltlen = 0;
793 bno_cur_lt = bno_cur_gt = NULL;
794 /*
795 * See if there are any free extents as big as maxlen.
796 */
797 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
798 goto error0;
799 /*
800 * If none, then pick up the last entry in the tree unless the
801 * tree is empty.
802 */
803 if (!i) {
804 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
805 &ltlen, &i)))
806 goto error0;
807 if (i == 0 || ltlen == 0) {
808 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
809 return 0;
810 }
811 ASSERT(i == 1);
812 }
813 args->wasfromfl = 0;
814 /*
815 * First algorithm.
816 * If the requested extent is large wrt the freespaces available
817 * in this a.g., then the cursor will be pointing to a btree entry
818 * near the right edge of the tree. If it's in the last btree leaf
819 * block, then we just examine all the entries in that block
820 * that are big enough, and pick the best one.
821 * This is written as a while loop so we can break out of it,
822 * but we never loop back to the top.
823 */
824 while (xfs_btree_islastblock(cnt_cur, 0)) {
825 xfs_extlen_t bdiff;
826 int besti=0;
827 xfs_extlen_t blen=0;
828 xfs_agblock_t bnew=0;
829
830#if defined(DEBUG) && defined(__KERNEL__)
831 if (!dofirst)
832 break;
833#endif
834 /*
835 * Start from the entry that lookup found, sequence through
836 * all larger free blocks. If we're actually pointing at a
837 * record smaller than maxlen, go to the start of this block,
838 * and skip all those smaller than minlen.
839 */
840 if (ltlen || args->alignment > 1) {
841 cnt_cur->bc_ptrs[0] = 1;
842 do {
843 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
844 &ltlen, &i)))
845 goto error0;
846 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
847 if (ltlen >= args->minlen)
848 break;
849 if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
850 goto error0;
851 } while (i);
852 ASSERT(ltlen >= args->minlen);
853 if (!i)
854 break;
855 }
856 i = cnt_cur->bc_ptrs[0];
857 for (j = 1, blen = 0, bdiff = 0;
858 !error && j && (blen < args->maxlen || bdiff > 0);
859 error = xfs_alloc_increment(cnt_cur, 0, &j)) {
860 /*
861 * For each entry, decide if it's better than
862 * the previous best entry.
863 */
864 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
865 goto error0;
866 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
867 if (!xfs_alloc_compute_aligned(ltbno, ltlen,
868 args->alignment, args->minlen,
869 &ltbnoa, &ltlena))
870 continue;
871 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
872 xfs_alloc_fix_len(args);
873 ASSERT(args->len >= args->minlen);
874 if (args->len < blen)
875 continue;
876 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
877 args->alignment, ltbno, ltlen, &ltnew);
878 if (ltnew != NULLAGBLOCK &&
879 (args->len > blen || ltdiff < bdiff)) {
880 bdiff = ltdiff;
881 bnew = ltnew;
882 blen = args->len;
883 besti = cnt_cur->bc_ptrs[0];
884 }
885 }
886 /*
887 * It didn't work. We COULD be in a case where
888 * there's a good record somewhere, so try again.
889 */
890 if (blen == 0)
891 break;
892 /*
893 * Point at the best entry, and retrieve it again.
894 */
895 cnt_cur->bc_ptrs[0] = besti;
896 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
897 goto error0;
898 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
899 ltend = ltbno + ltlen;
900 ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
901 ARCH_CONVERT));
902 args->len = blen;
903 if (!xfs_alloc_fix_minleft(args)) {
904 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
905 TRACE_ALLOC("nominleft", args);
906 return 0;
907 }
908 blen = args->len;
909 /*
910 * We are allocating starting at bnew for blen blocks.
911 */
912 args->agbno = bnew;
913 ASSERT(bnew >= ltbno);
914 ASSERT(bnew + blen <= ltend);
915 /*
916 * Set up a cursor for the by-bno tree.
917 */
918 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
919 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
920 /*
921 * Fix up the btree entries.
922 */
923 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
924 ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
925 goto error0;
926 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
927 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
928 TRACE_ALLOC("first", args);
929 return 0;
930 }
931 /*
932 * Second algorithm.
933 * Search in the by-bno tree to the left and to the right
934 * simultaneously, until in each case we find a space big enough,
935 * or run into the edge of the tree. When we run into the edge,
936 * we deallocate that cursor.
937 * If both searches succeed, we compare the two spaces and pick
938 * the better one.
939 * With alignment, it's possible for both to fail; the upper
940 * level algorithm that picks allocation groups for allocations
941 * is not supposed to do this.
942 */
943 /*
944 * Allocate and initialize the cursor for the leftward search.
945 */
946 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
947 args->agno, XFS_BTNUM_BNO, NULL, 0);
948 /*
949 * Lookup <= bno to find the leftward search's starting point.
950 */
951 if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
952 goto error0;
953 if (!i) {
954 /*
955 * Didn't find anything; use this cursor for the rightward
956 * search.
957 */
958 bno_cur_gt = bno_cur_lt;
959 bno_cur_lt = NULL;
960 }
961 /*
962 * Found something. Duplicate the cursor for the rightward search.
963 */
964 else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
965 goto error0;
966 /*
967 * Increment the cursor, so we will point at the entry just right
968 * of the leftward entry if any, or to the leftmost entry.
969 */
970 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
971 goto error0;
972 if (!i) {
973 /*
974 * It failed, there are no rightward entries.
975 */
976 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
977 bno_cur_gt = NULL;
978 }
979 /*
980 * Loop going left with the leftward cursor, right with the
981 * rightward cursor, until either both directions give up or
982 * we find an entry at least as big as minlen.
983 */
984 do {
985 if (bno_cur_lt) {
986 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
987 goto error0;
988 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
989 if (xfs_alloc_compute_aligned(ltbno, ltlen,
990 args->alignment, args->minlen,
991 &ltbnoa, &ltlena))
992 break;
993 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
994 goto error0;
995 if (!i) {
996 xfs_btree_del_cursor(bno_cur_lt,
997 XFS_BTREE_NOERROR);
998 bno_cur_lt = NULL;
999 }
1000 }
1001 if (bno_cur_gt) {
1002 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1003 goto error0;
1004 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1005 if (xfs_alloc_compute_aligned(gtbno, gtlen,
1006 args->alignment, args->minlen,
1007 &gtbnoa, &gtlena))
1008 break;
1009 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
1010 goto error0;
1011 if (!i) {
1012 xfs_btree_del_cursor(bno_cur_gt,
1013 XFS_BTREE_NOERROR);
1014 bno_cur_gt = NULL;
1015 }
1016 }
1017 } while (bno_cur_lt || bno_cur_gt);
1018 /*
1019 * Got both cursors still active, need to find better entry.
1020 */
1021 if (bno_cur_lt && bno_cur_gt) {
1022 /*
1023 * Left side is long enough, look for a right side entry.
1024 */
1025 if (ltlena >= args->minlen) {
1026 /*
1027 * Fix up the length.
1028 */
1029 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1030 xfs_alloc_fix_len(args);
1031 rlen = args->len;
1032 ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1033 args->alignment, ltbno, ltlen, &ltnew);
1034 /*
1035 * Not perfect.
1036 */
1037 if (ltdiff) {
1038 /*
1039 * Look until we find a better one, run out of
1040 * space, or run off the end.
1041 */
1042 while (bno_cur_lt && bno_cur_gt) {
1043 if ((error = xfs_alloc_get_rec(
1044 bno_cur_gt, &gtbno,
1045 &gtlen, &i)))
1046 goto error0;
1047 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1048 xfs_alloc_compute_aligned(gtbno, gtlen,
1049 args->alignment, args->minlen,
1050 &gtbnoa, &gtlena);
1051 /*
1052 * The left one is clearly better.
1053 */
1054 if (gtbnoa >= args->agbno + ltdiff) {
1055 xfs_btree_del_cursor(
1056 bno_cur_gt,
1057 XFS_BTREE_NOERROR);
1058 bno_cur_gt = NULL;
1059 break;
1060 }
1061 /*
1062 * If we reach a big enough entry,
1063 * compare the two and pick the best.
1064 */
1065 if (gtlena >= args->minlen) {
1066 args->len =
1067 XFS_EXTLEN_MIN(gtlena,
1068 args->maxlen);
1069 xfs_alloc_fix_len(args);
1070 rlen = args->len;
1071 gtdiff = xfs_alloc_compute_diff(
1072 args->agbno, rlen,
1073 args->alignment,
1074 gtbno, gtlen, &gtnew);
1075 /*
1076 * Right side is better.
1077 */
1078 if (gtdiff < ltdiff) {
1079 xfs_btree_del_cursor(
1080 bno_cur_lt,
1081 XFS_BTREE_NOERROR);
1082 bno_cur_lt = NULL;
1083 }
1084 /*
1085 * Left side is better.
1086 */
1087 else {
1088 xfs_btree_del_cursor(
1089 bno_cur_gt,
1090 XFS_BTREE_NOERROR);
1091 bno_cur_gt = NULL;
1092 }
1093 break;
1094 }
1095 /*
1096 * Fell off the right end.
1097 */
1098 if ((error = xfs_alloc_increment(
1099 bno_cur_gt, 0, &i)))
1100 goto error0;
1101 if (!i) {
1102 xfs_btree_del_cursor(
1103 bno_cur_gt,
1104 XFS_BTREE_NOERROR);
1105 bno_cur_gt = NULL;
1106 break;
1107 }
1108 }
1109 }
1110 /*
1111 * The left side is perfect, trash the right side.
1112 */
1113 else {
1114 xfs_btree_del_cursor(bno_cur_gt,
1115 XFS_BTREE_NOERROR);
1116 bno_cur_gt = NULL;
1117 }
1118 }
1119 /*
1120 * It's the right side that was found first, look left.
1121 */
1122 else {
1123 /*
1124 * Fix up the length.
1125 */
1126 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1127 xfs_alloc_fix_len(args);
1128 rlen = args->len;
1129 gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
1130 args->alignment, gtbno, gtlen, &gtnew);
1131 /*
1132 * Right side entry isn't perfect.
1133 */
1134 if (gtdiff) {
1135 /*
1136 * Look until we find a better one, run out of
1137 * space, or run off the end.
1138 */
1139 while (bno_cur_lt && bno_cur_gt) {
1140 if ((error = xfs_alloc_get_rec(
1141 bno_cur_lt, &ltbno,
1142 &ltlen, &i)))
1143 goto error0;
1144 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1145 xfs_alloc_compute_aligned(ltbno, ltlen,
1146 args->alignment, args->minlen,
1147 &ltbnoa, &ltlena);
1148 /*
1149 * The right one is clearly better.
1150 */
1151 if (ltbnoa <= args->agbno - gtdiff) {
1152 xfs_btree_del_cursor(
1153 bno_cur_lt,
1154 XFS_BTREE_NOERROR);
1155 bno_cur_lt = NULL;
1156 break;
1157 }
1158 /*
1159 * If we reach a big enough entry,
1160 * compare the two and pick the best.
1161 */
1162 if (ltlena >= args->minlen) {
1163 args->len = XFS_EXTLEN_MIN(
1164 ltlena, args->maxlen);
1165 xfs_alloc_fix_len(args);
1166 rlen = args->len;
1167 ltdiff = xfs_alloc_compute_diff(
1168 args->agbno, rlen,
1169 args->alignment,
1170 ltbno, ltlen, &ltnew);
1171 /*
1172 * Left side is better.
1173 */
1174 if (ltdiff < gtdiff) {
1175 xfs_btree_del_cursor(
1176 bno_cur_gt,
1177 XFS_BTREE_NOERROR);
1178 bno_cur_gt = NULL;
1179 }
1180 /*
1181 * Right side is better.
1182 */
1183 else {
1184 xfs_btree_del_cursor(
1185 bno_cur_lt,
1186 XFS_BTREE_NOERROR);
1187 bno_cur_lt = NULL;
1188 }
1189 break;
1190 }
1191 /*
1192 * Fell off the left end.
1193 */
1194 if ((error = xfs_alloc_decrement(
1195 bno_cur_lt, 0, &i)))
1196 goto error0;
1197 if (!i) {
1198 xfs_btree_del_cursor(bno_cur_lt,
1199 XFS_BTREE_NOERROR);
1200 bno_cur_lt = NULL;
1201 break;
1202 }
1203 }
1204 }
1205 /*
1206 * The right side is perfect, trash the left side.
1207 */
1208 else {
1209 xfs_btree_del_cursor(bno_cur_lt,
1210 XFS_BTREE_NOERROR);
1211 bno_cur_lt = NULL;
1212 }
1213 }
1214 }
1215 /*
1216 * If we couldn't get anything, give up.
1217 */
1218 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1219 TRACE_ALLOC("neither", args);
1220 args->agbno = NULLAGBLOCK;
1221 return 0;
1222 }
1223 /*
1224 * At this point we have selected a freespace entry, either to the
1225 * left or to the right. If it's on the right, copy all the
1226 * useful variables to the "left" set so we only have one
1227 * copy of this code.
1228 */
1229 if (bno_cur_gt) {
1230 bno_cur_lt = bno_cur_gt;
1231 bno_cur_gt = NULL;
1232 ltbno = gtbno;
1233 ltbnoa = gtbnoa;
1234 ltlen = gtlen;
1235 ltlena = gtlena;
1236 j = 1;
1237 } else
1238 j = 0;
1239 /*
1240 * Fix up the length and compute the useful address.
1241 */
1242 ltend = ltbno + ltlen;
1243 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1244 xfs_alloc_fix_len(args);
1245 if (!xfs_alloc_fix_minleft(args)) {
1246 TRACE_ALLOC("nominleft", args);
1247 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1248 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1249 return 0;
1250 }
1251 rlen = args->len;
1252 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
1253 ltlen, &ltnew);
1254 ASSERT(ltnew >= ltbno);
1255 ASSERT(ltnew + rlen <= ltend);
1256 ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1257 ARCH_CONVERT));
1258 args->agbno = ltnew;
1259 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1260 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1261 goto error0;
1262 TRACE_ALLOC(j ? "gt" : "lt", args);
1263 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1264 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
1265 return 0;
1266
1267 error0:
1268 TRACE_ALLOC("error", args);
1269 if (cnt_cur != NULL)
1270 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1271 if (bno_cur_lt != NULL)
1272 xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
1273 if (bno_cur_gt != NULL)
1274 xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
1275 return error;
1276}
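/*
 * Illustration of the second algorithm's tie-breaking (values
 * invented): asking near agbno = 1000, suppose the leftward search
 * stops at an entry usable from ltnew = 960 (diff 40) and the
 * rightward search at one usable from gtnew = 1010 (diff 10).
 * Since gtdiff < ltdiff the right-side entry wins, the left cursor
 * is deleted, and the right-side data is copied into the "lt"
 * variables for the common fixup code above.
 */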
1277
1278/*
1279 * Allocate a variable extent anywhere in the allocation group agno.
1280 * Extent's length (returned in len) will be between minlen and maxlen,
1281 * and of the form k * prod + mod unless there's nothing that large.
1282 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1283 */
1284STATIC int /* error */
1285xfs_alloc_ag_vextent_size(
1286 xfs_alloc_arg_t *args) /* allocation argument structure */
1287{
1288 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1289 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1290 int error; /* error result */
1291 xfs_agblock_t fbno; /* start of found freespace */
1292 xfs_extlen_t flen; /* length of found freespace */
1293#ifdef XFS_ALLOC_TRACE
1294 static char fname[] = "xfs_alloc_ag_vextent_size";
1295#endif
1296 int i; /* temp status variable */
1297 xfs_agblock_t rbno; /* returned block number */
1298 xfs_extlen_t rlen; /* length of returned extent */
1299
1300 /*
1301 * Allocate and initialize a cursor for the by-size btree.
1302 */
1303 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
1304 args->agno, XFS_BTNUM_CNT, NULL, 0);
1305 bno_cur = NULL;
1306 /*
1307 * Look for an entry >= maxlen+alignment-1 blocks.
1308 */
1309 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1310 args->maxlen + args->alignment - 1, &i)))
1311 goto error0;
1312 /*
1313 * If none, then pick up the last entry in the tree unless the
1314 * tree is empty.
1315 */
1316 if (!i) {
1317 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
1318 &flen, &i)))
1319 goto error0;
1320 if (i == 0 || flen == 0) {
1321 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1322 TRACE_ALLOC("noentry", args);
1323 return 0;
1324 }
1325 ASSERT(i == 1);
1326 }
1327 /*
1328 * There's a freespace as big as maxlen+alignment-1, get it.
1329 */
1330 else {
1331 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1332 goto error0;
1333 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1334 }
1335 /*
1336 * In the first case above, we got the last entry in the
1337 * by-size btree. Now we check to see if the space hits maxlen
1338 * once aligned; if not, we search left for something better.
1339 * This can't happen in the second case above.
1340 */
1341 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
1342 &rbno, &rlen);
1343 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1344 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1345 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1346 if (rlen < args->maxlen) {
1347 xfs_agblock_t bestfbno;
1348 xfs_extlen_t bestflen;
1349 xfs_agblock_t bestrbno;
1350 xfs_extlen_t bestrlen;
1351
1352 bestrlen = rlen;
1353 bestrbno = rbno;
1354 bestflen = flen;
1355 bestfbno = fbno;
1356 for (;;) {
1357 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
1358 goto error0;
1359 if (i == 0)
1360 break;
1361 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1362 &i)))
1363 goto error0;
1364 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1365 if (flen < bestrlen)
1366 break;
1367 xfs_alloc_compute_aligned(fbno, flen, args->alignment,
1368 args->minlen, &rbno, &rlen);
1369 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1370 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1371 (rlen <= flen && rbno + rlen <= fbno + flen),
1372 error0);
1373 if (rlen > bestrlen) {
1374 bestrlen = rlen;
1375 bestrbno = rbno;
1376 bestflen = flen;
1377 bestfbno = fbno;
1378 if (rlen == args->maxlen)
1379 break;
1380 }
1381 }
1382 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1383 &i)))
1384 goto error0;
1385 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1386 rlen = bestrlen;
1387 rbno = bestrbno;
1388 flen = bestflen;
1389 fbno = bestfbno;
1390 }
1391 args->wasfromfl = 0;
1392 /*
1393 * Fix up the length.
1394 */
1395 args->len = rlen;
1396 xfs_alloc_fix_len(args);
1397 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
1398 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1399 TRACE_ALLOC("nominleft", args);
1400 args->agbno = NULLAGBLOCK;
1401 return 0;
1402 }
1403 rlen = args->len;
1404 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1405 /*
1406 * Allocate and initialize a cursor for the by-block tree.
1407 */
1408 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
1409 args->agno, XFS_BTNUM_BNO, NULL, 0);
1410 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1411 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1412 goto error0;
1413 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1414 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1415 cnt_cur = bno_cur = NULL;
1416 args->len = rlen;
1417 args->agbno = rbno;
1418 XFS_WANT_CORRUPTED_GOTO(
1419 args->agbno + args->len <=
1420 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1421 ARCH_CONVERT),
1422 error0);
1423 TRACE_ALLOC("normal", args);
1424 return 0;
1425
1426error0:
1427 TRACE_ALLOC("error", args);
1428 if (cnt_cur)
1429 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1430 if (bno_cur)
1431 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1432 return error;
1433}
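/*
 * Worked example (values invented for illustration): maxlen = 32,
 * alignment = 8, so the lookup asks for >= 39 blocks.  If only the
 * last record [201, 33] qualifies, alignment pushes rbno to 208 and
 * leaves rlen = 26 < maxlen, so the loop walks left; a neighboring
 * record [160, 32] is already aligned and yields rlen = 32, which
 * hits maxlen and ends the search with the better extent.
 */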
1434
1435/*
1436 * Deal with the case where only small freespaces remain.
1437 * Either return the contents of the last freespace record,
1438 * or allocate space from the freelist if there is nothing in the tree.
1439 */
1440STATIC int /* error */
1441xfs_alloc_ag_vextent_small(
1442 xfs_alloc_arg_t *args, /* allocation argument structure */
1443 xfs_btree_cur_t *ccur, /* by-size cursor */
1444 xfs_agblock_t *fbnop, /* result block number */
1445 xfs_extlen_t *flenp, /* result length */
1446 int *stat) /* status: 0-freelist, 1-normal/none */
1447{
1448 int error;
1449 xfs_agblock_t fbno;
1450 xfs_extlen_t flen;
1451#ifdef XFS_ALLOC_TRACE
1452 static char fname[] = "xfs_alloc_ag_vextent_small";
1453#endif
1454 int i;
1455
1456 if ((error = xfs_alloc_decrement(ccur, 0, &i)))
1457 goto error0;
1458 if (i) {
1459 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
1460 goto error0;
1461 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1462 }
1463 /*
1464 * Nothing in the btree, try the freelist. Make sure
1465 * to respect minleft even when pulling from the
1466 * freelist.
1467 */
1468 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
1469 (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount,
1470 ARCH_CONVERT) > args->minleft)) {
1471 if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
1472 goto error0;
1473 if (fbno != NULLAGBLOCK) {
1474 if (args->userdata) {
1475 xfs_buf_t *bp;
1476
1477 bp = xfs_btree_get_bufs(args->mp, args->tp,
1478 args->agno, fbno, 0);
1479 xfs_trans_binval(args->tp, bp);
1480 }
1481 args->len = 1;
1482 args->agbno = fbno;
1483 XFS_WANT_CORRUPTED_GOTO(
1484 args->agbno + args->len <=
1485 INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
1486 ARCH_CONVERT),
1487 error0);
1488 args->wasfromfl = 1;
1489 TRACE_ALLOC("freelist", args);
1490 *stat = 0;
1491 return 0;
1492 }
1493 /*
1494 * Nothing in the freelist.
1495 */
1496 else
1497 flen = 0;
1498 }
1499 /*
1500 * Can't allocate from the freelist for some reason.
1501 */
1502 else
1503 flen = 0;
1504 /*
1505 * Can't do the allocation, give up.
1506 */
1507 if (flen < args->minlen) {
1508 args->agbno = NULLAGBLOCK;
1509 TRACE_ALLOC("notenough", args);
1510 flen = 0;
1511 }
1512 *fbnop = fbno;
1513 *flenp = flen;
1514 *stat = 1;
1515 TRACE_ALLOC("normal", args);
1516 return 0;
1517
1518error0:
1519 TRACE_ALLOC("error", args);
1520 return error;
1521}
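/*
 * Illustration (hypothetical case): a minlen = 1, alignment = 1,
 * non-freelist allocation that finds an empty by-size tree while
 * agf_flcount > minleft takes one block straight off the freelist,
 * returns it in args->agbno with args->len = 1, and reports
 * *stat = 0 so the caller skips the btree fixup path.
 */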
1522
1523/*
1524 * Free the extent starting at agno/bno for length.
1525 */
1526STATIC int /* error */
1527xfs_free_ag_extent(
1528 xfs_trans_t *tp, /* transaction pointer */
1529 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
1530 xfs_agnumber_t agno, /* allocation group number */
1531 xfs_agblock_t bno, /* starting block number */
1532 xfs_extlen_t len, /* length of extent */
1533 int isfl) /* set if is freelist blocks - no sb acctg */
1534{
1535 xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */
1536 xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */
1537 int error; /* error return value */
1538#ifdef XFS_ALLOC_TRACE
1539 static char fname[] = "xfs_free_ag_extent";
1540#endif
1541 xfs_agblock_t gtbno; /* start of right neighbor block */
1542 xfs_extlen_t gtlen; /* length of right neighbor block */
1543 int haveleft; /* have a left neighbor block */
1544 int haveright; /* have a right neighbor block */
1545 int i; /* temp, result code */
1546 xfs_agblock_t ltbno; /* start of left neighbor block */
1547 xfs_extlen_t ltlen; /* length of left neighbor block */
1548 xfs_mount_t *mp; /* mount point struct for filesystem */
1549 xfs_agblock_t nbno; /* new starting block of freespace */
1550 xfs_extlen_t nlen; /* new length of freespace */
1551
1552 mp = tp->t_mountp;
1553 /*
1554 * Allocate and initialize a cursor for the by-block btree.
1555 */
1556 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
1557 0);
1558 cnt_cur = NULL;
1559 /*
1560 * Look for a neighboring block on the left (lower block numbers)
1561 * that is contiguous with this space.
1562 */
1563 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1564 goto error0;
1565 if (haveleft) {
1566 /*
1567 * There is a block to our left.
1568 */
1569 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1570 goto error0;
1571 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1572 /*
1573 * It's not contiguous, though.
1574 */
1575 if (ltbno + ltlen < bno)
1576 haveleft = 0;
1577 else {
1578 /*
1579 * If this failure happens the request to free this
1580 * space was invalid, it's (partly) already free.
1581 * Very bad.
1582 */
1583 XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
1584 }
1585 }
1586 /*
1587 * Look for a neighboring block on the right (higher block numbers)
1588 * that is contiguous with this space.
1589 */
1590 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
1591 goto error0;
1592 if (haveright) {
1593 /*
1594 * There is a block to our right.
1595 */
1596 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1597 goto error0;
1598 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1599 /*
1600 * It's not contiguous, though.
1601 */
1602 if (bno + len < gtbno)
1603 haveright = 0;
1604 else {
1605 /*
1606 * If this failure happens the request to free this
1607 * space was invalid, it's (partly) already free.
1608 * Very bad.
1609 */
1610 XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
1611 }
1612 }
1613 /*
1614 * Now allocate and initialize a cursor for the by-size tree.
1615 */
1616 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
1617 0);
1618 /*
1619 * Have both left and right contiguous neighbors.
1620 * Merge all three into a single free block.
1621 */
1622 if (haveleft && haveright) {
1623 /*
1624 * Delete the old by-size entry on the left.
1625 */
1626 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1627 goto error0;
1628 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1629 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1630 goto error0;
1631 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1632 /*
1633 * Delete the old by-size entry on the right.
1634 */
1635 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1636 goto error0;
1637 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1638 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1639 goto error0;
1640 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1641 /*
1642 * Delete the old by-block entry for the right block.
1643 */
1644 if ((error = xfs_alloc_delete(bno_cur, &i)))
1645 goto error0;
1646 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1647 /*
1648 * Move the by-block cursor back to the left neighbor.
1649 */
1650 if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
1651 goto error0;
1652 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1653#ifdef DEBUG
1654 /*
1655 * Check that this is the right record: delete didn't
1656 * mangle the cursor.
1657 */
1658 {
1659 xfs_agblock_t xxbno;
1660 xfs_extlen_t xxlen;
1661
1662 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1663 &i)))
1664 goto error0;
1665 XFS_WANT_CORRUPTED_GOTO(
1666 i == 1 && xxbno == ltbno && xxlen == ltlen,
1667 error0);
1668 }
1669#endif
1670 /*
1671 * Update remaining by-block entry to the new, joined block.
1672 */
1673 nbno = ltbno;
1674 nlen = len + ltlen + gtlen;
1675 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1676 goto error0;
1677 }
1678 /*
1679 * Have only a left contiguous neighbor.
1680 * Merge it together with the new freespace.
1681 */
1682 else if (haveleft) {
1683 /*
1684 * Delete the old by-size entry on the left.
1685 */
1686 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1687 goto error0;
1688 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1689 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1690 goto error0;
1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1692 /*
1693 * Back up the by-block cursor to the left neighbor, and
1694 * update its length.
1695 */
1696 if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
1697 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 nbno = ltbno;
1700 nlen = len + ltlen;
1701 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1702 goto error0;
1703 }
1704 /*
1705 * Have only a right contiguous neighbor.
1706 * Merge it together with the new freespace.
1707 */
1708 else if (haveright) {
1709 /*
1710 * Delete the old by-size entry on the right.
1711 */
1712 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1713 goto error0;
1714 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1715 if ((error = xfs_alloc_delete(cnt_cur, &i)))
1716 goto error0;
1717 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1718 /*
1719 * Update the starting block and length of the right
1720 * neighbor in the by-block tree.
1721 */
1722 nbno = bno;
1723 nlen = len + gtlen;
1724 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
1725 goto error0;
1726 }
1727 /*
1728 * No contiguous neighbors.
1729 * Insert the new freespace into the by-block tree.
1730 */
1731 else {
1732 nbno = bno;
1733 nlen = len;
1734 if ((error = xfs_alloc_insert(bno_cur, &i)))
1735 goto error0;
1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1737 }
1738 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1739 bno_cur = NULL;
1740 /*
1741 * In all cases we need to insert the new freespace in the by-size tree.
1742 */
1743 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1744 goto error0;
1745 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1746 if ((error = xfs_alloc_insert(cnt_cur, &i)))
1747 goto error0;
1748 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1749 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1750 cnt_cur = NULL;
1751 /*
1752 * Update the freespace totals in the ag and superblock.
1753 */
1754 {
1755 xfs_agf_t *agf;
1756 xfs_perag_t *pag; /* per allocation group data */
1757
1758 agf = XFS_BUF_TO_AGF(agbp);
1759 pag = &mp->m_perag[agno];
1760 INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len);
1761 xfs_trans_agblocks_delta(tp, len);
1762 pag->pagf_freeblks += len;
1763 XFS_WANT_CORRUPTED_GOTO(
1764 INT_GET(agf->agf_freeblks, ARCH_CONVERT)
1765 <= INT_GET(agf->agf_length, ARCH_CONVERT),
1766 error0);
1767 TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
1768 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1769 if (!isfl)
1770 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1771 XFS_STATS_INC(xs_freex);
1772 XFS_STATS_ADD(xs_freeb, len);
1773 }
1774 TRACE_FREE(haveleft ?
1775 (haveright ? "both" : "left") :
1776 (haveright ? "right" : "none"),
1777 agno, bno, len, isfl);
1778
1779 /*
1780 * Since blocks move to the free list without the coordination
1781 * used in xfs_bmap_finish, we can't allow a block to be available
1782 * for reallocation and non-transaction writing (user data)
1783 * until we know that the transaction that moved it to the free
1784 * list is permanently on disk. We track the blocks by declaring
1785 * these blocks as "busy"; the busy list is maintained on a per-ag
1786 * basis and each transaction records which entries should be removed
1787 * when the iclog commits to disk. If a busy block is allocated,
1788 * the iclog is pushed up to the LSN that freed the block.
1789 */
1790 xfs_alloc_mark_busy(tp, agno, bno, len);
1791 return 0;
1792
1793 error0:
1794 TRACE_FREE("error", agno, bno, len, isfl);
1795 if (bno_cur)
1796 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1797 if (cnt_cur)
1798 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1799 return error;
1800}
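/*
 * Worked coalescing example (values invented for illustration):
 * freeing [100, 20] with an adjacent left record [80, 20] and an
 * adjacent right record [120, 30] takes the haveleft && haveright
 * path: both old by-size entries and the right by-block entry are
 * deleted, the left by-block entry becomes [80, 70], and a single
 * new by-size entry for 70 blocks is inserted.
 */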
1801
1802/*
1803 * Visible (exported) allocation/free functions.
1804 * Some of these are used just by xfs_alloc_btree.c and this file.
1805 */
1806
1807/*
1808 * Compute and fill in value of m_ag_maxlevels.
1809 */
1810void
1811xfs_alloc_compute_maxlevels(
1812 xfs_mount_t *mp) /* file system mount structure */
1813{
1814 int level;
1815 uint maxblocks;
1816 uint maxleafents;
1817 int minleafrecs;
1818 int minnoderecs;
1819
1820 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1821 minleafrecs = mp->m_alloc_mnr[0];
1822 minnoderecs = mp->m_alloc_mnr[1];
1823 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1824 for (level = 1; maxblocks > 1; level++)
1825 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1826 mp->m_ag_maxlevels = level;
1827}
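/*
 * Worked example (block counts invented for illustration): with
 * sb_agblocks = 1048576, maxleafents = 524288; assuming minimum
 * records per block of 125 for both leaves and nodes,
 *	maxblocks = ceil(524288 / 125) = 4195
 *	-> ceil(4195 / 125) = 34 -> ceil(34 / 125) = 1
 * leaving m_ag_maxlevels = 3 after the loop.
 */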
1828
1829/*
1830 * Decide whether to use this allocation group for this allocation.
1831 * If so, fix up the btree freelist's size.
1832 */
1833STATIC int /* error */
1834xfs_alloc_fix_freelist(
1835 xfs_alloc_arg_t *args, /* allocation argument structure */
1836 int flags) /* XFS_ALLOC_FLAG_... */
1837{
1838 xfs_buf_t *agbp; /* agf buffer pointer */
1839 xfs_agf_t *agf; /* a.g. freespace structure pointer */
1840 xfs_buf_t *agflbp;/* agfl buffer pointer */
1841 xfs_agblock_t bno; /* freelist block */
1842 xfs_extlen_t delta; /* new blocks needed in freelist */
1843 int error; /* error result code */
1844 xfs_extlen_t longest;/* longest extent in allocation group */
1845 xfs_mount_t *mp; /* file system mount point structure */
1846 xfs_extlen_t need; /* total blocks needed in freelist */
1847 xfs_perag_t *pag; /* per-ag information structure */
1848 xfs_alloc_arg_t targs; /* local allocation arguments */
1849 xfs_trans_t *tp; /* transaction pointer */
1850
1851 mp = args->mp;
1852
1853 pag = args->pag;
1854 tp = args->tp;
1855 if (!pag->pagf_init) {
1856 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1857 &agbp)))
1858 return error;
1859 if (!pag->pagf_init) {
1860 args->agbp = NULL;
1861 return 0;
1862 }
1863 } else
1864 agbp = NULL;
1865
1866 /* If this is a metadata-preferred pag and we are allocating
1867 * user data, then try somewhere else unless we are being asked
1868 * to try harder at this point.
1869 */
1870 if (pag->pagf_metadata && args->userdata && flags) {
1871 args->agbp = NULL;
1872 return 0;
1873 }
1874
1875 need = XFS_MIN_FREELIST_PAG(pag, mp);
1876 delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
1877 /*
1878 * If it looks like there isn't a long enough extent, or enough
1879 * total blocks, reject it.
1880 */
1881 longest = (pag->pagf_longest > delta) ?
1882 (pag->pagf_longest - delta) :
1883 (pag->pagf_flcount > 0 || pag->pagf_longest > 0);
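	/*
	 * (Descriptive note, not original source text: if the freelist
	 * top-up would consume the longest extent, the ?: falls back to
	 * a boolean - 1 if any free space remains at all, else 0 -
	 * used as a length.)
	 */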
1884 if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
1885 (args->minleft &&
1886 (int)(pag->pagf_freeblks + pag->pagf_flcount -
1887 need - args->total) <
1888 (int)args->minleft)) {
1889 if (agbp)
1890 xfs_trans_brelse(tp, agbp);
1891 args->agbp = NULL;
1892 return 0;
1893 }
1894 /*
1895 * Get the a.g. freespace buffer.
1896	 * Can fail if we're not blocking on locks and the buffer is already held.
1897 */
1898 if (agbp == NULL) {
1899 if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
1900 &agbp)))
1901 return error;
1902 if (agbp == NULL) {
1903 args->agbp = NULL;
1904 return 0;
1905 }
1906 }
1907 /*
1908 * Figure out how many blocks we should have in the freelist.
1909 */
1910 agf = XFS_BUF_TO_AGF(agbp);
1911 need = XFS_MIN_FREELIST(agf, mp);
1912 delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ?
1913 (need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0;
1914 /*
1915	 * If there still isn't a long enough extent or enough total blocks, reject it.
1916 */
1917 longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
1918 longest = (longest > delta) ? (longest - delta) :
1919 (INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0);
1920 if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
1921 (args->minleft &&
1922 (int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) +
1923 INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) <
1924 (int)args->minleft)) {
1925 xfs_trans_brelse(tp, agbp);
1926 args->agbp = NULL;
1927 return 0;
1928 }
1929 /*
1930 * Make the freelist shorter if it's too long.
1931 */
1932 while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) {
1933 xfs_buf_t *bp;
1934
1935 if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
1936 return error;
1937 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
1938 return error;
1939 bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
1940 xfs_trans_binval(tp, bp);
1941 }
1942 /*
1943 * Initialize the args structure.
1944 */
1945 targs.tp = tp;
1946 targs.mp = mp;
1947 targs.agbp = agbp;
1948 targs.agno = args->agno;
1949 targs.mod = targs.minleft = targs.wasdel = targs.userdata =
1950 targs.minalignslop = 0;
1951 targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
1952 targs.type = XFS_ALLOCTYPE_THIS_AG;
1953 targs.pag = pag;
1954 if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
1955 return error;
1956 /*
1957 * Make the freelist longer if it's too short.
1958 */
1959 while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) {
1960 targs.agbno = 0;
1961 targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT);
1962 /*
1963 * Allocate as many blocks as possible at once.
1964 */
1965 if ((error = xfs_alloc_ag_vextent(&targs)))
1966 return error;
1967 /*
1968 * Stop if we run out. Won't happen if callers are obeying
1969 * the restrictions correctly. Can happen for free calls
1970 * on a completely full ag.
1971 */
1972 if (targs.agbno == NULLAGBLOCK)
1973 break;
1974 /*
1975 * Put each allocated block on the list.
1976 */
1977 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
1978 if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
1979 bno)))
1980 return error;
1981 }
1982 }
1983 args->agbp = agbp;
1984 return 0;
1985}
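/*
 * Summary of the flow above, paraphrasing (not original source text):
 * the AG is first vetted against minlen/minleft using the cached pagf
 * counters, re-checked against the real AGF once the buffer is held,
 * and the freelist is then resized toward XFS_MIN_FREELIST(): blocks
 * are freed back to the btrees one at a time when it is too long, and
 * allocated with XFS_ALLOCTYPE_THIS_AG and enqueued one block at a
 * time when it is too short.
 */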
1986
1987/*
1988 * Get a block from the freelist.
1989 * Returns with the buffer for the block gotten.
1990 */
1991int /* error */
1992xfs_alloc_get_freelist(
1993 xfs_trans_t *tp, /* transaction pointer */
1994 xfs_buf_t *agbp, /* buffer containing the agf structure */
1995 xfs_agblock_t *bnop) /* block address retrieved from freelist */
1996{
1997 xfs_agf_t *agf; /* a.g. freespace structure */
1998 xfs_agfl_t *agfl; /* a.g. freelist structure */
1999 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
2000 xfs_agblock_t bno; /* block number returned */
2001 int error;
2002#ifdef XFS_ALLOC_TRACE
2003 static char fname[] = "xfs_alloc_get_freelist";
2004#endif
2005 xfs_mount_t *mp; /* mount structure */
2006 xfs_perag_t *pag; /* per allocation group data */
2007
2008 agf = XFS_BUF_TO_AGF(agbp);
2009 /*
2010 * Freelist is empty, give up.
2011 */
2012 if (!agf->agf_flcount) {
2013 *bnop = NULLAGBLOCK;
2014 return 0;
2015 }
2016 /*
2017 * Read the array of free blocks.
2018 */
2019 mp = tp->t_mountp;
2020 if ((error = xfs_alloc_read_agfl(mp, tp,
2021 INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
2022 return error;
2023 agfl = XFS_BUF_TO_AGFL(agflbp);
2024 /*
2025 * Get the block number and update the data structures.
2026 */
2027 bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT);
2028 INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1);
2029 xfs_trans_brelse(tp, agflbp);
2030 if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE(mp))
2031 agf->agf_flfirst = 0;
2032 pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
2033 INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1);
2034 xfs_trans_agflist_delta(tp, -1);
2035 pag->pagf_flcount--;
2036 TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
2037 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
2038 *bnop = bno;
2039
2040 /*
2041 * As blocks are freed, they are added to the per-ag busy list
2042 * and remain there until the freeing transaction is committed to
2043 * disk. Now that we have allocated blocks, this list must be
2044 * searched to see if a block is being reused. If one is, then
2045 * the freeing transaction must be pushed to disk NOW by forcing
2046	 * to disk all iclogs up to that transaction's LSN.
2047 */
2048 xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
2049 return 0;
2050}
2051
2052/*
2053 * Log the given fields from the agf structure.
2054 */
2055void
2056xfs_alloc_log_agf(
2057 xfs_trans_t *tp, /* transaction pointer */
2058 xfs_buf_t *bp, /* buffer for a.g. freelist header */
2059 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2060{
2061 int first; /* first byte offset */
2062 int last; /* last byte offset */
2063 static const short offsets[] = {
2064 offsetof(xfs_agf_t, agf_magicnum),
2065 offsetof(xfs_agf_t, agf_versionnum),
2066 offsetof(xfs_agf_t, agf_seqno),
2067 offsetof(xfs_agf_t, agf_length),
2068 offsetof(xfs_agf_t, agf_roots[0]),
2069 offsetof(xfs_agf_t, agf_levels[0]),
2070 offsetof(xfs_agf_t, agf_flfirst),
2071 offsetof(xfs_agf_t, agf_fllast),
2072 offsetof(xfs_agf_t, agf_flcount),
2073 offsetof(xfs_agf_t, agf_freeblks),
2074 offsetof(xfs_agf_t, agf_longest),
2075 sizeof(xfs_agf_t)
2076 };
2077
2078 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2079 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2080}
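/*
 * Illustrative example, assuming the usual xfs_btree_offsets()
 * behaviour of turning the lowest and highest bits set in the mask
 * into one contiguous byte range via the table above: a call like
 *
 *	xfs_alloc_log_agf(tp, bp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
 *
 * logs from offsetof(agf_flfirst) through offsetof(agf_freeblks) - 1,
 * which also covers agf_fllast sitting between the two fields.
 */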
2081
2082/*
2083 * Interface for inode allocation to force the pag data to be initialized.
2084 */
2085int /* error */
2086xfs_alloc_pagf_init(
2087 xfs_mount_t *mp, /* file system mount structure */
2088 xfs_trans_t *tp, /* transaction pointer */
2089 xfs_agnumber_t agno, /* allocation group number */
2090 int flags) /* XFS_ALLOC_FLAGS_... */
2091{
2092 xfs_buf_t *bp;
2093 int error;
2094
2095 if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
2096 return error;
2097 if (bp)
2098 xfs_trans_brelse(tp, bp);
2099 return 0;
2100}
2101
2102/*
2103 * Put the block on the freelist for the allocation group.
2104 */
2105int /* error */
2106xfs_alloc_put_freelist(
2107 xfs_trans_t *tp, /* transaction pointer */
2108 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2109 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2110 xfs_agblock_t bno) /* block being freed */
2111{
2112 xfs_agf_t *agf; /* a.g. freespace structure */
2113 xfs_agfl_t *agfl; /* a.g. free block array */
2114 xfs_agblock_t *blockp;/* pointer to array entry */
2115 int error;
2116#ifdef XFS_ALLOC_TRACE
2117 static char fname[] = "xfs_alloc_put_freelist";
2118#endif
2119 xfs_mount_t *mp; /* mount structure */
2120 xfs_perag_t *pag; /* per allocation group data */
2121
2122 agf = XFS_BUF_TO_AGF(agbp);
2123 mp = tp->t_mountp;
2124
2125 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2126 INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
2127 return error;
2128 agfl = XFS_BUF_TO_AGFL(agflbp);
2129 INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1);
2130 if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE(mp))
2131 agf->agf_fllast = 0;
2132 pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
2133 INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1);
2134 xfs_trans_agflist_delta(tp, 1);
2135 pag->pagf_flcount++;
2136 ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp));
2137 blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)];
2138 INT_SET(*blockp, ARCH_CONVERT, bno);
2139 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2140 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
2141 xfs_trans_log_buf(tp, agflbp,
2142 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
2143 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
2144 sizeof(xfs_agblock_t) - 1));
2145 return 0;
2146}
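/*
 * Descriptive note, not original source text: together with
 * xfs_alloc_get_freelist() above, the AGFL is treated as a circular
 * buffer of XFS_AGFL_SIZE(mp) slots - gets advance agf_flfirst, puts
 * advance agf_fllast, both wrap to 0, and agf_flcount counts the
 * occupied slots between them.
 */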
2147
2148/*
2149 * Read in the allocation group header (free/alloc section).
2150 */
2151int /* error */
2152xfs_alloc_read_agf(
2153 xfs_mount_t *mp, /* mount point structure */
2154 xfs_trans_t *tp, /* transaction pointer */
2155 xfs_agnumber_t agno, /* allocation group number */
2156 int flags, /* XFS_ALLOC_FLAG_... */
2157 xfs_buf_t **bpp) /* buffer for the ag freelist header */
2158{
2159 xfs_agf_t *agf; /* ag freelist header */
2160 int agf_ok; /* set if agf is consistent */
2161 xfs_buf_t *bp; /* return value */
2162 xfs_perag_t *pag; /* per allocation group data */
2163 int error;
2164
2165 ASSERT(agno != NULLAGNUMBER);
2166 error = xfs_trans_read_buf(
2167 mp, tp, mp->m_ddev_targp,
2168 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2169 XFS_FSS_TO_BB(mp, 1),
2170 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2171 &bp);
2172 if (error)
2173 return error;
2174 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
2175 if (!bp) {
2176 *bpp = NULL;
2177 return 0;
2178 }
2179 /*
2180 * Validate the magic number of the agf block.
2181 */
2182 agf = XFS_BUF_TO_AGF(bp);
2183 agf_ok =
2184 INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC &&
2185 XFS_AGF_GOOD_VERSION(
2186 INT_GET(agf->agf_versionnum, ARCH_CONVERT)) &&
2187 INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
2188 INT_GET(agf->agf_length, ARCH_CONVERT) &&
2189 INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) &&
2190 INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE(mp) &&
2191 INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE(mp);
2192 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2193 XFS_RANDOM_ALLOC_READ_AGF))) {
2194 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2195 XFS_ERRLEVEL_LOW, mp, agf);
2196 xfs_trans_brelse(tp, bp);
2197 return XFS_ERROR(EFSCORRUPTED);
2198 }
2199 pag = &mp->m_perag[agno];
2200 if (!pag->pagf_init) {
2201 pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT);
2202 pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT);
2203 pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
2204 pag->pagf_levels[XFS_BTNUM_BNOi] =
2205 INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT);
2206 pag->pagf_levels[XFS_BTNUM_CNTi] =
2207 INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT);
2208 spinlock_init(&pag->pagb_lock, "xfspagb");
2209 pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
2210 sizeof(xfs_perag_busy_t), KM_SLEEP);
2211 pag->pagf_init = 1;
2212 }
2213#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT));
2216 ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT));
2217 ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
2219 INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT));
2220 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
2221 INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT));
2222 }
2223#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0;
2227}
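/*
 * Descriptive note, not original source text: the first AGF read for
 * an AG seeds the in-core xfs_perag_t with mirror copies of the
 * free-space counters and btree levels and allocates its
 * XFS_PAGB_NUM_SLOTS busy-extent slots; subsequent reads merely
 * cross-check that cache against the disk copy under DEBUG.
 */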
2228
2229/*
2230 * Allocate an extent (variable-size).
2231 * Depending on the allocation type, we either look in a single allocation
2232 * group or loop over the allocation groups to find the result.
2233 */
2234int /* error */
2235xfs_alloc_vextent(
2236 xfs_alloc_arg_t *args) /* allocation argument structure */
2237{
2238 xfs_agblock_t agsize; /* allocation group size */
2239 int error;
2240 int flags; /* XFS_ALLOC_FLAG_... locking flags */
2241#ifdef XFS_ALLOC_TRACE
2242 static char fname[] = "xfs_alloc_vextent";
2243#endif
2244 xfs_extlen_t minleft;/* minimum left value, temp copy */
2245 xfs_mount_t *mp; /* mount structure pointer */
2246 xfs_agnumber_t sagno; /* starting allocation group number */
2247 xfs_alloctype_t type; /* input allocation type */
2248 int bump_rotor = 0;
2249 int no_min = 0;
2250 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
2251
2252 mp = args->mp;
2253 type = args->otype = args->type;
2254 args->agbno = NULLAGBLOCK;
2255 /*
2256 * Just fix this up, for the case where the last a.g. is shorter
2257 * (or there's only one a.g.) and the caller couldn't easily figure
2258 * that out (xfs_bmap_alloc).
2259 */
2260 agsize = mp->m_sb.sb_agblocks;
2261 if (args->maxlen > agsize)
2262 args->maxlen = agsize;
2263 if (args->alignment == 0)
2264 args->alignment = 1;
2265 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
2266 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
2267 ASSERT(args->minlen <= args->maxlen);
2268 ASSERT(args->minlen <= agsize);
2269 ASSERT(args->mod < args->prod);
2270 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
2271 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
2272 args->minlen > args->maxlen || args->minlen > agsize ||
2273 args->mod >= args->prod) {
2274 args->fsbno = NULLFSBLOCK;
2275 TRACE_ALLOC("badargs", args);
2276 return 0;
2277 }
2278 minleft = args->minleft;
2279
2280 switch (type) {
2281 case XFS_ALLOCTYPE_THIS_AG:
2282 case XFS_ALLOCTYPE_NEAR_BNO:
2283 case XFS_ALLOCTYPE_THIS_BNO:
2284 /*
2285 * These three force us into a single a.g.
2286 */
2287 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2288 down_read(&mp->m_peraglock);
2289 args->pag = &mp->m_perag[args->agno];
2290 args->minleft = 0;
2291 error = xfs_alloc_fix_freelist(args, 0);
2292 args->minleft = minleft;
2293 if (error) {
2294 TRACE_ALLOC("nofix", args);
2295 goto error0;
2296 }
2297 if (!args->agbp) {
2298 up_read(&mp->m_peraglock);
2299 TRACE_ALLOC("noagbp", args);
2300 break;
2301 }
2302 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2303 if ((error = xfs_alloc_ag_vextent(args)))
2304 goto error0;
2305 up_read(&mp->m_peraglock);
2306 break;
2307 case XFS_ALLOCTYPE_START_BNO:
2308 /*
2309 * Try near allocation first, then anywhere-in-ag after
2310 * the first a.g. fails.
2311 */
2312 if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
2313 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
2314 args->fsbno = XFS_AGB_TO_FSB(mp,
2315 ((mp->m_agfrotor / rotorstep) %
2316 mp->m_sb.sb_agcount), 0);
2317 bump_rotor = 1;
2318 }
2319 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
2320 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2321 /* FALLTHROUGH */
2322 case XFS_ALLOCTYPE_ANY_AG:
2323 case XFS_ALLOCTYPE_START_AG:
2324 case XFS_ALLOCTYPE_FIRST_AG:
2325 /*
2326 * Rotate through the allocation groups looking for a winner.
2327 */
2328 if (type == XFS_ALLOCTYPE_ANY_AG) {
2329 /*
2330 * Start with the last place we left off.
2331 */
2332 args->agno = sagno = (mp->m_agfrotor / rotorstep) %
2333 mp->m_sb.sb_agcount;
2334 args->type = XFS_ALLOCTYPE_THIS_AG;
2335 flags = XFS_ALLOC_FLAG_TRYLOCK;
2336 } else if (type == XFS_ALLOCTYPE_FIRST_AG) {
2337 /*
2338 * Start with allocation group given by bno.
2339 */
2340 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2341 args->type = XFS_ALLOCTYPE_THIS_AG;
2342 sagno = 0;
2343 flags = 0;
2344 } else {
2345 if (type == XFS_ALLOCTYPE_START_AG)
2346 args->type = XFS_ALLOCTYPE_THIS_AG;
2347 /*
2348 * Start with the given allocation group.
2349 */
2350 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
2351 flags = XFS_ALLOC_FLAG_TRYLOCK;
2352 }
2353 /*
2354 * Loop over allocation groups twice; first time with
2355 * trylock set, second time without.
2356 */
2357 down_read(&mp->m_peraglock);
2358 for (;;) {
2359 args->pag = &mp->m_perag[args->agno];
2360 if (no_min) args->minleft = 0;
2361 error = xfs_alloc_fix_freelist(args, flags);
2362 args->minleft = minleft;
2363 if (error) {
2364 TRACE_ALLOC("nofix", args);
2365 goto error0;
2366 }
2367 /*
2368 * If we get a buffer back then the allocation will fly.
2369 */
2370 if (args->agbp) {
2371 if ((error = xfs_alloc_ag_vextent(args)))
2372 goto error0;
2373 break;
2374 }
2375 TRACE_ALLOC("loopfailed", args);
2376 /*
2377 * Didn't work, figure out the next iteration.
2378 */
2379 if (args->agno == sagno &&
2380 type == XFS_ALLOCTYPE_START_BNO)
2381 args->type = XFS_ALLOCTYPE_THIS_AG;
2382 if (++(args->agno) == mp->m_sb.sb_agcount)
2383 args->agno = 0;
2384 /*
2385 * Reached the starting a.g., must either be done
2386 * or switch to non-trylock mode.
2387 */
2388 if (args->agno == sagno) {
2389 if (no_min == 1) {
2390 args->agbno = NULLAGBLOCK;
2391 TRACE_ALLOC("allfailed", args);
2392 break;
2393 }
2394 if (flags == 0) {
2395 no_min = 1;
2396 } else {
2397 flags = 0;
2398 if (type == XFS_ALLOCTYPE_START_BNO) {
2399 args->agbno = XFS_FSB_TO_AGBNO(mp,
2400 args->fsbno);
2401 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2402 }
2403 }
2404 }
2405 }
2406 up_read(&mp->m_peraglock);
2407 if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
2408 if (args->agno == sagno)
2409 mp->m_agfrotor = (mp->m_agfrotor + 1) %
2410 (mp->m_sb.sb_agcount * rotorstep);
2411 else
2412 mp->m_agfrotor = (args->agno * rotorstep + 1) %
2413 (mp->m_sb.sb_agcount * rotorstep);
2414 }
2415 break;
2416 default:
2417 ASSERT(0);
2418 /* NOTREACHED */
2419 }
2420 if (args->agbno == NULLAGBLOCK)
2421 args->fsbno = NULLFSBLOCK;
2422 else {
2423 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
2424#ifdef DEBUG
2425 ASSERT(args->len >= args->minlen);
2426 ASSERT(args->len <= args->maxlen);
2427 ASSERT(args->agbno % args->alignment == 0);
2428 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
2429 args->len);
2430#endif
2431 }
2432 return 0;
2433error0:
2434 up_read(&mp->m_peraglock);
2435 return error;
2436}
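/*
 * Worked rotor example with made-up numbers (sb_agcount and
 * xfs_rotorstep below are illustrative): with sb_agcount = 4 and
 * xfs_rotorstep = 8, the starting AG is (m_agfrotor / 8) % 4.  A
 * search ending in the starting AG advances m_agfrotor by one, so the
 * preferred AG steps forward only every 8 such allocations; a search
 * ending in some other AG a sets m_agfrotor to a * 8 + 1, making a
 * the next starting AG.
 */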
2437
2438/*
2439 * Free an extent.
2440 * Just break up the extent address and hand off to xfs_free_ag_extent
2441 * after fixing up the freelist.
2442 */
2443int /* error */
2444xfs_free_extent(
2445 xfs_trans_t *tp, /* transaction pointer */
2446 xfs_fsblock_t bno, /* starting block number of extent */
2447 xfs_extlen_t len) /* length of extent */
2448{
2449#ifdef DEBUG
2450 xfs_agf_t *agf; /* a.g. freespace header */
2451#endif
2452 xfs_alloc_arg_t args; /* allocation argument structure */
2453 int error;
2454
2455 ASSERT(len != 0);
2456 args.tp = tp;
2457 args.mp = tp->t_mountp;
2458 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2459 ASSERT(args.agno < args.mp->m_sb.sb_agcount);
2460 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2461 args.alignment = 1;
2462 args.minlen = args.minleft = args.minalignslop = 0;
2463 down_read(&args.mp->m_peraglock);
2464 args.pag = &args.mp->m_perag[args.agno];
2465 if ((error = xfs_alloc_fix_freelist(&args, 0)))
2466 goto error0;
2467#ifdef DEBUG
2468 ASSERT(args.agbp != NULL);
2469 agf = XFS_BUF_TO_AGF(args.agbp);
2470 ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT));
2471#endif
2472 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
2473 len, 0);
2474error0:
2475 up_read(&args.mp->m_peraglock);
2476 return error;
2477}
2478
2479
2480/*
2481 * AG Busy list management
2482 * The busy list contains block ranges that have been freed but whose
2483 * transactions have not yet hit disk. If any block listed in a busy
2484 * list is reused, the transaction that freed it must be forced to disk
2485 * before continuing to use the block.
2486 *
2487 * xfs_alloc_mark_busy - add to the per-ag busy list
2488 * xfs_alloc_clear_busy - remove an item from the per-ag busy list
2489 */
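/*
 * Interval sketch for xfs_alloc_search_busy() below (paraphrase, not
 * original source text): the extent [bno, bno + len - 1] misses a busy
 * extent [busy_start, busy_start + busy_length - 1] exactly when
 *
 *	bno > busy_start + busy_length - 1 ||
 *	bno + len - 1 < busy_start
 *
 * so anything else counts as an overlap and forces the log.
 */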
2490void
2491xfs_alloc_mark_busy(xfs_trans_t *tp,
2492 xfs_agnumber_t agno,
2493 xfs_agblock_t bno,
2494 xfs_extlen_t len)
2495{
2496 xfs_mount_t *mp;
2497 xfs_perag_busy_t *bsy;
2498 int n;
2499 SPLDECL(s);
2500
2501 mp = tp->t_mountp;
2502 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2503
2504 /* search pagb_list for an open slot */
2505 for (bsy = mp->m_perag[agno].pagb_list, n = 0;
2506 n < XFS_PAGB_NUM_SLOTS;
2507 bsy++, n++) {
2508 if (bsy->busy_tp == NULL) {
2509 break;
2510 }
2511 }
2512
2513 if (n < XFS_PAGB_NUM_SLOTS) {
2514 bsy = &mp->m_perag[agno].pagb_list[n];
2515 mp->m_perag[agno].pagb_count++;
2516 TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
2517 bsy->busy_start = bno;
2518 bsy->busy_length = len;
2519 bsy->busy_tp = tp;
2520 xfs_trans_add_busy(tp, agno, n);
2521 } else {
2522 TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
2523 /*
2524 * The busy list is full! Since it is now not possible to
2525		 * track the freed block, make this a synchronous transaction
2526		 * to ensure that the block is not reused before this
2527 * transaction commits.
2528 */
2529 xfs_trans_set_sync(tp);
2530 }
2531
2532 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2533}
2534
2535void
2536xfs_alloc_clear_busy(xfs_trans_t *tp,
2537 xfs_agnumber_t agno,
2538 int idx)
2539{
2540 xfs_mount_t *mp;
2541 xfs_perag_busy_t *list;
2542 SPLDECL(s);
2543
2544 mp = tp->t_mountp;
2545
2546 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2547 list = mp->m_perag[agno].pagb_list;
2548
2549 ASSERT(idx < XFS_PAGB_NUM_SLOTS);
2550 if (list[idx].busy_tp == tp) {
2551 TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
2552 list[idx].busy_tp = NULL;
2553 mp->m_perag[agno].pagb_count--;
2554 } else {
2555 TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
2556 }
2557
2558 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2559}
2560
2561
2562/*
2563 * Returns the busy-list slot index of a busy extent overlapping (agno, bno, len), or -1 if none.
2564 */
2565int
2566xfs_alloc_search_busy(xfs_trans_t *tp,
2567 xfs_agnumber_t agno,
2568 xfs_agblock_t bno,
2569 xfs_extlen_t len)
2570{
2571 xfs_mount_t *mp;
2572 xfs_perag_busy_t *bsy;
2573 int n;
2574 xfs_agblock_t uend, bend;
2575 xfs_lsn_t lsn;
2576 int cnt;
2577 SPLDECL(s);
2578
2579 mp = tp->t_mountp;
2580
2581 s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
2582 cnt = mp->m_perag[agno].pagb_count;
2583
2584 uend = bno + len - 1;
2585
2586	/* search pagb_list for a busy extent overlapping this range, skipping open slots */
2587 for (bsy = mp->m_perag[agno].pagb_list, n = 0;
2588 cnt; bsy++, n++) {
2589
2590 /*
2591		 * does (bno, len) overlap (busy_start, busy_length)?
2592 */
2593 if (bsy->busy_tp != NULL) {
2594 bend = bsy->busy_start + bsy->busy_length - 1;
2595 if ((bno > bend) ||
2596 (uend < bsy->busy_start)) {
2597 cnt--;
2598 } else {
2599 TRACE_BUSYSEARCH("xfs_alloc_search_busy",
2600 "found1", agno, bno, len, n,
2601 tp);
2602 break;
2603 }
2604 }
2605 }
2606
2607 /*
2608 * If a block was found, force the log through the LSN of the
2609	 * transaction that freed the block.
2610 */
2611 if (cnt) {
2612 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
2613 lsn = bsy->busy_tp->t_commit_lsn;
2614 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2615 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
2616 } else {
2617 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
2618 n = -1;
2619 mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
2620 }
2621
2622 return n;
2623}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
new file mode 100644
index 000000000000..72329c86351c
--- /dev/null
+++ b/fs/xfs/xfs_alloc.h
@@ -0,0 +1,203 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ALLOC_H__
33#define __XFS_ALLOC_H__
34
35struct xfs_buf;
36struct xfs_mount;
37struct xfs_perag;
38struct xfs_trans;
39
40/*
41 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
42 */
43typedef enum xfs_alloctype
44{
45 XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */
46 XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */
47 XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */
48 XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */
49 XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */
50 XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */
51 XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */
52} xfs_alloctype_t;
53
54/*
55 * Flags for xfs_alloc_fix_freelist.
56 */
57#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
58
59/*
60 * Argument structure for xfs_alloc routines.
61 * This is turned into a structure to avoid having 20 arguments passed
62 * down several levels of the stack.
63 */
64typedef struct xfs_alloc_arg {
65 struct xfs_trans *tp; /* transaction pointer */
66 struct xfs_mount *mp; /* file system mount point */
67 struct xfs_buf *agbp; /* buffer for a.g. freelist header */
68 struct xfs_perag *pag; /* per-ag struct for this agno */
69 xfs_fsblock_t fsbno; /* file system block number */
70 xfs_agnumber_t agno; /* allocation group number */
71 xfs_agblock_t agbno; /* allocation group-relative block # */
72 xfs_extlen_t minlen; /* minimum size of extent */
73 xfs_extlen_t maxlen; /* maximum size of extent */
74 xfs_extlen_t mod; /* mod value for extent size */
75 xfs_extlen_t prod; /* prod value for extent size */
76 xfs_extlen_t minleft; /* min blocks must be left after us */
77 xfs_extlen_t total; /* total blocks needed in xaction */
78 xfs_extlen_t alignment; /* align answer to multiple of this */
79 xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */
80 xfs_extlen_t len; /* output: actual size of extent */
81 xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */
82 xfs_alloctype_t otype; /* original allocation type */
83 char wasdel; /* set if allocation was prev delayed */
84 char wasfromfl; /* set if allocation is from freelist */
85	char		isfl;		/* set if these are freelist blocks (no sb accounting) */
86 char userdata; /* set if this is user data */
87} xfs_alloc_arg_t;
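/*
 * Hypothetical usage sketch (the field values are illustrative, not a
 * required recipe): set tp, mp, fsbno (target block), minlen/maxlen,
 * prod = 1, mod = 0, alignment = 1, total, and a type such as
 * XFS_ALLOCTYPE_START_BNO, then call xfs_alloc_vextent(); on return
 * fsbno is NULLFSBLOCK on failure, otherwise fsbno/len describe the
 * extent actually allocated.
 */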
88
89/*
90 * Defines for userdata
91 */
92#define XFS_ALLOC_USERDATA		1	/* allocation is for user data */
93#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
94
95
96#ifdef __KERNEL__
97
98#if defined(XFS_ALLOC_TRACE)
99/*
100 * Allocation tracing buffer size.
101 */
102#define XFS_ALLOC_TRACE_SIZE 4096
103extern ktrace_t *xfs_alloc_trace_buf;
104
105/*
106 * Types for alloc tracing.
107 */
108#define XFS_ALLOC_KTRACE_ALLOC 1
109#define XFS_ALLOC_KTRACE_FREE 2
110#define XFS_ALLOC_KTRACE_MODAGF 3
111#define XFS_ALLOC_KTRACE_BUSY 4
112#define XFS_ALLOC_KTRACE_UNBUSY 5
113#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
114#endif
115
116/*
117 * Compute and fill in value of m_ag_maxlevels.
118 */
119void
120xfs_alloc_compute_maxlevels(
121 struct xfs_mount *mp); /* file system mount structure */
122
123/*
124 * Get a block from the freelist.
125 * Returns with the buffer for the block gotten.
126 */
127int /* error */
128xfs_alloc_get_freelist(
129 struct xfs_trans *tp, /* transaction pointer */
130 struct xfs_buf *agbp, /* buffer containing the agf structure */
131 xfs_agblock_t *bnop); /* block address retrieved from freelist */
132
133/*
134 * Log the given fields from the agf structure.
135 */
136void
137xfs_alloc_log_agf(
138 struct xfs_trans *tp, /* transaction pointer */
139 struct xfs_buf *bp, /* buffer for a.g. freelist header */
140 int fields);/* mask of fields to be logged (XFS_AGF_...) */
141
142/*
143 * Interface for inode allocation to force the pag data to be initialized.
144 */
145int /* error */
146xfs_alloc_pagf_init(
147 struct xfs_mount *mp, /* file system mount structure */
148 struct xfs_trans *tp, /* transaction pointer */
149 xfs_agnumber_t agno, /* allocation group number */
150 int flags); /* XFS_ALLOC_FLAGS_... */
151
152/*
153 * Put the block on the freelist for the allocation group.
154 */
155int /* error */
156xfs_alloc_put_freelist(
157 struct xfs_trans *tp, /* transaction pointer */
158 struct xfs_buf *agbp, /* buffer for a.g. freelist header */
159 struct xfs_buf *agflbp,/* buffer for a.g. free block array */
160 xfs_agblock_t bno); /* block being freed */
161
162/*
163 * Read in the allocation group header (free/alloc section).
164 */
165int /* error */
166xfs_alloc_read_agf(
167 struct xfs_mount *mp, /* mount point structure */
168 struct xfs_trans *tp, /* transaction pointer */
169 xfs_agnumber_t agno, /* allocation group number */
170 int flags, /* XFS_ALLOC_FLAG_... */
171 struct xfs_buf **bpp); /* buffer for the ag freelist header */
172
173/*
174 * Allocate an extent (variable-size).
175 */
176int /* error */
177xfs_alloc_vextent(
178 xfs_alloc_arg_t *args); /* allocation argument structure */
179
180/*
181 * Free an extent.
182 */
183int /* error */
184xfs_free_extent(
185 struct xfs_trans *tp, /* transaction pointer */
186 xfs_fsblock_t bno, /* starting block number of extent */
187 xfs_extlen_t len); /* length of extent */
188
189void
190xfs_alloc_mark_busy(xfs_trans_t *tp,
191 xfs_agnumber_t agno,
192 xfs_agblock_t bno,
193 xfs_extlen_t len);
194
195void
196xfs_alloc_clear_busy(xfs_trans_t *tp,
197 xfs_agnumber_t ag,
198 int idx);
199
200
201#endif /* __KERNEL__ */
202
203#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..e0355a12d946
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -0,0 +1,2204 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free space allocation for XFS.
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_alloc_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_btree.h"
52#include "xfs_ialloc.h"
53#include "xfs_alloc.h"
54#include "xfs_error.h"
55
56/*
57 * Prototypes for internal functions.
58 */
59
60STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
61STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
62STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
63STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
64STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
65STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
66STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
67STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
68 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
69STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
70
71/*
72 * Internal functions.
73 */
74
75/*
76 * Single level of the xfs_alloc_delete record deletion routine.
77 * Delete record pointed to by cur/level.
78 * Remove the record from its block then rebalance the tree.
79 * Sets *stat to 0 for failure, 1 for done, 2 to go on to the next level.
80 */
81STATIC int /* error */
82xfs_alloc_delrec(
83 xfs_btree_cur_t *cur, /* btree cursor */
84 int level, /* level removing record from */
85 int *stat) /* fail/done/go-on */
86{
87 xfs_agf_t *agf; /* allocation group freelist header */
88 xfs_alloc_block_t *block; /* btree block record/key lives in */
89 xfs_agblock_t bno; /* btree block number */
90 xfs_buf_t *bp; /* buffer for block */
91 int error; /* error return value */
92 int i; /* loop index */
93 xfs_alloc_key_t key; /* kp points here if block is level 0 */
94 xfs_agblock_t lbno; /* left block's block number */
95 xfs_buf_t *lbp; /* left block's buffer pointer */
96 xfs_alloc_block_t *left; /* left btree block */
97 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
98 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
99 int lrecs=0; /* number of records in left block */
100 xfs_alloc_rec_t *lrp; /* left block record pointer */
101 xfs_mount_t *mp; /* mount structure */
102 int ptr; /* index in btree block for this rec */
103 xfs_agblock_t rbno; /* right block's block number */
104 xfs_buf_t *rbp; /* right block's buffer pointer */
105 xfs_alloc_block_t *right; /* right btree block */
106 xfs_alloc_key_t *rkp; /* right block key pointer */
107 xfs_alloc_ptr_t *rpp; /* right block address pointer */
108 int rrecs=0; /* number of records in right block */
109 xfs_alloc_rec_t *rrp; /* right block record pointer */
110 xfs_btree_cur_t *tcur; /* temporary btree cursor */
111
112 /*
113 * Get the index of the entry being deleted, check for nothing there.
114 */
115 ptr = cur->bc_ptrs[level];
116 if (ptr == 0) {
117 *stat = 0;
118 return 0;
119 }
120 /*
121 * Get the buffer & block containing the record or key/ptr.
122 */
123 bp = cur->bc_bufs[level];
124 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
125#ifdef DEBUG
126 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
127 return error;
128#endif
129 /*
130 * Fail if we're off the end of the block.
131 */
132 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
133 *stat = 0;
134 return 0;
135 }
136 XFS_STATS_INC(xs_abt_delrec);
137 /*
138 * It's a nonleaf. Excise the key and ptr being deleted, by
139 * sliding the entries past them down one.
140 * Log the changed areas of the block.
141 */
142 if (level > 0) {
143 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
144 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
145#ifdef DEBUG
146 for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
147 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
148 return error;
149 }
150#endif
151 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
152 memmove(&lkp[ptr - 1], &lkp[ptr],
153 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */
154 memmove(&lpp[ptr - 1], &lpp[ptr],
155 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */
156 xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
157 xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
158 }
159 }
160 /*
161 * It's a leaf. Excise the record being deleted, by sliding the
162 * entries past it down one. Log the changed areas of the block.
163 */
164 else {
165 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
166 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
167 memmove(&lrp[ptr - 1], &lrp[ptr],
168 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp));
169 xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
170 }
171 /*
172 * If it's the first record in the block, we'll need a key
173 * structure to pass up to the next level (updkey).
174 */
175 if (ptr == 1) {
176 key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */
177 key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */
178 lkp = &key;
179 }
180 }
181 /*
182 * Decrement and log the number of entries in the block.
183 */
184 INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
185 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
186 /*
187 * See if the longest free extent in the allocation group was
188 * changed by this operation. True if it's the by-size btree, and
189 * this is the leaf level, and there is no right sibling block,
190 * and this was the last record.
191 */
192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
193 mp = cur->bc_mp;
194
195 if (level == 0 &&
196 cur->bc_btnum == XFS_BTNUM_CNT &&
197 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
198 ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
199 ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1);
200 /*
201 * There are still records in the block. Grab the size
202 * from the last one.
203 */
204 if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
205 rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur);
206 INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT);
207 }
208 /*
209 * No free extents left.
210 */
211 else
212 agf->agf_longest = 0;
213 mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest =
214 INT_GET(agf->agf_longest, ARCH_CONVERT);
215 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
216 XFS_AGF_LONGEST);
217 }
218 /*
219 * Is this the root level? If so, we're almost done.
220 */
221 if (level == cur->bc_nlevels - 1) {
222 /*
223 * If this is the root level,
224 * and there's only one entry left,
225 * and it's NOT the leaf level,
226 * then we can get rid of this level.
227 */
228 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
229 /*
230 * lpp is still set to the first pointer in the block.
231 * Make it the new root of the btree.
232 */
233 bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
234 INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT);
235 INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1);
236 mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--;
237 /*
238 * Put this buffer/block on the ag's freelist.
239 */
240 if ((error = xfs_alloc_put_freelist(cur->bc_tp,
241 cur->bc_private.a.agbp, NULL, bno)))
242 return error;
243 /*
244 * Since blocks move to the free list without the
245 * coordination used in xfs_bmap_finish, we can't allow
246			 * the block to be available for reallocation and
247 * non-transaction writing (user data) until we know
248 * that the transaction that moved it to the free list
249 * is permanently on disk. We track the blocks by
250 * declaring these blocks as "busy"; the busy list is
251 * maintained on a per-ag basis and each transaction
252 * records which entries should be removed when the
253 * iclog commits to disk. If a busy block is
254 * allocated, the iclog is pushed up to the LSN
255 * that freed the block.
256 */
257 xfs_alloc_mark_busy(cur->bc_tp,
258 INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
259
260 xfs_trans_agbtree_delta(cur->bc_tp, -1);
261 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
262 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
263 /*
264 * Update the cursor so there's one fewer level.
265 */
266 xfs_btree_setbuf(cur, level, NULL);
267 cur->bc_nlevels--;
268 } else if (level > 0 &&
269 (error = xfs_alloc_decrement(cur, level, &i)))
270 return error;
271 *stat = 1;
272 return 0;
273 }
274 /*
275 * If we deleted the leftmost entry in the block, update the
276 * key values above us in the tree.
277 */
278 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
279 return error;
280 /*
281 * If the number of records remaining in the block is at least
282 * the minimum, we're done.
283 */
284 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
285 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
286 return error;
287 *stat = 1;
288 return 0;
289 }
290 /*
291 * Otherwise, we have to move some records around to keep the
292 * tree balanced. Look at the left and right sibling blocks to
293 * see if we can re-balance by moving only one record.
294 */
295 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
296 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
297 bno = NULLAGBLOCK;
298 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
299 /*
300 * Duplicate the cursor so our btree manipulations here won't
301 * disrupt the next level up.
302 */
303 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
304 return error;
305 /*
306 * If there's a right sibling, see if it's ok to shift an entry
307 * out of it.
308 */
309 if (rbno != NULLAGBLOCK) {
310 /*
311 * Move the temp cursor to the last entry in the next block.
312 * Actually any entry but the first would suffice.
313 */
314 i = xfs_btree_lastrec(tcur, level);
315 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
316 if ((error = xfs_alloc_increment(tcur, level, &i)))
317 goto error0;
318 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
319 i = xfs_btree_lastrec(tcur, level);
320 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
321 /*
322 * Grab a pointer to the block.
323 */
324 rbp = tcur->bc_bufs[level];
325 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
334 /*
335 * If right block is full enough so that removing one entry
336 * won't make it too empty, and left-shifting an entry out
337 * of right to us works, we're done.
338 */
339 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
340 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_alloc_lshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
345 XFS_ALLOC_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level > 0 &&
349 (error = xfs_alloc_decrement(cur, level,
350 &i)))
351 return error;
352 *stat = 1;
353 return 0;
354 }
355 }
356 /*
357 * Otherwise, grab the number of records in right for
358 * future reference, and fix up the temp cursor to point
359 * to our block again (last record).
360 */
361 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
362 if (lbno != NULLAGBLOCK) {
363 i = xfs_btree_firstrec(tcur, level);
364 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
365 if ((error = xfs_alloc_decrement(tcur, level, &i)))
366 goto error0;
367 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
368 }
369 }
370 /*
371 * If there's a left sibling, see if it's ok to shift an entry
372 * out of it.
373 */
374 if (lbno != NULLAGBLOCK) {
375 /*
376 * Move the temp cursor to the first entry in the
377 * previous block.
378 */
379 i = xfs_btree_firstrec(tcur, level);
380 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
381 if ((error = xfs_alloc_decrement(tcur, level, &i)))
382 goto error0;
383 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
384 xfs_btree_firstrec(tcur, level);
385 /*
386 * Grab a pointer to the block.
387 */
388 lbp = tcur->bc_bufs[level];
389 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
390#ifdef DEBUG
391 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
392 goto error0;
393#endif
394 /*
395 * Grab the current block number, for future use.
396 */
397 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
398 /*
399 * If left block is full enough so that removing one entry
400 * won't make it too empty, and right-shifting an entry out
401 * of left to us works, we're done.
402 */
403 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
404 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
405 if ((error = xfs_alloc_rshift(tcur, level, &i)))
406 goto error0;
407 if (i) {
408 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
409 XFS_ALLOC_BLOCK_MINRECS(level, cur));
410 xfs_btree_del_cursor(tcur,
411 XFS_BTREE_NOERROR);
412 if (level == 0)
413 cur->bc_ptrs[0]++;
414 *stat = 1;
415 return 0;
416 }
417 }
418 /*
419		 * Otherwise, grab the number of records in left for
420 * future reference.
421 */
422 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
423 }
424 /*
425 * Delete the temp cursor, we're done with it.
426 */
427 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
428 /*
429 * If here, we need to do a join to keep the tree balanced.
430 */
431 ASSERT(bno != NULLAGBLOCK);
432 /*
433 * See if we can join with the left neighbor block.
434 */
435 if (lbno != NULLAGBLOCK &&
436 lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
437 /*
438 * Set "right" to be the starting block,
439 * "left" to be the left neighbor.
440 */
441 rbno = bno;
442 right = block;
443 rbp = bp;
444 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
445 cur->bc_private.a.agno, lbno, 0, &lbp,
446 XFS_ALLOC_BTREE_REF)))
447 return error;
448 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
449 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
450 return error;
451 }
452 /*
453 * If that won't work, see if we can join with the right neighbor block.
454 */
455 else if (rbno != NULLAGBLOCK &&
456 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
457 XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
458 /*
459 * Set "left" to be the starting block,
460 * "right" to be the right neighbor.
461 */
462 lbno = bno;
463 left = block;
464 lbp = bp;
465 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
466 cur->bc_private.a.agno, rbno, 0, &rbp,
467 XFS_ALLOC_BTREE_REF)))
468 return error;
469 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
470 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
471 return error;
472 }
473 /*
474 * Otherwise, we can't fix the imbalance.
475 * Just return. This is probably a logic error, but it's not fatal.
476 */
477 else {
478 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
479 return error;
480 *stat = 1;
481 return 0;
482 }
483 /*
484 * We're now going to join "left" and "right" by moving all the stuff
485 * in "right" to "left" and deleting "right".
486 */
487 if (level > 0) {
488 /*
489 * It's a non-leaf. Move keys and pointers.
490 */
491 lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
492 lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
493 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
494 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
495#ifdef DEBUG
496 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
497 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
498 return error;
499 }
500#endif
501 memcpy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */
502 memcpy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */
503 xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
504 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
505 xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
506 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
507 } else {
508 /*
509 * It's a leaf. Move records.
510 */
511 lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
512 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
513 memcpy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
514 xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
515 INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
516 }
517 /*
518 * If we joined with the left neighbor, set the buffer in the
519 * cursor to the left block, and fix up the index.
520 */
521 if (bp != lbp) {
522 xfs_btree_setbuf(cur, level, lbp);
523 cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
524 }
525 /*
526 * If we joined with the right neighbor and there's a level above
527 * us, increment the cursor at that level.
528 */
529 else if (level + 1 < cur->bc_nlevels &&
530 (error = xfs_alloc_increment(cur, level + 1, &i)))
531 return error;
532 /*
533 * Fix up the number of records in the surviving block.
534 */
535 INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
536 /*
537 * Fix up the right block pointer in the surviving block, and log it.
538 */
539 left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
540 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
541 /*
542 * If there is a right sibling now, make it point to the
543 * remaining block.
544 */
545 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
546 xfs_alloc_block_t *rrblock;
547 xfs_buf_t *rrbp;
548
549 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
550 cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
551 &rrbp, XFS_ALLOC_BTREE_REF)))
552 return error;
553 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
554 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
555 return error;
556 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
557 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
558 }
559 /*
560 * Free the deleting block by putting it on the freelist.
561 */
562 if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
563 NULL, rbno)))
564 return error;
565 /*
566 * Since blocks move to the free list without the coordination
567	 * used in xfs_bmap_finish, we can't allow the block to be available
568 * for reallocation and non-transaction writing (user data)
569 * until we know that the transaction that moved it to the free
570 * list is permanently on disk. We track the blocks by declaring
571 * these blocks as "busy"; the busy list is maintained on a
572 * per-ag basis and each transaction records which entries
573 * should be removed when the iclog commits to disk. If a
574 * busy block is allocated, the iclog is pushed up to the
575 * LSN that freed the block.
576 */
577 xfs_alloc_mark_busy(cur->bc_tp,
578 INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
579
580 xfs_trans_agbtree_delta(cur->bc_tp, -1);
581 /*
582 * Adjust the current level's cursor so that we're left referring
583 * to the right node, after we're done.
584 * If this leaves the ptr value 0 our caller will fix it up.
585 */
586 if (level > 0)
587 cur->bc_ptrs[level]--;
588 /*
589 * Return value means the next level up has something to do.
590 */
591 *stat = 2;
592 return 0;
593
594error0:
595 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
596 return error;
597}
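/*
 * Rebalancing summary, paraphrasing the logic above (not original
 * source text): once a block falls below XFS_ALLOC_BLOCK_MINRECS the
 * code tries, in order, to (1) borrow a record from the right sibling,
 * (2) borrow one from the left sibling, and only if neither sibling
 * can spare a record, (3) join with whichever sibling the records fit
 * into, freeing the emptied block back to the AG freelist and marking
 * it busy.
 */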
598
599/*
600 * Insert one record/level. Return information to the caller
601 * allowing the next level up to proceed if necessary.
602 */
603STATIC int /* error */
604xfs_alloc_insrec(
605 xfs_btree_cur_t *cur, /* btree cursor */
606 int level, /* level to insert record at */
607 xfs_agblock_t *bnop, /* i/o: block number inserted */
608 xfs_alloc_rec_t *recp, /* i/o: record data inserted */
609 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
610 int *stat) /* output: success/failure */
611{
612 xfs_agf_t *agf; /* allocation group freelist header */
613 xfs_alloc_block_t *block; /* btree block record/key lives in */
614 xfs_buf_t *bp; /* buffer for block */
615 int error; /* error return value */
616 int i; /* loop index */
617 xfs_alloc_key_t key; /* key value being inserted */
618 xfs_alloc_key_t *kp; /* pointer to btree keys */
619 xfs_agblock_t nbno; /* block number of allocated block */
620 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
621 xfs_alloc_key_t nkey; /* new key value, from split */
622 xfs_alloc_rec_t nrec; /* new record value, for caller */
623 int optr; /* old ptr value */
624 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
625 int ptr; /* index in btree block for this rec */
626 xfs_alloc_rec_t *rp; /* pointer to btree records */
627
628 ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0);
629 /*
630 * If we made it to the root level, allocate a new root block
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */
645 key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660#ifdef DEBUG
661 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
662 return error;
663 /*
664 * Check that the new entry is being inserted in the right place.
665 */
666 if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
667 if (level == 0) {
668 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
669 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
670 } else {
671 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
672 xfs_btree_check_key(cur->bc_btnum, &key, kp);
673 }
674 }
675#endif
676 nbno = NULLAGBLOCK;
677 ncur = (xfs_btree_cur_t *)0;
678 /*
679 * If the block is full, we can't insert the new entry until we
680 * make the block un-full.
681 */
682 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
683 /*
684 * First, try shifting an entry to the right neighbor.
685 */
686 if ((error = xfs_alloc_rshift(cur, level, &i)))
687 return error;
688 if (i) {
689 /* nothing */
690 }
691 /*
692 * Next, try shifting an entry to the left neighbor.
693 */
694 else {
695 if ((error = xfs_alloc_lshift(cur, level, &i)))
696 return error;
697 if (i)
698 optr = ptr = cur->bc_ptrs[level];
699 else {
700 /*
701 * Next, try splitting the current block in
702 * half. If this works we have to re-set our
703 * variables because we could be in a
704 * different block now.
705 */
706 if ((error = xfs_alloc_split(cur, level, &nbno,
707 &nkey, &ncur, &i)))
708 return error;
709 if (i) {
710 bp = cur->bc_bufs[level];
711 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
712#ifdef DEBUG
713 if ((error =
714 xfs_btree_check_sblock(cur,
715 block, level, bp)))
716 return error;
717#endif
718 ptr = cur->bc_ptrs[level];
719 nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */
720 nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */
721 }
722 /*
723 * Otherwise the insert fails.
724 */
725 else {
726 *stat = 0;
727 return 0;
728 }
729 }
730 }
731 }
732 /*
733 * At this point we know there's room for our new entry in the block
734 * we're pointing at.
735 */
736 if (level > 0) {
737 /*
738 * It's a non-leaf entry. Make a hole for the new data
739 * in the key and ptr regions of the block.
740 */
741 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
742 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
743#ifdef DEBUG
744 for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
745 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
746 return error;
747 }
748#endif
749 memmove(&kp[ptr], &kp[ptr - 1],
750 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */
751 memmove(&pp[ptr], &pp[ptr - 1],
752 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */
753#ifdef DEBUG
754 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
755 return error;
756#endif
757 /*
758 * Now stuff the new data in, bump numrecs and log the new data.
759 */
760 kp[ptr - 1] = key;
761 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
762 INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
763 xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
764 xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
765#ifdef DEBUG
766 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
767 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
768 kp + ptr);
769#endif
770 } else {
771 /*
772 * It's a leaf entry. Make a hole for the new record.
773 */
774 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
775 memmove(&rp[ptr], &rp[ptr - 1],
776 (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
777 /*
778 * Now stuff the new record in, bump numrecs
779 * and log the new data.
780 */
781 rp[ptr - 1] = *recp; /* INT_: struct copy */
782 INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
783 xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
784#ifdef DEBUG
785 if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
786 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
787 rp + ptr);
788#endif
789 }
790 /*
791 * Log the new number of records in the btree header.
792 */
793 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
794 /*
795 * If we inserted at the start of a block, update the parents' keys.
796 */
797 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
798 return error;
799 /*
800 * Look to see if the longest extent in the allocation group
801 * needs to be updated.
802 */
803
804 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
805 if (level == 0 &&
806 cur->bc_btnum == XFS_BTNUM_CNT &&
807 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
808 INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) {
809 /*
810 * If this is a leaf in the by-size btree and there
811 * is no right sibling block and this block is bigger
812 * than the previous longest block, update it.
813 */
814 INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT);
815 cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest
816 = INT_GET(recp->ar_blockcount, ARCH_CONVERT);
817 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
818 XFS_AGF_LONGEST);
819 }
820 /*
821 * Return the new block number, if any.
822 * If there is one, give back a record value and a cursor too.
823 */
824 *bnop = nbno;
825 if (nbno != NULLAGBLOCK) {
826 *recp = nrec; /* INT_: struct copy */
827 *curp = ncur; /* INT_: struct copy */
828 }
829 *stat = 1;
830 return 0;
831}
832
833/*
834 * Log header fields from a btree block.
835 */
836STATIC void
837xfs_alloc_log_block(
838 xfs_trans_t *tp, /* transaction pointer */
839 xfs_buf_t *bp, /* buffer containing btree block */
840 int fields) /* mask of fields: XFS_BB_... */
841{
842 int first; /* first byte offset logged */
843 int last; /* last byte offset logged */
844 static const short offsets[] = { /* table of offsets */
845 offsetof(xfs_alloc_block_t, bb_magic),
846 offsetof(xfs_alloc_block_t, bb_level),
847 offsetof(xfs_alloc_block_t, bb_numrecs),
848 offsetof(xfs_alloc_block_t, bb_leftsib),
849 offsetof(xfs_alloc_block_t, bb_rightsib),
850 sizeof(xfs_alloc_block_t)
851 };
852
853 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
854 xfs_trans_log_buf(tp, bp, first, last);
855}
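
/*
 * Illustrative sketch (not from the original source): the first/last
 * computation that xfs_btree_offsets() is assumed to perform for the
 * call above.  Given a nonzero bitmask of fields and an offsets table
 * terminated by the structure size, it yields the byte range covering
 * every set field, which xfs_trans_log_buf() then logs.  Names with a
 * sketch_ prefix are hypothetical.
 */
static void
sketch_btree_offsets(
	int		fields,		/* nonzero bitmask of fields */
	const short	*offsets,	/* offsets[i] = start of field i */
	int		nbits,		/* number of real fields */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	int		i;		/* field index */
	int		lo = -1;	/* lowest set bit */
	int		hi = -1;	/* highest set bit */

	for (i = 0; i < nbits; i++) {
		if (fields & (1 << i)) {
			if (lo < 0)
				lo = i;
			hi = i;
		}
	}
	*first = offsets[lo];		/* start of the lowest field */
	*last = offsets[hi + 1] - 1;	/* last byte of the highest field */
}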
856
857/*
858 * Log keys from a btree block (nonleaf).
859 */
860STATIC void
861xfs_alloc_log_keys(
862 xfs_btree_cur_t *cur, /* btree cursor */
863 xfs_buf_t *bp, /* buffer containing btree block */
864 int kfirst, /* index of first key to log */
865 int klast) /* index of last key to log */
866{
867 xfs_alloc_block_t *block; /* btree block to log from */
868 int first; /* first byte offset logged */
869 xfs_alloc_key_t *kp; /* key pointer in btree block */
870 int last; /* last byte offset logged */
871
872 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
873 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
874 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
875 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
876 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
877}
878
879/*
880 * Log block pointer fields from a btree block (nonleaf).
881 */
882STATIC void
883xfs_alloc_log_ptrs(
884 xfs_btree_cur_t *cur, /* btree cursor */
885 xfs_buf_t *bp, /* buffer containing btree block */
886 int pfirst, /* index of first pointer to log */
887 int plast) /* index of last pointer to log */
888{
889 xfs_alloc_block_t *block; /* btree block to log from */
890 int first; /* first byte offset logged */
891 int last; /* last byte offset logged */
892 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
893
894 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
895 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
896 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
897 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
898 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
899}
900
901/*
902 * Log records from a btree block (leaf).
903 */
904STATIC void
905xfs_alloc_log_recs(
906 xfs_btree_cur_t *cur, /* btree cursor */
907 xfs_buf_t *bp, /* buffer containing btree block */
908 int rfirst, /* index of first record to log */
909 int rlast) /* index of last record to log */
910{
911 xfs_alloc_block_t *block; /* btree block to log from */
912 int first; /* first byte offset logged */
913 int last; /* last byte offset logged */
914 xfs_alloc_rec_t *rp; /* record pointer for btree block */
915
916
917 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
918 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
919#ifdef DEBUG
920 {
921 xfs_agf_t *agf;
922 xfs_alloc_rec_t *p;
923
924 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
925 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
926 ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <=
927 INT_GET(agf->agf_length, ARCH_CONVERT));
928 }
929#endif
930 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
931 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
932 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
933}
934
935/*
936 * Lookup the record. The cursor is made to point to it, based on dir.
937 * Return 0 if we can't find any such record, 1 for success.
938 */
939STATIC int /* error */
940xfs_alloc_lookup(
941 xfs_btree_cur_t *cur, /* btree cursor */
942 xfs_lookup_t dir, /* <=, ==, or >= */
943 int *stat) /* success/failure */
944{
945 xfs_agblock_t agbno; /* a.g. relative btree block number */
946 xfs_agnumber_t agno; /* allocation group number */
947 xfs_alloc_block_t *block=NULL; /* current btree block */
948 int diff; /* difference for the current key */
949 int error; /* error return value */
950 int keyno=0; /* current key number */
951 int level; /* level in the btree */
952 xfs_mount_t *mp; /* file system mount point */
953
954 XFS_STATS_INC(xs_abt_lookup);
955 /*
956 * Get the allocation group header, and the root block number.
957 */
958 mp = cur->bc_mp;
959
960 {
961 xfs_agf_t *agf; /* a.g. freespace header */
962
963 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
964 agno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
965 agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
966 }
967 /*
968 * Iterate over each level in the btree, starting at the root.
969 * For each level above the leaves, find the key we need, based
970 * on the lookup record, then follow the corresponding block
971 * pointer down to the next level.
972 */
973 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
974 xfs_buf_t *bp; /* buffer pointer for btree block */
975 xfs_daddr_t d; /* disk address of btree block */
976
977 /*
978 * Get the disk address we're looking for.
979 */
980 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
981 /*
982 * If the old buffer at this level is for a different block,
983 * throw it away, otherwise just use it.
984 */
985 bp = cur->bc_bufs[level];
986 if (bp && XFS_BUF_ADDR(bp) != d)
987 bp = (xfs_buf_t *)0;
988 if (!bp) {
989 /*
990 * Need to get a new buffer. Read it, then
991 * set it in the cursor, releasing the old one.
992 */
993 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
994 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
995 return error;
996 xfs_btree_setbuf(cur, level, bp);
997 /*
998 * Point to the btree block, now that we have the buffer.
999 */
1000 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1001 if ((error = xfs_btree_check_sblock(cur, block, level,
1002 bp)))
1003 return error;
1004 } else
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 /*
1007 * If we already had a key match at a higher level, we know
1008 * we need to use the first entry in this block.
1009 */
1010 if (diff == 0)
1011 keyno = 1;
1012 /*
1013 * Otherwise we need to search this block. Do a binary search.
1014 */
1015 else {
1016 int high; /* high entry number */
1017 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1018 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1019 int low; /* low entry number */
1020
1021 /*
1022 * Get a pointer to keys or records.
1023 */
1024 if (level > 0)
1025 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1026 else
1027 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1028 /*
1029 * Set low and high entry numbers, 1-based.
1030 */
1031 low = 1;
1032 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
1033 /*
1034 * If the block is empty, the tree must
1035 * be an empty leaf.
1036 */
1037 ASSERT(level == 0 && cur->bc_nlevels == 1);
1038 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1039 *stat = 0;
1040 return 0;
1041 }
1042 /*
1043 * Binary search the block.
1044 */
1045 while (low <= high) {
1046 xfs_extlen_t blockcount; /* key value */
1047 xfs_agblock_t startblock; /* key value */
1048
1049 XFS_STATS_INC(xs_abt_compare);
1050 /*
1051 * keyno is average of low and high.
1052 */
1053 keyno = (low + high) >> 1;
1054 /*
1055 * Get startblock & blockcount.
1056 */
1057 if (level > 0) {
1058 xfs_alloc_key_t *kkp;
1059
1060 kkp = kkbase + keyno - 1;
1061 startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT);
1062 blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT);
1063 } else {
1064 xfs_alloc_rec_t *krp;
1065
1066 krp = krbase + keyno - 1;
1067 startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT);
1068 blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT);
1069 }
1070 /*
1071 * Compute difference to get next direction.
1072 */
1073 if (cur->bc_btnum == XFS_BTNUM_BNO)
1074 diff = (int)startblock -
1075 (int)cur->bc_rec.a.ar_startblock;
1076 else if (!(diff = (int)blockcount -
1077 (int)cur->bc_rec.a.ar_blockcount))
1078 diff = (int)startblock -
1079 (int)cur->bc_rec.a.ar_startblock;
1080 /*
1081 * Less than, move right.
1082 */
1083 if (diff < 0)
1084 low = keyno + 1;
1085 /*
1086 * Greater than, move left.
1087 */
1088 else if (diff > 0)
1089 high = keyno - 1;
1090 /*
1091 * Equal, we're done.
1092 */
1093 else
1094 break;
1095 }
1096 }
1097 /*
1098 * If there are more levels, set up for the next level
1099 * by getting the block number and filling in the cursor.
1100 */
1101 if (level > 0) {
1102 /*
1103 * If we moved left, need the previous key number,
1104 * unless there isn't one.
1105 */
1106 if (diff > 0 && --keyno < 1)
1107 keyno = 1;
1108 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
1109#ifdef DEBUG
1110 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1111 return error;
1112#endif
1113 cur->bc_ptrs[level] = keyno;
1114 }
1115 }
1116 /*
1117 * Done with the search.
1118 * See if we need to adjust the results.
1119 */
1120 if (dir != XFS_LOOKUP_LE && diff < 0) {
1121 keyno++;
1122 /*
1123 * If this is a ge search and we went off the end of the block,
1124 * but it's not the last block, we're in the wrong block.
1125 */
1126 if (dir == XFS_LOOKUP_GE &&
1127 keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1128 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1129 int i;
1130
1131 cur->bc_ptrs[0] = keyno;
1132 if ((error = xfs_alloc_increment(cur, 0, &i)))
1133 return error;
1134 XFS_WANT_CORRUPTED_RETURN(i == 1);
1135 *stat = 1;
1136 return 0;
1137 }
1138 }
1139 else if (dir == XFS_LOOKUP_LE && diff > 0)
1140 keyno--;
1141 cur->bc_ptrs[0] = keyno;
1142 /*
1143 * Return if we succeeded or not.
1144 */
1145 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
1146 *stat = 0;
1147 else
1148 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1149 return 0;
1150}
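
/*
 * Illustrative sketch (not from the original source): the 1-based
 * binary search loop above, reduced to a flat integer array so the
 * direction logic is easier to see.  diff < 0 means the probed entry
 * sorts before the search value, so the search moves right; the
 * caller then applies the LE/GE/EQ adjustments exactly as
 * xfs_alloc_lookup() does.  sketch_bsearch() is a hypothetical name.
 */
static int				/* 1-based index of final probe */
sketch_bsearch(
	const int	*vals,		/* sorted values, vals[0] is entry 1 */
	int		numrecs,	/* number of entries */
	int		want,		/* value to search for */
	int		*diffp)		/* output: final comparison result */
{
	int		low = 1;	/* low entry number */
	int		high = numrecs;	/* high entry number */
	int		keyno = 0;	/* current probe, 1-based */
	int		diff = 1;	/* comparison result */

	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = vals[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;	/* probe too small */
		else if (diff > 0)
			high = keyno - 1;	/* probe too large */
		else
			break;			/* exact match */
	}
	*diffp = diff;
	return keyno;
}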
1151
1152/*
1153 * Move 1 record left from cur/level if possible.
1154 * Update cur to reflect the new path.
1155 */
1156STATIC int /* error */
1157xfs_alloc_lshift(
1158 xfs_btree_cur_t *cur, /* btree cursor */
1159 int level, /* level to shift record on */
1160 int *stat) /* success/failure */
1161{
1162 int error; /* error return value */
1163#ifdef DEBUG
1164 int i; /* loop index */
1165#endif
1166 xfs_alloc_key_t key; /* key value for leaf level upward */
1167 xfs_buf_t *lbp; /* buffer for left neighbor block */
1168 xfs_alloc_block_t *left; /* left neighbor btree block */
1169 int nrec; /* new number of left block entries */
1170 xfs_buf_t *rbp; /* buffer for right (current) block */
1171 xfs_alloc_block_t *right; /* right (current) btree block */
1172 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1173 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1174 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1175
1176 /*
1177 * Set up variables for this block as "right".
1178 */
1179 rbp = cur->bc_bufs[level];
1180 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1181#ifdef DEBUG
1182 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1183 return error;
1184#endif
1185 /*
1186 * If we've got no left sibling then we can't shift an entry left.
1187 */
1188 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1189 *stat = 0;
1190 return 0;
1191 }
1192 /*
1193 * If the cursor entry is the one that would be moved, don't
1194 * do it... it's too complicated.
1195 */
1196 if (cur->bc_ptrs[level] <= 1) {
1197 *stat = 0;
1198 return 0;
1199 }
1200 /*
1201 * Set up the left neighbor as "left".
1202 */
1203 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1204 cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
1205 XFS_ALLOC_BTREE_REF)))
1206 return error;
1207 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1208 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1209 return error;
1210 /*
1211 * If it's full, it can't take another entry.
1212 */
1213 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1214 *stat = 0;
1215 return 0;
1216 }
1217 nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1218 /*
1219 * If non-leaf, copy a key and a ptr to the left block.
1220 */
1221 if (level > 0) {
1222 xfs_alloc_key_t *lkp; /* key pointer for left block */
1223 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1224
1225 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
1226 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1227 *lkp = *rkp;
1228 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1229 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1230 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1231#ifdef DEBUG
1232 if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
1233 return error;
1234#endif
1235 *lpp = *rpp; /* INT_: copy */
1236 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1237 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1238 }
1239 /*
1240 * If leaf, copy a record to the left block.
1241 */
1242 else {
1243 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1244
1245 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
1246 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1247 *lrp = *rrp;
1248 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1249 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1250 }
1251 /*
1252 * Bump and log left's numrecs, decrement and log right's numrecs.
1253 */
1254 INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
1255 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1256 INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
1257 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1258 /*
1259 * Slide the contents of right down one entry.
1260 */
1261 if (level > 0) {
1262#ifdef DEBUG
1263 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1264 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1265 level)))
1266 return error;
1267 }
1268#endif
1269 memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1270 memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1271 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1272 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1273 } else {
1274 memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1275 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1276 key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1277 key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1278 rkp = &key;
1279 }
1280 /*
1281 * Update the parent key values of right.
1282 */
1283 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1284 return error;
1285 /*
1286 * Slide the cursor value left one.
1287 */
1288 cur->bc_ptrs[level]--;
1289 *stat = 1;
1290 return 0;
1291}
1292
1293/*
1294 * Allocate a new root block, fill it in.
1295 */
1296STATIC int /* error */
1297xfs_alloc_newroot(
1298 xfs_btree_cur_t *cur, /* btree cursor */
1299 int *stat) /* success/failure */
1300{
1301 int error; /* error return value */
1302 xfs_agblock_t lbno; /* left block number */
1303 xfs_buf_t *lbp; /* left btree buffer */
1304 xfs_alloc_block_t *left; /* left btree block */
1305 xfs_mount_t *mp; /* mount structure */
1306 xfs_agblock_t nbno; /* new block number */
1307 xfs_buf_t *nbp; /* new (root) buffer */
1308 xfs_alloc_block_t *new; /* new (root) btree block */
1309 int nptr; /* new value for key index, 1 or 2 */
1310 xfs_agblock_t rbno; /* right block number */
1311 xfs_buf_t *rbp; /* right btree buffer */
1312 xfs_alloc_block_t *right; /* right btree block */
1313
1314 mp = cur->bc_mp;
1315
1316 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
1317 /*
1318 * Get a buffer from the freelist blocks, for the new root.
1319 */
1320 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
1321 &nbno)))
1322 return error;
1323 /*
1324 * None available, we fail.
1325 */
1326 if (nbno == NULLAGBLOCK) {
1327 *stat = 0;
1328 return 0;
1329 }
1330 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1331 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1332 0);
1333 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1334 /*
1335 * Set the root data in the a.g. freespace structure.
1336 */
1337 {
1338 xfs_agf_t *agf; /* a.g. freespace header */
1339 xfs_agnumber_t seqno;
1340
1341 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1342 INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno);
1343 INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1);
1344 seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
1345 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1346 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1347 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1348 }
1349 /*
1350 * At the previous root level there are now two blocks: the old
1351 * root, and the new block generated when it was split.
1352 * We don't know which one the cursor is pointing at, so we
1353 * set up variables "left" and "right" for each case.
1354 */
1355 lbp = cur->bc_bufs[cur->bc_nlevels - 1];
1356 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1357#ifdef DEBUG
1358 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
1359 return error;
1360#endif
1361 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1362 /*
1363 * Our block is left, pick up the right block.
1364 */
1365 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1366 rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
1367 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1368 cur->bc_private.a.agno, rbno, 0, &rbp,
1369 XFS_ALLOC_BTREE_REF)))
1370 return error;
1371 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1372 if ((error = xfs_btree_check_sblock(cur, right,
1373 cur->bc_nlevels - 1, rbp)))
1374 return error;
1375 nptr = 1;
1376 } else {
1377 /*
1378 * Our block is right, pick up the left block.
1379 */
1380 rbp = lbp;
1381 right = left;
1382 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1383 lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
1384 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1385 cur->bc_private.a.agno, lbno, 0, &lbp,
1386 XFS_ALLOC_BTREE_REF)))
1387 return error;
1388 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1389 if ((error = xfs_btree_check_sblock(cur, left,
1390 cur->bc_nlevels - 1, lbp)))
1391 return error;
1392 nptr = 2;
1393 }
1394 /*
1395 * Fill in the new block's btree header and log it.
1396 */
1397 INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1398 INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
1399 INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
1400 INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
1401 INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
1402 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1403 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1404 /*
1405 * Fill in the key data in the new root.
1406 */
1407 {
1408 xfs_alloc_key_t *kp; /* btree key pointer */
1409
1410 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
1411 if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
1412 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
1413 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
1414 } else {
1415 xfs_alloc_rec_t *rp; /* btree record pointer */
1416
1417 rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
1418 kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
1419 kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
1420 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1421 kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
1422 kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
1423 }
1424 }
1425 xfs_alloc_log_keys(cur, nbp, 1, 2);
1426 /*
1427 * Fill in the pointer data in the new root.
1428 */
1429 {
1430 xfs_alloc_ptr_t *pp; /* btree address pointer */
1431
1432 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
1433 INT_SET(pp[0], ARCH_CONVERT, lbno);
1434 INT_SET(pp[1], ARCH_CONVERT, rbno);
1435 }
1436 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1437 /*
1438 * Fix up the cursor.
1439 */
1440 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1441 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1442 cur->bc_nlevels++;
1443 *stat = 1;
1444 return 0;
1445}
1446
1447/*
1448 * Move 1 record right from cur/level if possible.
1449 * Update cur to reflect the new path.
1450 */
1451STATIC int /* error */
1452xfs_alloc_rshift(
1453 xfs_btree_cur_t *cur, /* btree cursor */
1454 int level, /* level to shift record on */
1455 int *stat) /* success/failure */
1456{
1457 int error; /* error return value */
1458 int i; /* loop index */
1459 xfs_alloc_key_t key; /* key value for leaf level upward */
1460 xfs_buf_t *lbp; /* buffer for left (current) block */
1461 xfs_alloc_block_t *left; /* left (current) btree block */
1462 xfs_buf_t *rbp; /* buffer for right neighbor block */
1463 xfs_alloc_block_t *right; /* right neighbor btree block */
1464 xfs_alloc_key_t *rkp; /* key pointer for right block */
1465 xfs_btree_cur_t *tcur; /* temporary cursor */
1466
1467 /*
1468 * Set up variables for this block as "left".
1469 */
1470 lbp = cur->bc_bufs[level];
1471 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1472#ifdef DEBUG
1473 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1474 return error;
1475#endif
1476 /*
1477 * If we've got no right sibling then we can't shift an entry right.
1478 */
1479 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1480 *stat = 0;
1481 return 0;
1482 }
1483 /*
1484 * If the cursor entry is the one that would be moved, don't
1485 * do it... it's too complicated.
1486 */
1487 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1488 *stat = 0;
1489 return 0;
1490 }
1491 /*
1492 * Set up the right neighbor as "right".
1493 */
1494 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1495 cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
1496 XFS_ALLOC_BTREE_REF)))
1497 return error;
1498 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1499 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1500 return error;
1501 /*
1502 * If it's full, it can't take another entry.
1503 */
1504 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1505 *stat = 0;
1506 return 0;
1507 }
1508 /*
1509 * Make a hole at the start of the right neighbor block, then
1510 * copy the last left block entry to the hole.
1511 */
1512 if (level > 0) {
1513 xfs_alloc_key_t *lkp; /* key pointer for left block */
1514 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1515 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1516
1517 lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1518 lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1519 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1520 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1521#ifdef DEBUG
1522 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1523 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
1524 return error;
1525 }
1526#endif
1527 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1528 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1529#ifdef DEBUG
1530 if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
1531 return error;
1532#endif
1533 *rkp = *lkp; /* INT_: copy */
1534 *rpp = *lpp; /* INT_: copy */
1535 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1536 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1537 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1538 } else {
1539 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1540 xfs_alloc_rec_t *rrp; /* record pointer for right block */
1541
1542 lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1543 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1544 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1545 *rrp = *lrp;
1546 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1547 key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1548 key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1549 rkp = &key;
1550 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1551 }
1552 /*
1553 * Decrement and log left's numrecs, bump and log right's numrecs.
1554 */
1555 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1556 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1557 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1558 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1559 /*
1560 * Using a temporary cursor, update the parent key values of the
1561 * block on the right.
1562 */
1563 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1564 return error;
1565 i = xfs_btree_lastrec(tcur, level);
1566 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1567 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1568 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1569 goto error0;
1570 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1571 *stat = 1;
1572 return 0;
1573error0:
1574 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1575 return error;
1576}
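
/*
 * Illustrative sketch (not from the original source): the hole-opening
 * idiom used above.  memmove() tolerates the overlapping ranges, so a
 * single call slides numrecs entries up one slot and frees index 0 for
 * the entry arriving from the left sibling.  Plain ints stand in for
 * the key/ptr/record types; sketch_open_hole() is a hypothetical name.
 */
#include <string.h>

static void
sketch_open_hole(
	int		*entries,	/* array with room for numrecs + 1 */
	int		numrecs,	/* entries currently in use */
	int		newval)		/* value to place in the hole */
{
	memmove(entries + 1, entries, numrecs * sizeof(*entries));
	entries[0] = newval;
}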
1577
1578/*
1579 * Split cur/level block in half.
1580 * Return new block number and its first record (to be inserted into parent).
1581 */
1582STATIC int /* error */
1583xfs_alloc_split(
1584 xfs_btree_cur_t *cur, /* btree cursor */
1585 int level, /* level to split */
1586 xfs_agblock_t *bnop, /* output: block number allocated */
1587 xfs_alloc_key_t *keyp, /* output: first key of new block */
1588 xfs_btree_cur_t **curp, /* output: new cursor */
1589 int *stat) /* success/failure */
1590{
1591 int error; /* error return value */
1592 int i; /* loop index/record number */
1593 xfs_agblock_t lbno; /* left (current) block number */
1594 xfs_buf_t *lbp; /* buffer for left block */
1595 xfs_alloc_block_t *left; /* left (current) btree block */
1596 xfs_agblock_t rbno; /* right (new) block number */
1597 xfs_buf_t *rbp; /* buffer for right block */
1598 xfs_alloc_block_t *right; /* right (new) btree block */
1599
1600 /*
1601 * Allocate the new block from the freelist.
1602 * If we can't do it, we're toast. Give up.
1603 */
1604 if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
1605 &rbno)))
1606 return error;
1607 if (rbno == NULLAGBLOCK) {
1608 *stat = 0;
1609 return 0;
1610 }
1611 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1612 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1613 rbno, 0);
1614 /*
1615 * Set up the new block as "right".
1616 */
1617 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1618 /*
1619 * "Left" is the current (according to the cursor) block.
1620 */
1621 lbp = cur->bc_bufs[level];
1622 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1623#ifdef DEBUG
1624 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1625 return error;
1626#endif
1627 /*
1628 * Fill in the btree header for the new block.
1629 */
1630 INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1631 right->bb_level = left->bb_level; /* INT_: direct copy */
1632 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1633 /*
1634 * Make sure that if there's an odd number of entries now,
1635 * each new block will have the same number of entries.
1636 */
1637 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1638 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1639 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1640 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1641 /*
1642 * For non-leaf blocks, copy keys and addresses over to the new block.
1643 */
1644 if (level > 0) {
1645 xfs_alloc_key_t *lkp; /* left btree key pointer */
1646 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1647 xfs_alloc_key_t *rkp; /* right btree key pointer */
1648 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1649
1650 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1651 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1652 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1653 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1654#ifdef DEBUG
1655 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1656 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
1657 return error;
1658 }
1659#endif
1660 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */
1661 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp)); /* INT_: copy */
1662 xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1663 xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1664 *keyp = *rkp;
1665 }
1666 /*
1667 * For leaf blocks, copy records over to the new block.
1668 */
1669 else {
1670 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1671 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1672
1673 lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
1674 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1675 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1676 xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1677 keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
1678 keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
1679 }
1680 /*
1681 * Find the left block number by looking in the buffer.
1682 * Adjust numrecs, sibling pointers.
1683 */
1684 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1685 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1686 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1687 INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno);
1688 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1689 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1690 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1691 /*
1692 * If there's a block to the new block's right, make that block
1693 * point back to right instead of to left.
1694 */
1695 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1696 xfs_alloc_block_t *rrblock; /* rr btree block */
1697 xfs_buf_t *rrbp; /* buffer for rrblock */
1698
1699 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1700 cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0,
1701 &rrbp, XFS_ALLOC_BTREE_REF)))
1702 return error;
1703 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
1704 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1705 return error;
1706 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno);
1707 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
1708 }
1709 /*
1710 * If the cursor is really in the right block, move it there.
1711 * If it's just pointing past the last entry in left, then we'll
1712 * insert there, so don't change anything in that case.
1713 */
1714 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1715 xfs_btree_setbuf(cur, level, rbp);
1716 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1717 }
1718 /*
1719 * If there are more levels, we'll need another cursor which refers to
1720 * the right block, no matter where this cursor was.
1721 */
1722 if (level + 1 < cur->bc_nlevels) {
1723 if ((error = xfs_btree_dup_cursor(cur, curp)))
1724 return error;
1725 (*curp)->bc_ptrs[level + 1]++;
1726 }
1727 *bnop = rbno;
1728 *stat = 1;
1729 return 0;
1730}
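
/*
 * Illustrative sketch (not from the original source): the split-point
 * arithmetic above.  Half the records move to the new right block;
 * with an odd count, the extra record goes to whichever side the
 * pending insert will NOT land in, so both blocks hold equal counts
 * once the insert completes.  sketch_split_count() is a hypothetical
 * name; cur_ptr is the 1-based insertion index in the left block.
 */
static int				/* records to move right */
sketch_split_count(
	int		left_numrecs,	/* records now in the left block */
	int		cur_ptr)	/* 1-based insertion index */
{
	int		right_numrecs;	/* records for the right block */

	right_numrecs = left_numrecs / 2;
	if ((left_numrecs & 1) && cur_ptr <= right_numrecs + 1)
		right_numrecs++;
	return right_numrecs;
}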
1731
1732/*
1733 * Update keys at all levels from here to the root along the cursor's path.
1734 */
1735STATIC int /* error */
1736xfs_alloc_updkey(
1737 xfs_btree_cur_t *cur, /* btree cursor */
1738 xfs_alloc_key_t *keyp, /* new key value to update to */
1739 int level) /* starting level for update */
1740{
1741 int ptr; /* index of key in block */
1742
1743 /*
1744 * Go up the tree from this level toward the root.
1745 * At each level, update the key value to the value input.
1746 * Stop when we reach a level where the cursor isn't pointing
1747 * at the first entry in the block.
1748 */
1749 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1750 xfs_alloc_block_t *block; /* btree block */
1751 xfs_buf_t *bp; /* buffer for block */
1752#ifdef DEBUG
1753 int error; /* error return value */
1754#endif
1755 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1756
1757 bp = cur->bc_bufs[level];
1758 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1759#ifdef DEBUG
1760 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1761 return error;
1762#endif
1763 ptr = cur->bc_ptrs[level];
1764 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1765 *kp = *keyp;
1766 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1767 }
1768 return 0;
1769}
1770
1771/*
1772 * Externally visible routines.
1773 */
1774
1775/*
1776 * Decrement cursor by one record at the level.
1777 * For nonzero levels the leaf-ward information is untouched.
1778 */
1779int /* error */
1780xfs_alloc_decrement(
1781 xfs_btree_cur_t *cur, /* btree cursor */
1782 int level, /* level in btree, 0 is leaf */
1783 int *stat) /* success/failure */
1784{
1785 xfs_alloc_block_t *block; /* btree block */
1786 int error; /* error return value */
1787 int lev; /* btree level */
1788
1789 ASSERT(level < cur->bc_nlevels);
1790 /*
1791 * Read-ahead to the left at this level.
1792 */
1793 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1794 /*
1795 * Decrement the ptr at this level. If we're still in the block
1796 * then we're done.
1797 */
1798 if (--cur->bc_ptrs[level] > 0) {
1799 *stat = 1;
1800 return 0;
1801 }
1802 /*
1803 * Get a pointer to the btree block.
1804 */
1805 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1806#ifdef DEBUG
1807 if ((error = xfs_btree_check_sblock(cur, block, level,
1808 cur->bc_bufs[level])))
1809 return error;
1810#endif
1811 /*
1812 * If we just went off the left edge of the tree, return failure.
1813 */
1814 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1815 *stat = 0;
1816 return 0;
1817 }
1818 /*
1819 * March up the tree decrementing pointers.
1820 * Stop when we don't go off the left edge of a block.
1821 */
1822 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1823 if (--cur->bc_ptrs[lev] > 0)
1824 break;
1825 /*
1826 * Read-ahead the left block; we're going to read it
1827 * in the next loop.
1828 */
1829 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1830 }
1831 /*
1832 * If we went off the root then we are seriously confused.
1833 */
1834 ASSERT(lev < cur->bc_nlevels);
1835 /*
1836 * Now walk back down the tree, fixing up the cursor's buffer
1837 * pointers and key numbers.
1838 */
1839 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1840 xfs_agblock_t agbno; /* block number of btree block */
1841 xfs_buf_t *bp; /* buffer pointer for block */
1842
1843 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1844 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1845 cur->bc_private.a.agno, agbno, 0, &bp,
1846 XFS_ALLOC_BTREE_REF)))
1847 return error;
1848 lev--;
1849 xfs_btree_setbuf(cur, lev, bp);
1850 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1851 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1852 return error;
1853 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1854 }
1855 *stat = 1;
1856 return 0;
1857}
1858
1859/*
1860 * Delete the record pointed to by cur.
1861 * The cursor refers to the place where the record was (could be inserted)
1862 * when the operation returns.
1863 */
1864int /* error */
1865xfs_alloc_delete(
1866 xfs_btree_cur_t *cur, /* btree cursor */
1867 int *stat) /* success/failure */
1868{
1869 int error; /* error return value */
1870 int i; /* result code */
1871 int level; /* btree level */
1872
1873 /*
1874 * Go up the tree, starting at leaf level.
1875 * If 2 is returned then a join was done; go to the next level.
1876 * Otherwise we are done.
1877 */
1878 for (level = 0, i = 2; i == 2; level++) {
1879 if ((error = xfs_alloc_delrec(cur, level, &i)))
1880 return error;
1881 }
1882 if (i == 0) {
1883 for (level = 1; level < cur->bc_nlevels; level++) {
1884 if (cur->bc_ptrs[level] == 0) {
1885 if ((error = xfs_alloc_decrement(cur, level, &i)))
1886 return error;
1887 break;
1888 }
1889 }
1890 }
1891 *stat = i;
1892 return 0;
1893}
1894
1895/*
1896 * Get the data from the pointed-to record.
1897 */
1898int /* error */
1899xfs_alloc_get_rec(
1900 xfs_btree_cur_t *cur, /* btree cursor */
1901 xfs_agblock_t *bno, /* output: starting block of extent */
1902 xfs_extlen_t *len, /* output: length of extent */
1903 int *stat) /* output: success/failure */
1904{
1905 xfs_alloc_block_t *block; /* btree block */
1906#ifdef DEBUG
1907 int error; /* error return value */
1908#endif
1909 int ptr; /* record number */
1910
1911 ptr = cur->bc_ptrs[0];
1912 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
1913#ifdef DEBUG
1914 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
1915 return error;
1916#endif
1917 /*
1918 * Off the right end or left end, return failure.
1919 */
1920 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
1921 *stat = 0;
1922 return 0;
1923 }
1924 /*
1925 * Point to the record and extract its data.
1926 */
1927 {
1928 xfs_alloc_rec_t *rec; /* record data */
1929
1930 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1931 *bno = INT_GET(rec->ar_startblock, ARCH_CONVERT);
1932 *len = INT_GET(rec->ar_blockcount, ARCH_CONVERT);
1933 }
1934 *stat = 1;
1935 return 0;
1936}
1937
1938/*
1939 * Increment cursor by one record at the level.
1940 * For nonzero levels the leaf-ward information is untouched.
1941 */
1942int /* error */
1943xfs_alloc_increment(
1944 xfs_btree_cur_t *cur, /* btree cursor */
1945 int level, /* level in btree, 0 is leaf */
1946 int *stat) /* success/failure */
1947{
1948 xfs_alloc_block_t *block; /* btree block */
1949 xfs_buf_t *bp; /* tree block buffer */
1950 int error; /* error return value */
1951 int lev; /* btree level */
1952
1953 ASSERT(level < cur->bc_nlevels);
1954 /*
1955 * Read-ahead to the right at this level.
1956 */
1957 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1958 /*
1959 * Get a pointer to the btree block.
1960 */
1961 bp = cur->bc_bufs[level];
1962 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1963#ifdef DEBUG
1964 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1965 return error;
1966#endif
1967 /*
1968 * Increment the ptr at this level. If we're still in the block
1969 * then we're done.
1970 */
1971 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1972 *stat = 1;
1973 return 0;
1974 }
1975 /*
1976 * If we just went off the right edge of the tree, return failure.
1977 */
1978 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1979 *stat = 0;
1980 return 0;
1981 }
1982 /*
1983 * March up the tree incrementing pointers.
1984 * Stop when we don't go off the right edge of a block.
1985 */
1986 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1987 bp = cur->bc_bufs[lev];
1988 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1989#ifdef DEBUG
1990 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1991 return error;
1992#endif
1993 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
1994 break;
1995 /*
1996 * Read-ahead the right block; we're going to read it
1997 * in the next loop.
1998 */
1999 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2000 }
2001 /*
2002 * If we went off the root then we are seriously confused.
2003 */
2004 ASSERT(lev < cur->bc_nlevels);
2005 /*
2006 * Now walk back down the tree, fixing up the cursor's buffer
2007 * pointers and key numbers.
2008 */
2009 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2010 lev > level; ) {
2011 xfs_agblock_t agbno; /* block number of btree block */
2012
2013 agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
2014 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2015 cur->bc_private.a.agno, agbno, 0, &bp,
2016 XFS_ALLOC_BTREE_REF)))
2017 return error;
2018 lev--;
2019 xfs_btree_setbuf(cur, lev, bp);
2020 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2021 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2022 return error;
2023 cur->bc_ptrs[lev] = 1;
2024 }
2025 *stat = 1;
2026 return 0;
2027}
2028
2029/*
2030 * Insert the current record at the point referenced by cur.
2031 * The cursor may be inconsistent on return if splits have been done.
2032 */
2033int /* error */
2034xfs_alloc_insert(
2035 xfs_btree_cur_t *cur, /* btree cursor */
2036 int *stat) /* success/failure */
2037{
2038 int error; /* error return value */
2039 int i; /* result value, 0 for failure */
2040 int level; /* current level number in btree */
2041 xfs_agblock_t nbno; /* new block number (split result) */
2042 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2043 xfs_alloc_rec_t nrec; /* record being inserted this level */
2044 xfs_btree_cur_t *pcur; /* previous level's cursor */
2045
2046 level = 0;
2047 nbno = NULLAGBLOCK;
2048 INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock);
2049 INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount);
2050 ncur = (xfs_btree_cur_t *)0;
2051 pcur = cur;
2052 /*
2053 * Loop going up the tree, starting at the leaf level.
2054 * Stop when we don't get a split block, that must mean that
2055 * the insert is finished with this level.
2056 */
2057 do {
2058 /*
2059 * Insert nrec/nbno into this level of the tree.
2060 * Note if we fail, nbno will be null.
2061 */
2062 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2063 &i))) {
2064 if (pcur != cur)
2065 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2066 return error;
2067 }
2068 /*
2069 * See if the cursor we just used is trash.
2070 * Can't trash the caller's cursor, but otherwise we should
2071 * trash it if ncur is a new cursor or we're about to be done.
2072 */
2073 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2074 cur->bc_nlevels = pcur->bc_nlevels;
2075 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2076 }
2077 /*
2078 * If we got a new cursor, switch to it.
2079 */
2080 if (ncur) {
2081 pcur = ncur;
2082 ncur = (xfs_btree_cur_t *)0;
2083 }
2084 } while (nbno != NULLAGBLOCK);
2085 *stat = i;
2086 return 0;
2087}
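
/*
 * Illustrative sketch (not from the original source): the usual
 * pairing of a lookup with an insert.  xfs_alloc_lookup_eq() records
 * [bno, len] in cur->bc_rec and positions the cursor;
 * xfs_alloc_insert() then inserts that record, splitting blocks as
 * needed.  Error handling is abbreviated and "cur" is assumed already
 * initialized on a freespace btree; sketch_insert_extent() is a
 * hypothetical name.
 */
static int				/* error */
sketch_insert_extent(
	xfs_btree_cur_t	*cur,		/* initialized btree cursor */
	xfs_agblock_t	bno,		/* starting block of extent */
	xfs_extlen_t	len)		/* length of extent */
{
	int		error;		/* error return value */
	int		i;		/* success/failure */

	if ((error = xfs_alloc_lookup_eq(cur, bno, len, &i)))
		return error;
	ASSERT(i == 0);			/* no duplicate record expected */
	if ((error = xfs_alloc_insert(cur, &i)))
		return error;
	ASSERT(i == 1);			/* insert is expected to succeed */
	return 0;
}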
2088
2089/*
2090 * Lookup the record equal to [bno, len] in the btree given by cur.
2091 */
2092int /* error */
2093xfs_alloc_lookup_eq(
2094 xfs_btree_cur_t *cur, /* btree cursor */
2095 xfs_agblock_t bno, /* starting block of extent */
2096 xfs_extlen_t len, /* length of extent */
2097 int *stat) /* success/failure */
2098{
2099 cur->bc_rec.a.ar_startblock = bno;
2100 cur->bc_rec.a.ar_blockcount = len;
2101 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2102}
2103
2104/*
2105 * Lookup the first record greater than or equal to [bno, len]
2106 * in the btree given by cur.
2107 */
2108int /* error */
2109xfs_alloc_lookup_ge(
2110 xfs_btree_cur_t *cur, /* btree cursor */
2111 xfs_agblock_t bno, /* starting block of extent */
2112 xfs_extlen_t len, /* length of extent */
2113 int *stat) /* success/failure */
2114{
2115 cur->bc_rec.a.ar_startblock = bno;
2116 cur->bc_rec.a.ar_blockcount = len;
2117 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2118}
2119
2120/*
2121 * Lookup the first record less than or equal to [bno, len]
2122 * in the btree given by cur.
2123 */
2124int /* error */
2125xfs_alloc_lookup_le(
2126 xfs_btree_cur_t *cur, /* btree cursor */
2127 xfs_agblock_t bno, /* starting block of extent */
2128 xfs_extlen_t len, /* length of extent */
2129 int *stat) /* success/failure */
2130{
2131 cur->bc_rec.a.ar_startblock = bno;
2132 cur->bc_rec.a.ar_blockcount = len;
2133 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2134}
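
/*
 * Illustrative sketch (not from the original source): reading a
 * record back after a lookup.  On the by-size (cnt) btree, a
 * ge-lookup on [0, minlen] positions the cursor at the first free
 * extent at least minlen blocks long, and xfs_alloc_get_rec()
 * extracts it.  Error handling is abbreviated;
 * sketch_find_extent() is a hypothetical name.
 */
static int				/* error */
sketch_find_extent(
	xfs_btree_cur_t	*cur,		/* cursor on the cnt btree */
	xfs_extlen_t	minlen,		/* minimum acceptable length */
	xfs_agblock_t	*bno,		/* output: starting block */
	xfs_extlen_t	*len,		/* output: extent length */
	int		*stat)		/* output: success/failure */
{
	int		error;		/* error return value */

	if ((error = xfs_alloc_lookup_ge(cur, 0, minlen, stat)))
		return error;
	if (!*stat)
		return 0;		/* nothing long enough */
	return xfs_alloc_get_rec(cur, bno, len, stat);
}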
2135
2136/*
2137 * Update the record referred to by cur, to the value given by [bno, len].
2138 * This either works (return 0) or gets an EFSCORRUPTED error.
2139 */
2140int /* error */
2141xfs_alloc_update(
2142 xfs_btree_cur_t *cur, /* btree cursor */
2143 xfs_agblock_t bno, /* starting block of extent */
2144 xfs_extlen_t len) /* length of extent */
2145{
2146 xfs_alloc_block_t *block; /* btree block to update */
2147 int error; /* error return value */
2148 int ptr; /* current record number (updating) */
2149
2150 ASSERT(len > 0);
2151 /*
2152 * Pick up the current btree block.
2153 */
2154 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2155#ifdef DEBUG
2156 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2157 return error;
2158#endif
2159 /*
2160 * Get the address of the rec to be updated.
2161 */
2162 ptr = cur->bc_ptrs[0];
2163 {
2164 xfs_alloc_rec_t *rp; /* pointer to updated record */
2165
2166 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2167 /*
2168 * Fill in the new contents and log them.
2169 */
2170 INT_SET(rp->ar_startblock, ARCH_CONVERT, bno);
2171 INT_SET(rp->ar_blockcount, ARCH_CONVERT, len);
2172 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2173 }
2174 /*
2175 * If it's the by-size btree and it's the last leaf block and
2176 * it's the last record... then update the size of the longest
2177 * extent in the a.g., which we cache in the a.g. freelist header.
2178 */
2179 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2180 INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
2181 ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
2182 xfs_agf_t *agf; /* a.g. freespace header */
2183 xfs_agnumber_t seqno;
2184
2185 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2186 seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
2187 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2188 INT_SET(agf->agf_longest, ARCH_CONVERT, len);
2189 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2190 XFS_AGF_LONGEST);
2191 }
2192 /*
2193 * Updating first record in leaf. Pass new key value up to our parent.
2194 */
2195 if (ptr == 1) {
2196 xfs_alloc_key_t key; /* key containing [bno, len] */
2197
2198 INT_SET(key.ar_startblock, ARCH_CONVERT, bno);
2199 INT_SET(key.ar_blockcount, ARCH_CONVERT, len);
2200 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2201 return error;
2202 }
2203 return 0;
2204}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
new file mode 100644
index 000000000000..ed5161a572ef
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -0,0 +1,257 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ALLOC_BTREE_H__
33#define __XFS_ALLOC_BTREE_H__
34
35/*
36 * Freespace on-disk structures
37 */
38
39struct xfs_buf;
40struct xfs_btree_cur;
41struct xfs_btree_sblock;
42struct xfs_mount;
43
44/*
45 * There are two on-disk btrees, one sorted by blockno and one sorted
46 * by blockcount and blockno. All blocks look the same to make the code
47 * simpler; if we have time later, we'll make the optimizations.
48 */
49#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
50#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
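
/*
 * Illustrative note (not from the original source): the magic values
 * are just the ASCII bytes of the tree name, e.g.
 * ('A' << 24 | 'B' << 16 | 'T' << 8 | 'B') == 0x41425442.
 */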
51
52/*
53 * Data record/key structure
54 */
55typedef struct xfs_alloc_rec
56{
57 xfs_agblock_t ar_startblock; /* starting block number */
58 xfs_extlen_t ar_blockcount; /* count of free blocks */
59} xfs_alloc_rec_t, xfs_alloc_key_t;
60
61typedef xfs_agblock_t xfs_alloc_ptr_t; /* btree pointer type */
62 /* btree block header type */
63typedef struct xfs_btree_sblock xfs_alloc_block_t;
64
65#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
66xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp);
67#define XFS_BUF_TO_ALLOC_BLOCK(bp) xfs_buf_to_alloc_block(bp)
68#else
69#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp)))
70#endif
71
72/*
73 * Real block structures have a size equal to the disk block size.
74 */
75
76#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE)
77int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur);
78#define XFS_ALLOC_BLOCK_SIZE(lev,cur) xfs_alloc_block_size(lev,cur)
79#else
80#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
81#endif
82
83#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
84int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur);
85#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) xfs_alloc_block_maxrecs(lev,cur)
86#else
87#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) \
88 ((cur)->bc_mp->m_alloc_mxr[lev != 0])
89#endif
90#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
91int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur);
92#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) xfs_alloc_block_minrecs(lev,cur)
93#else
94#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) \
95 ((cur)->bc_mp->m_alloc_mnr[lev != 0])
96#endif
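
/*
 * Illustrative sketch (not from the original source): how per-level
 * entry limits such as m_alloc_mxr[] are conventionally derived.  A
 * leaf spends its post-header space on whole records, a node on
 * key/pointer pairs.  The real values are computed at mount time; the
 * hdrsize parameter stands in for the btree block header size, so
 * treat this as an assumption for illustration only.
 */
static int
sketch_maxrecs(int blocksize, int hdrsize, int leaf)
{
	int	space = blocksize - hdrsize;

	if (leaf)
		return space / (int)sizeof(xfs_alloc_rec_t);
	return space / (int)(sizeof(xfs_alloc_key_t) +
			     sizeof(xfs_alloc_ptr_t));
}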
97
98/*
99 * Minimum and maximum blocksize and sectorsize.
100 * The blocksize upper limit is pretty much arbitrary.
101 * The sectorsize upper limit is due to sizeof(sb_sectsize).
102 */
103#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */
104#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */
105#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG)
106#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG)
107#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */
108#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */
109#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG)
110#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
111
112/*
113 * Block numbers in the AG:
114 * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
115 */
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK)
117xfs_agblock_t xfs_bno_block(struct xfs_mount *mp);
118#define XFS_BNO_BLOCK(mp) xfs_bno_block(mp)
119#else
120#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
121#endif
122#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK)
123xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp);
124#define XFS_CNT_BLOCK(mp) xfs_cnt_block(mp)
125#else
126#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
127#endif
128
129/*
130 * Record, key, and pointer address macros for btree blocks.
131 */
132#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR)
133xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i,
134 struct xfs_btree_cur *cur);
135#define XFS_ALLOC_REC_ADDR(bb,i,cur) xfs_alloc_rec_addr(bb,i,cur)
136#else
137#define XFS_ALLOC_REC_ADDR(bb,i,cur) \
138 XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \
139 XFS_ALLOC_BLOCK_MAXRECS(0, cur))
140#endif
141
142#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR)
143xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i,
144 struct xfs_btree_cur *cur);
145#define XFS_ALLOC_KEY_ADDR(bb,i,cur) xfs_alloc_key_addr(bb,i,cur)
146#else
147#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
148 XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
149 XFS_ALLOC_BLOCK_MAXRECS(1, cur))
150#endif
151
152#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR)
153xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i,
154 struct xfs_btree_cur *cur);
155#define XFS_ALLOC_PTR_ADDR(bb,i,cur) xfs_alloc_ptr_addr(bb,i,cur)
156#else
157#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
158 XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
159 XFS_ALLOC_BLOCK_MAXRECS(1, cur))
160#endif
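
/*
 * Illustrative sketch (not from the original source): the address
 * arithmetic the XFS_BTREE_*_ADDR macros are assumed to expand to,
 * shown for leaf records.  Indices are 1-based to match the macros
 * above; hdrsize stands in for the xfs_btree_sblock header size,
 * which is defined elsewhere.  sketch_rec_addr() is hypothetical.
 */
static xfs_alloc_rec_t *
sketch_rec_addr(xfs_alloc_block_t *bb, int i, int hdrsize)
{
	return (xfs_alloc_rec_t *)((char *)bb + hdrsize) + (i - 1);
}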
161
162/*
163 * Prototypes for externally visible routines.
164 */
165
166/*
167 * Decrement cursor by one record at the level.
168 * For nonzero levels the leaf-ward information is untouched.
169 */
170int /* error */
171xfs_alloc_decrement(
172 struct xfs_btree_cur *cur, /* btree cursor */
173 int level, /* level in btree, 0 is leaf */
174 int *stat); /* success/failure */
175
176/*
177 * Delete the record pointed to by cur.
178 * The cursor refers to the place where the record was (could be inserted)
179 * when the operation returns.
180 */
181int /* error */
182xfs_alloc_delete(
183 struct xfs_btree_cur *cur, /* btree cursor */
184 int *stat); /* success/failure */
185
186/*
187 * Get the data from the pointed-to record.
188 */
189int /* error */
190xfs_alloc_get_rec(
191 struct xfs_btree_cur *cur, /* btree cursor */
192 xfs_agblock_t *bno, /* output: starting block of extent */
193 xfs_extlen_t *len, /* output: length of extent */
194 int *stat); /* output: success/failure */
195
196/*
197 * Increment cursor by one record at the level.
198 * For nonzero levels the leaf-ward information is untouched.
199 */
200int /* error */
201xfs_alloc_increment(
202 struct xfs_btree_cur *cur, /* btree cursor */
203 int level, /* level in btree, 0 is leaf */
204 int *stat); /* success/failure */
205
206/*
207 * Insert the current record at the point referenced by cur.
208 * The cursor may be inconsistent on return if splits have been done.
209 */
210int /* error */
211xfs_alloc_insert(
212 struct xfs_btree_cur *cur, /* btree cursor */
213 int *stat); /* success/failure */
214
215/*
216 * Lookup the record equal to [bno, len] in the btree given by cur.
217 */
218int /* error */
219xfs_alloc_lookup_eq(
220 struct xfs_btree_cur *cur, /* btree cursor */
221 xfs_agblock_t bno, /* starting block of extent */
222 xfs_extlen_t len, /* length of extent */
223 int *stat); /* success/failure */
224
225/*
226 * Lookup the first record greater than or equal to [bno, len]
227 * in the btree given by cur.
228 */
229int /* error */
230xfs_alloc_lookup_ge(
231 struct xfs_btree_cur *cur, /* btree cursor */
232 xfs_agblock_t bno, /* starting block of extent */
233 xfs_extlen_t len, /* length of extent */
234 int *stat); /* success/failure */
235
236/*
237 * Lookup the first record less than or equal to [bno, len]
238 * in the btree given by cur.
239 */
240int /* error */
241xfs_alloc_lookup_le(
242 struct xfs_btree_cur *cur, /* btree cursor */
243 xfs_agblock_t bno, /* starting block of extent */
244 xfs_extlen_t len, /* length of extent */
245 int *stat); /* success/failure */
246
247/*
248 * Update the record referred to by cur, to the value given by [bno, len].
249 * This either works (return 0) or gets an EFSCORRUPTED error.
250 */
251int /* error */
252xfs_alloc_update(
253 struct xfs_btree_cur *cur, /* btree cursor */
254 xfs_agblock_t bno, /* starting block of extent */
255 xfs_extlen_t len); /* length of extent */
256
257#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
new file mode 100644
index 000000000000..ae35189b3d70
--- /dev/null
+++ b/fs/xfs/xfs_arch.h
@@ -0,0 +1,213 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ARCH_H__
33#define __XFS_ARCH_H__
34
35#ifndef XFS_BIG_INUMS
36# error XFS_BIG_INUMS must be defined true or false
37#endif
38
39#ifdef __KERNEL__
40
41#include <asm/byteorder.h>
42
43#ifdef __LITTLE_ENDIAN
44# define __BYTE_ORDER __LITTLE_ENDIAN
45#endif
46#ifdef __BIG_ENDIAN
47# define __BYTE_ORDER __BIG_ENDIAN
48#endif
49
50#endif /* __KERNEL__ */
51
52/* do we need conversion? */
53
54#define ARCH_NOCONVERT 1
55#if __BYTE_ORDER == __LITTLE_ENDIAN
56# define ARCH_CONVERT 0
57#else
58# define ARCH_CONVERT ARCH_NOCONVERT
59#endif
60
61/* generic swapping macros */
62
63#ifndef HAVE_SWABMACROS
64#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
65#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
66#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
67#endif
68
69#define INT_SWAP(type, var) \
70 ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
71 ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
72 ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
73 (var))))
74
75/*
76 * get and set integers from potentially unaligned locations
77 */
78
79#define INT_GET_UNALIGNED_16_BE(pointer) \
80 ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
81#define INT_SET_UNALIGNED_16_BE(pointer,value) \
82 { \
83 ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
84 ((__u8*)(pointer))[1] = (((value) ) & 0xff); \
85 }
86
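/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Round-tripping a 16-bit big-endian value through a
 * potentially unaligned buffer with the macros above (hypothetical
 * function name):
 */
static inline int example_unaligned_be16(void)
{
	__u8 buf[3];
	__u16 val = 0x1234;

	INT_SET_UNALIGNED_16_BE(&buf[1], val);	/* buf[1]=0x12, buf[2]=0x34 */
	return INT_GET_UNALIGNED_16_BE(&buf[1]) == val;	/* always true */
}
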
87/* define generic INT_ macros */
88
89#define INT_GET(reference,arch) \
90 (((arch) == ARCH_NOCONVERT) \
91 ? \
92 (reference) \
93 : \
94 INT_SWAP((reference),(reference)) \
95 )
96
97/* does not return a value */
98#define INT_SET(reference,arch,valueref) \
99 (__builtin_constant_p(valueref) ? \
100 (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
101 (void)( \
102 ((reference) = (valueref)), \
103 ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
104 ) \
105 )
106
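/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. XFS metadata is big-endian on disk, so on a little-endian
 * host ARCH_CONVERT selects byte swapping and INT_GET/INT_SET hide
 * the difference (hypothetical function name):
 */
static inline void example_int_get_set(__u32 *ondisk, __u32 cpu_val)
{
	__u32 native;

	INT_SET(*ondisk, ARCH_CONVERT, cpu_val);   /* CPU -> on-disk order */
	native = INT_GET(*ondisk, ARCH_CONVERT);   /* on-disk -> CPU order */
	ASSERT(native == cpu_val);
}
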
107/* does not return a value */
108#define INT_MOD_EXPR(reference,arch,code) \
109 (((arch) == ARCH_NOCONVERT) \
110 ? \
111 (void)((reference) code) \
112 : \
113 (void)( \
114 (reference) = INT_GET((reference),arch) , \
115 ((reference) code), \
116 INT_SET(reference, arch, reference) \
117 ) \
118 )
119
120/* does not return a value */
121#define INT_MOD(reference,arch,delta) \
122 (void)( \
123 INT_MOD_EXPR(reference,arch,+=(delta)) \
124 )
125
126/*
127 * INT_COPY - copy a value between two locations with the
128 * _same architecture_ but _potentially different sizes_
129 *
130 * if the types of the two parameters are equal or they are
131 * in native architecture, a simple copy is done
132 *
133 * otherwise, architecture conversions are done
134 *
135 */
136
137/* does not return a value */
138#define INT_COPY(dst,src,arch) \
139 ( \
140 ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
141 ? \
142 (void)((dst) = (src)) \
143 : \
144 INT_SET(dst, arch, INT_GET(src, arch)) \
145 )
146
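/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. INT_COPY widening a 16-bit on-disk field into a 32-bit
 * on-disk field: the sizes differ, so the value is converted to
 * native order and back rather than copied bitwise (hypothetical
 * function name):
 */
static inline void example_int_copy(__u32 *ondisk_dst, __u16 ondisk_src)
{
	INT_COPY(*ondisk_dst, ondisk_src, ARCH_CONVERT);
}
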
147/*
148 * INT_XLATE - copy a value in either direction between two locations
149 * with different architectures
150 *
151 * dir < 0 - copy from memory to buffer (native to arch)
152 * dir > 0 - copy from buffer to memory (arch to native)
153 */
154
155/* does not return a value */
156#define INT_XLATE(buf,mem,dir,arch) {\
157 ASSERT(dir); \
158 if (dir>0) { \
159 (mem)=INT_GET(buf, arch); \
160 } else { \
161 INT_SET(buf, arch, mem); \
162 } \
163}
164
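/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. INT_XLATE moves a value between a disk buffer and an
 * in-memory variable; dir > 0 reads from the buffer, dir < 0 writes
 * to it (hypothetical function name):
 */
static inline void example_int_xlate(__u32 *diskbuf, __u32 *memval, int dir)
{
	INT_XLATE(*diskbuf, *memval, dir, ARCH_CONVERT);
}
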
165/*
166 * In directories, inode numbers are stored on disk as unaligned arrays
167 * of unsigned 8-bit integers.
168 *
169 * For v1 directories, or v2 directories that contain inode numbers that
170 * do not fit into 32 bits, the array has eight members, but the first member
171 * is always zero:
172 *
173 * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7|
174 *
175 * For v2 directories that only contain entries with inode numbers that fit
176 * into 32 bits, a four-member array is used:
177 *
178 * |24-31|16-23| 8-15| 0- 7|
179 */
180
181#define XFS_GET_DIR_INO4(di) \
182 (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
183
184#define XFS_PUT_DIR_INO4(from, di) \
185do { \
186 (di).i[0] = (((from) & 0xff000000ULL) >> 24); \
187 (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \
188 (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \
189 (di).i[3] = ((from) & 0x000000ffULL); \
190} while (0)
191
192#define XFS_DI_HI(di) \
193 (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3]))
194#define XFS_DI_LO(di) \
195 (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7]))
196
197#define XFS_GET_DIR_INO8(di) \
198 (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \
199 ((xfs_ino_t)XFS_DI_HI(di) << 32))
200
201#define XFS_PUT_DIR_INO8(from, di) \
202do { \
203 (di).i[0] = 0; \
204 (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \
205 (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \
206 (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \
207 (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \
208 (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \
209 (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \
210 (di).i[7] = ((from) & 0x00000000000000ffULL); \
211} while (0)
212
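/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Round-tripping an inode number that needs more than 32 bits
 * through the 8-byte on-disk array; the local struct mirrors the
 * on-disk ino8 layout and the function name is hypothetical:
 */
static inline int example_dir_ino8_roundtrip(void)
{
	struct { __u8 i[8]; } di;
	xfs_ino_t ino = 0x0000123456789abcULL;

	XFS_PUT_DIR_INO8(ino, di);
	/* di.i[] now holds 00 00 12 34 56 78 9a bc */
	return XFS_GET_DIR_INO8(di) == ino;	/* always true */
}
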
213#endif /* __XFS_ARCH_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
new file mode 100644
index 000000000000..ee8b5904ec7c
--- /dev/null
+++ b/fs/xfs/xfs_attr.c
@@ -0,0 +1,2660 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_alloc.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_attr.h"
60#include "xfs_attr_leaf.h"
61#include "xfs_error.h"
62#include "xfs_bit.h"
63#include "xfs_quota.h"
64#include "xfs_rw.h"
65#include "xfs_trans_space.h"
66#include "xfs_acl.h"
67
68/*
69 * xfs_attr.c
70 *
71 * Provide the external interfaces to manage attribute lists.
72 */
73
74/*========================================================================
75 * Function prototypes for the kernel.
76 *========================================================================*/
77
78/*
79 * Internal routines when attribute list fits inside the inode.
80 */
81STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
82
83/*
84 * Internal routines when attribute list is one block.
85 */
86STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
87STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
88STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
89
90/*
91 * Internal routines when attribute list is more than one block.
92 */
93STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
94STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
95STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
96STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
97STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
98
99/*
100 * Routines to manipulate out-of-line attribute values.
101 */
102STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
103STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
104STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
105
106#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
107
108#if defined(XFS_ATTR_TRACE)
109ktrace_t *xfs_attr_trace_buf;
110#endif
111
112
113/*========================================================================
114 * Overall external interface routines.
115 *========================================================================*/
116
117int
118xfs_attr_fetch(xfs_inode_t *ip, char *name, int namelen,
119 char *value, int *valuelenp, int flags, struct cred *cred)
120{
121 xfs_da_args_t args;
122 int error;
123
124 if ((XFS_IFORK_Q(ip) == 0) ||
125 (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
126 ip->i_d.di_anextents == 0))
127 return(ENOATTR);
128
129 if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) {
130 if ((error = xfs_iaccess(ip, S_IRUSR, cred)))
131 return(XFS_ERROR(error));
132 }
133
134 /*
135 * Fill in the arg structure for this request.
136 */
137 memset((char *)&args, 0, sizeof(args));
138 args.name = name;
139 args.namelen = namelen;
140 args.value = value;
141 args.valuelen = *valuelenp;
142 args.flags = flags;
143 args.hashval = xfs_da_hashname(args.name, args.namelen);
144 args.dp = ip;
145 args.whichfork = XFS_ATTR_FORK;
146
147 /*
148 * Decide on what work routines to call based on the inode size.
149 */
150 if (XFS_IFORK_Q(ip) == 0 ||
151 (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
152 ip->i_d.di_anextents == 0)) {
153 error = XFS_ERROR(ENOATTR);
154 } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
155 error = xfs_attr_shortform_getvalue(&args);
156 } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
157 error = xfs_attr_leaf_get(&args);
158 } else {
159 error = xfs_attr_node_get(&args);
160 }
161
162 /*
163 * Return the number of bytes in the value to the caller.
164 */
165 *valuelenp = args.valuelen;
166
167 if (error == EEXIST)
168 error = 0;
169 return(error);
170}
171
172int
173xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp,
174 int flags, struct cred *cred)
175{
176 xfs_inode_t *ip = XFS_BHVTOI(bdp);
177 int error, namelen;
178
179 XFS_STATS_INC(xs_attr_get);
180
181 if (!name)
182 return(EINVAL);
183 namelen = strlen(name);
184 if (namelen >= MAXNAMELEN)
185 return(EFAULT); /* match IRIX behaviour */
186
187 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
188 return(EIO);
189
190 xfs_ilock(ip, XFS_ILOCK_SHARED);
191 error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred);
192 xfs_iunlock(ip, XFS_ILOCK_SHARED);
193 return(error);
194}
195
196/*ARGSUSED*/
197int /* error */
198xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags,
199 struct cred *cred)
200{
201 xfs_da_args_t args;
202 xfs_inode_t *dp;
203 xfs_fsblock_t firstblock;
204 xfs_bmap_free_t flist;
205 int error, err2, committed;
206 int local, size;
207 uint nblks;
208 xfs_mount_t *mp;
209 int rsvd = (flags & ATTR_ROOT) != 0;
210 int namelen;
211
212 namelen = strlen(name);
213 if (namelen >= MAXNAMELEN)
214 return EFAULT; /* match IRIX behaviour */
215
216 XFS_STATS_INC(xs_attr_set);
217
218 dp = XFS_BHVTOI(bdp);
219 mp = dp->i_mount;
220 if (XFS_FORCED_SHUTDOWN(mp))
221 return (EIO);
222
223 xfs_ilock(dp, XFS_ILOCK_SHARED);
224 if (!(flags & ATTR_SECURE) &&
225 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
226 xfs_iunlock(dp, XFS_ILOCK_SHARED);
227 return(XFS_ERROR(error));
228 }
229 xfs_iunlock(dp, XFS_ILOCK_SHARED);
230
231 /*
232 * Attach the dquots to the inode.
233 */
234 if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
235 return (error);
236
237 /*
238 * If the inode doesn't have an attribute fork, add one.
239 * (inode must not be locked when we call this routine)
240 */
241 if (XFS_IFORK_Q(dp) == 0) {
242 error = xfs_bmap_add_attrfork(dp, rsvd);
243 if (error)
244 return(error);
245 }
246
247 /*
248 * Fill in the arg structure for this request.
249 */
250 memset((char *)&args, 0, sizeof(args));
251 args.name = name;
252 args.namelen = namelen;
253 args.value = value;
254 args.valuelen = valuelen;
255 args.flags = flags;
256 args.hashval = xfs_da_hashname(args.name, args.namelen);
257 args.dp = dp;
258 args.firstblock = &firstblock;
259 args.flist = &flist;
260 args.whichfork = XFS_ATTR_FORK;
261 args.oknoent = 1;
262
263 /* Determine the space the new attribute will use, and whether it will
264 * be inline or out of line.
265 */
266 size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local);
267
268 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
269 if (local) {
270 if (size > (mp->m_sb.sb_blocksize >> 1)) {
271 /* Double split possible */
272 nblks <<= 1;
273 }
274 } else {
275 uint dblocks = XFS_B_TO_FSB(mp, valuelen);
276 /* Out of line attribute, cannot double split, but make
277 * room for the attribute value itself.
278 */
279 nblks += dblocks;
280 nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
281 }
282
283 /* nblks is now the total block count for the attribute data */
284 args.total = nblks;
285
286 /*
287 * Start our first transaction of the day.
288 *
289 * All future transactions during this code must be "chained" off
290 * this one via the trans_dup() call. All transactions will contain
291 * the inode, and the inode will always be marked with trans_ihold().
292 * Since the inode will be locked in all transactions, we must log
293 * the inode in every transaction to let it float upward through
294 * the log.
295 */
296 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
297
298 /*
299 * Root fork attributes can use reserved data blocks for this
300 * operation if necessary
301 */
302
303 if (rsvd)
304 args.trans->t_flags |= XFS_TRANS_RESERVE;
305
306 if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
307 XFS_ATTRSET_LOG_RES(mp, nblks),
308 0, XFS_TRANS_PERM_LOG_RES,
309 XFS_ATTRSET_LOG_COUNT))) {
310 xfs_trans_cancel(args.trans, 0);
311 return(error);
312 }
313 xfs_ilock(dp, XFS_ILOCK_EXCL);
314
315 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
316 rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
317 XFS_QMOPT_RES_REGBLKS);
318 if (error) {
319 xfs_iunlock(dp, XFS_ILOCK_EXCL);
320 xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
321 return (error);
322 }
323
324 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
325 xfs_trans_ihold(args.trans, dp);
326
327 /*
328 * If the attribute list is non-existent or a shortform list,
329 * upgrade it to a single-leaf-block attribute list.
330 */
331 if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
332 ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
333 (dp->i_d.di_anextents == 0))) {
334
335 /*
336 * Build initial attribute list (if required).
337 */
338 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
339 (void)xfs_attr_shortform_create(&args);
340
341 /*
342 * Try to add the attr to the attribute list in
343 * the inode.
344 */
345 error = xfs_attr_shortform_addname(&args);
346 if (error != ENOSPC) {
347 /*
348 * Commit the shortform mods, and we're done.
349 * NOTE: this is also the error path (EEXIST, etc).
350 */
351 ASSERT(args.trans != NULL);
352
353 /*
354 * If this is a synchronous mount, make sure that
355 * the transaction goes to disk before returning
356 * to the user.
357 */
358 if (mp->m_flags & XFS_MOUNT_WSYNC) {
359 xfs_trans_set_sync(args.trans);
360 }
361 err2 = xfs_trans_commit(args.trans,
362 XFS_TRANS_RELEASE_LOG_RES,
363 NULL);
364 xfs_iunlock(dp, XFS_ILOCK_EXCL);
365
366 /*
367 * Hit the inode change time.
368 */
369 if (!error && (flags & ATTR_KERNOTIME) == 0) {
370 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
371 }
372 return(error == 0 ? err2 : error);
373 }
374
375 /*
376 * It won't fit in the shortform, transform to a leaf block.
377 * GROT: another possible req'mt for a double-split btree op.
378 */
379 XFS_BMAP_INIT(args.flist, args.firstblock);
380 error = xfs_attr_shortform_to_leaf(&args);
381 if (!error) {
382 error = xfs_bmap_finish(&args.trans, args.flist,
383 *args.firstblock, &committed);
384 }
385 if (error) {
386 ASSERT(committed);
387 args.trans = NULL;
388 xfs_bmap_cancel(&flist);
389 goto out;
390 }
391
392 /*
393 * bmap_finish() may have committed the last trans and started
394 * a new one. We need the inode to be in all transactions.
395 */
396 if (committed) {
397 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
398 xfs_trans_ihold(args.trans, dp);
399 }
400
401 /*
402 * Commit the leaf transformation. We'll need another (linked)
403 * transaction to add the new attribute to the leaf.
404 */
405 if ((error = xfs_attr_rolltrans(&args.trans, dp)))
406 goto out;
407
408 }
409
410 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
411 error = xfs_attr_leaf_addname(&args);
412 } else {
413 error = xfs_attr_node_addname(&args);
414 }
415 if (error) {
416 goto out;
417 }
418
419 /*
420 * If this is a synchronous mount, make sure that the
421 * transaction goes to disk before returning to the user.
422 */
423 if (mp->m_flags & XFS_MOUNT_WSYNC) {
424 xfs_trans_set_sync(args.trans);
425 }
426
427 /*
428 * Commit the last in the sequence of transactions.
429 */
430 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
431 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
432 NULL);
433 xfs_iunlock(dp, XFS_ILOCK_EXCL);
434
435 /*
436 * Hit the inode change time.
437 */
438 if (!error && (flags & ATTR_KERNOTIME) == 0) {
439 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
440 }
441
442 return(error);
443
444out:
445 if (args.trans)
446 xfs_trans_cancel(args.trans,
447 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
448 xfs_iunlock(dp, XFS_ILOCK_EXCL);
449 return(error);
450}
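
/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. The "chained transaction" pattern described at the top of
 * xfs_attr_set() is what xfs_attr_rolltrans() provides: duplicate the
 * permanent transaction, commit the old half, and re-reserve log space
 * in the new one. A simplified version (the function name is
 * hypothetical and the ATTRSET reservation constants are reused here
 * purely for illustration):
 */
STATIC int
example_roll_trans(xfs_trans_t **transp, xfs_inode_t *dp, xfs_mount_t *mp)
{
	xfs_trans_t	*trans = *transp;
	int		error;

	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);	/* keep inode in log */
	*transp = xfs_trans_dup(trans);			/* chain a new trans */
	error = xfs_trans_commit(trans, 0, NULL);	/* commit the old one */
	if (error)
		return error;
	error = xfs_trans_reserve(*transp, 0, XFS_ATTRSET_LOG_RES(mp, 0), 0,
				  XFS_TRANS_PERM_LOG_RES,
				  XFS_ATTRSET_LOG_COUNT);
	if (error)
		return error;
	xfs_trans_ijoin(*transp, dp, XFS_ILOCK_EXCL);	/* rejoin the inode */
	xfs_trans_ihold(*transp, dp);
	return 0;
}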
451
452/*
453 * Generic handler routine to remove a name from an attribute list.
454 * Transitions attribute list from Btree to shortform as necessary.
455 */
456/*ARGSUSED*/
457int /* error */
458xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred)
459{
460 xfs_da_args_t args;
461 xfs_inode_t *dp;
462 xfs_fsblock_t firstblock;
463 xfs_bmap_free_t flist;
464 int error;
465 xfs_mount_t *mp;
466 int namelen;
467
468 ASSERT(MAXNAMELEN - 1 <= 0xff); /* length is stored in a uint8 */
469 namelen = strlen(name);
470 if (namelen >= MAXNAMELEN)
471 return EFAULT; /* match IRIX behaviour */
472
473 XFS_STATS_INC(xs_attr_remove);
474
475 dp = XFS_BHVTOI(bdp);
476 mp = dp->i_mount;
477 if (XFS_FORCED_SHUTDOWN(mp))
478 return (EIO);
479
480 xfs_ilock(dp, XFS_ILOCK_SHARED);
481 if (!(flags & ATTR_SECURE) &&
482 (error = xfs_iaccess(dp, S_IWUSR, cred))) {
483 xfs_iunlock(dp, XFS_ILOCK_SHARED);
484 return(XFS_ERROR(error));
485 } else if (XFS_IFORK_Q(dp) == 0 ||
486 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
487 dp->i_d.di_anextents == 0)) {
488 xfs_iunlock(dp, XFS_ILOCK_SHARED);
489 return(XFS_ERROR(ENOATTR));
490 }
491 xfs_iunlock(dp, XFS_ILOCK_SHARED);
492
493 /*
494 * Fill in the arg structure for this request.
495 */
496 memset((char *)&args, 0, sizeof(args));
497 args.name = name;
498 args.namelen = namelen;
499 args.flags = flags;
500 args.hashval = xfs_da_hashname(args.name, args.namelen);
501 args.dp = dp;
502 args.firstblock = &firstblock;
503 args.flist = &flist;
504 args.total = 0;
505 args.whichfork = XFS_ATTR_FORK;
506
507 /*
508 * Attach the dquots to the inode.
509 */
510 if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
511 return (error);
512
513 /*
514 * Start our first transaction of the day.
515 *
516 * All future transactions during this code must be "chained" off
517 * this one via the trans_dup() call. All transactions will contain
518 * the inode, and the inode will always be marked with trans_ihold().
519 * Since the inode will be locked in all transactions, we must log
520 * the inode in every transaction to let it float upward through
521 * the log.
522 */
523 args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
524
525 /*
526 * Root fork attributes can use reserved data blocks for this
527 * operation if necessary
528 */
529
530 if (flags & ATTR_ROOT)
531 args.trans->t_flags |= XFS_TRANS_RESERVE;
532
533 if ((error = xfs_trans_reserve(args.trans,
534 XFS_ATTRRM_SPACE_RES(mp),
535 XFS_ATTRRM_LOG_RES(mp),
536 0, XFS_TRANS_PERM_LOG_RES,
537 XFS_ATTRRM_LOG_COUNT))) {
538 xfs_trans_cancel(args.trans, 0);
539 return(error);
540
541 }
542
543 xfs_ilock(dp, XFS_ILOCK_EXCL);
544 /*
545 * No need to make quota reservations here. We expect to release some
546 * blocks, not allocate, in the common case.
547 */
548 xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
549 xfs_trans_ihold(args.trans, dp);
550
551 /*
552 * Decide on what work routines to call based on the inode size.
553 */
554 if (XFS_IFORK_Q(dp) == 0 ||
555 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
556 dp->i_d.di_anextents == 0)) {
557 error = XFS_ERROR(ENOATTR);
558 goto out;
559 }
560 if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
561 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
562 error = xfs_attr_shortform_remove(&args);
563 if (error) {
564 goto out;
565 }
566 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
567 error = xfs_attr_leaf_removename(&args);
568 } else {
569 error = xfs_attr_node_removename(&args);
570 }
571 if (error) {
572 goto out;
573 }
574
575 /*
576 * If this is a synchronous mount, make sure that the
577 * transaction goes to disk before returning to the user.
578 */
579 if (mp->m_flags & XFS_MOUNT_WSYNC) {
580 xfs_trans_set_sync(args.trans);
581 }
582
583 /*
584 * Commit the last in the sequence of transactions.
585 */
586 xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
587 error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
588 NULL);
589 xfs_iunlock(dp, XFS_ILOCK_EXCL);
590
591 /*
592 * Hit the inode change time.
593 */
594 if (!error && (flags & ATTR_KERNOTIME) == 0) {
595 xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
596 }
597
598 return(error);
599
600out:
601 if (args.trans)
602 xfs_trans_cancel(args.trans,
603 XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
604 xfs_iunlock(dp, XFS_ILOCK_EXCL);
605 return(error);
606}
607
608/*
609 * Generate a list of extended attribute names and optionally
610 * also value lengths. A positive return value follows the XFS
611 * convention of being an error; a zero or negative return code
612 * is the (negated) length of the buffer returned, indicating
613 * success.
614 */
615int
616xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
617 attrlist_cursor_kern_t *cursor, struct cred *cred)
618{
619 xfs_attr_list_context_t context;
620 xfs_inode_t *dp;
621 int error;
622
623 XFS_STATS_INC(xs_attr_list);
624
625 /*
626 * Validate the cursor.
627 */
628 if (cursor->pad1 || cursor->pad2)
629 return(XFS_ERROR(EINVAL));
630 if ((cursor->initted == 0) &&
631 (cursor->hashval || cursor->blkno || cursor->offset))
632 return(XFS_ERROR(EINVAL));
633
634 /*
635 * Check for a properly aligned buffer.
636 */
637 if (((long)buffer) & (sizeof(int)-1))
638 return(XFS_ERROR(EFAULT));
639 if (flags & ATTR_KERNOVAL)
640 bufsize = 0;
641
642 /*
643 * Initialize the output buffer.
644 */
645 context.dp = dp = XFS_BHVTOI(bdp);
646 context.cursor = cursor;
647 context.count = 0;
648 context.dupcnt = 0;
649 context.resynch = 1;
650 context.flags = flags;
651 if (!(flags & ATTR_KERNAMELS)) {
652 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
653 context.firstu = context.bufsize;
654 context.alist = (attrlist_t *)buffer;
655 context.alist->al_count = 0;
656 context.alist->al_more = 0;
657 context.alist->al_offset[0] = context.bufsize;
658 }
659 else {
660 context.bufsize = bufsize;
661 context.firstu = context.bufsize;
662 context.alist = (attrlist_t *)buffer;
663 }
664
665 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
666 return (EIO);
667
668 xfs_ilock(dp, XFS_ILOCK_SHARED);
669 if (!(flags & ATTR_SECURE) &&
670 (error = xfs_iaccess(dp, S_IRUSR, cred))) {
671 xfs_iunlock(dp, XFS_ILOCK_SHARED);
672 return(XFS_ERROR(error));
673 }
674
675 /*
676 * Decide on what work routines to call based on the inode size.
677 */
678 xfs_attr_trace_l_c("syscall start", &context);
679 if (XFS_IFORK_Q(dp) == 0 ||
680 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
681 dp->i_d.di_anextents == 0)) {
682 error = 0;
683 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
684 error = xfs_attr_shortform_list(&context);
685 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
686 error = xfs_attr_leaf_list(&context);
687 } else {
688 error = xfs_attr_node_list(&context);
689 }
690 xfs_iunlock(dp, XFS_ILOCK_SHARED);
691 xfs_attr_trace_l_c("syscall end", &context);
692
693 if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
694 ASSERT(error >= 0);
695 }
696 else { /* must return negated buffer size or the error */
697 if (context.count < 0)
698 error = XFS_ERROR(ERANGE);
699 else
700 error = -context.count;
701 }
702
703 return(error);
704}
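
/*
 * Editor's note: an illustrative sketch, not part of the original
 * commit. Interpreting the mixed return convention documented above
 * from a hypothetical in-kernel caller using ATTR_KERNAMELS:
 */
STATIC int
example_list_names(bhv_desc_t *bdp, char *buffer, int bufsize,
		   attrlist_cursor_kern_t *cursor, struct cred *cred,
		   int *lenp)
{
	int ret;

	ret = xfs_attr_list(bdp, buffer, bufsize, ATTR_KERNAMELS,
			    cursor, cred);
	if (ret > 0)
		return ret;	/* positive: an XFS error code */
	*lenp = -ret;		/* zero or negative: negated length */
	return 0;
}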
705
706int /* error */
707xfs_attr_inactive(xfs_inode_t *dp)
708{
709 xfs_trans_t *trans;
710 xfs_mount_t *mp;
711 int error;
712
713 mp = dp->i_mount;
714 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
715
716 xfs_ilock(dp, XFS_ILOCK_SHARED);
717 if ((XFS_IFORK_Q(dp) == 0) ||
718 (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
719 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
720 dp->i_d.di_anextents == 0)) {
721 xfs_iunlock(dp, XFS_ILOCK_SHARED);
722 return(0);
723 }
724 xfs_iunlock(dp, XFS_ILOCK_SHARED);
725
726 /*
727 * Start our first transaction of the day.
728 *
729 * All future transactions during this code must be "chained" off
730 * this one via the trans_dup() call. All transactions will contain
731 * the inode, and the inode will always be marked with trans_ihold().
732 * Since the inode will be locked in all transactions, we must log
733 * the inode in every transaction to let it float upward through
734 * the log.
735 */
736 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
737 if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
738 XFS_TRANS_PERM_LOG_RES,
739 XFS_ATTRINVAL_LOG_COUNT))) {
740 xfs_trans_cancel(trans, 0);
741 return(error);
742 }
743 xfs_ilock(dp, XFS_ILOCK_EXCL);
744
745 /*
746 * No need to make quota reservations here. We expect to release some
747 * blocks, not allocate, in the common case.
748 */
749 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
750 xfs_trans_ihold(trans, dp);
751
752 /*
753 * Decide on what work routines to call based on the inode size.
754 */
755 if ((XFS_IFORK_Q(dp) == 0) ||
756 (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
757 (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
758 dp->i_d.di_anextents == 0)) {
759 error = 0;
760 goto out;
761 }
762 error = xfs_attr_root_inactive(&trans, dp);
763 if (error)
764 goto out;
765 /*
766 * Signal synchronous inactive transactions unless this
767 * is a synchronous mount filesystem, in which case we
768 * know we're here because we've been called out of
769 * xfs_inactive, which means the last reference is gone
770 * and the unlink transaction has already hit the disk,
771 * so async inactive transactions are safe.
772 */
773 if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK,
774 (!(mp->m_flags & XFS_MOUNT_WSYNC)
775 ? 1 : 0))))
776 goto out;
777
778 /*
779 * Commit the last in the sequence of transactions.
780 */
781 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
782 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
783 NULL);
784 xfs_iunlock(dp, XFS_ILOCK_EXCL);
785
786 return(error);
787
788out:
789 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
790 xfs_iunlock(dp, XFS_ILOCK_EXCL);
791 return(error);
792}
793
794
795
796/*========================================================================
797 * External routines when attribute list is inside the inode
798 *========================================================================*/
799
800/*
801 * Add a name to the shortform attribute list structure
802 * This is the external routine.
803 */
804STATIC int
805xfs_attr_shortform_addname(xfs_da_args_t *args)
806{
807 int newsize, retval;
808
809 retval = xfs_attr_shortform_lookup(args);
810 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
811 return(retval);
812 } else if (retval == EEXIST) {
813 if (args->flags & ATTR_CREATE)
814 return(retval);
815 retval = xfs_attr_shortform_remove(args);
816 ASSERT(retval == 0);
817 }
818
819 newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
820 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
821 if ((newsize <= XFS_IFORK_ASIZE(args->dp)) &&
822 (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) &&
823 (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) {
824 retval = xfs_attr_shortform_add(args);
825 ASSERT(retval == 0);
826 } else {
827 return(XFS_ERROR(ENOSPC));
828 }
829 return(0);
830}
831
832
833/*========================================================================
834 * External routines when attribute list is one block
835 *========================================================================*/
836
837/*
838 * Add a name to the leaf attribute list structure
839 *
840 * This leaf block cannot have a "remote" value; we only call this routine
841 * if bmap_one_block() says there is only one block (i.e. no remote blks).
842 */
843int
844xfs_attr_leaf_addname(xfs_da_args_t *args)
845{
846 xfs_inode_t *dp;
847 xfs_dabuf_t *bp;
848 int retval, error, committed;
849
850 /*
851 * Read in the (only) block of the attribute list.
852 */
853 dp = args->dp;
854 args->blkno = 0;
855 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
856 XFS_ATTR_FORK);
857 if (error)
858 return(error);
859 ASSERT(bp != NULL);
860
861 /*
862 * Look up the given attribute in the leaf block. Figure out if
863 * the given flags produce an error or call for an atomic rename.
864 */
865 retval = xfs_attr_leaf_lookup_int(bp, args);
866 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
867 xfs_da_brelse(args->trans, bp);
868 return(retval);
869 } else if (retval == EEXIST) {
870 if (args->flags & ATTR_CREATE) { /* pure create op */
871 xfs_da_brelse(args->trans, bp);
872 return(retval);
873 }
874 args->rename = 1; /* an atomic rename */
875 args->blkno2 = args->blkno; /* set 2nd entry info */
876 args->index2 = args->index;
877 args->rmtblkno2 = args->rmtblkno;
878 args->rmtblkcnt2 = args->rmtblkcnt;
879 }
880
881 /*
882 * Add the attribute to the leaf block, transitioning to a Btree
883 * if required.
884 */
885 retval = xfs_attr_leaf_add(bp, args);
886 xfs_da_buf_done(bp);
887 if (retval == ENOSPC) {
888 /*
889 * Promote the attribute list to the Btree format, then
890 * Commit that transaction so that the node_addname() call
891 * can manage its own transactions.
892 */
893 XFS_BMAP_INIT(args->flist, args->firstblock);
894 error = xfs_attr_leaf_to_node(args);
895 if (!error) {
896 error = xfs_bmap_finish(&args->trans, args->flist,
897 *args->firstblock, &committed);
898 }
899 if (error) {
900 ASSERT(committed);
901 args->trans = NULL;
902 xfs_bmap_cancel(args->flist);
903 return(error);
904 }
905
906 /*
907 * bmap_finish() may have committed the last trans and started
908 * a new one. We need the inode to be in all transactions.
909 */
910 if (committed) {
911 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
912 xfs_trans_ihold(args->trans, dp);
913 }
914
915 /*
916 * Commit the current trans (including the inode) and start
917 * a new one.
918 */
919 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
920 return (error);
921
922 /*
923 * Fob the whole rest of the problem off on the Btree code.
924 */
925 error = xfs_attr_node_addname(args);
926 return(error);
927 }
928
929 /*
930 * Commit the transaction that added the attr name so that
931 * later routines can manage their own transactions.
932 */
933 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
934 return (error);
935
936 /*
937 * If there was an out-of-line value, allocate the blocks we
938 * identified for its storage and copy the value. This is done
939 * after we create the attribute so that we don't overflow the
940 * maximum size of a transaction and/or hit a deadlock.
941 */
942 if (args->rmtblkno > 0) {
943 error = xfs_attr_rmtval_set(args);
944 if (error)
945 return(error);
946 }
947
948 /*
949 * If this is an atomic rename operation, we must "flip" the
950 * incomplete flags on the "new" and "old" attribute/value pairs
951 * so that one disappears and one appears atomically. Then we
952 * must remove the "old" attribute/value pair.
953 */
954 if (args->rename) {
955 /*
956 * In a separate transaction, set the incomplete flag on the
957 * "old" attr and clear the incomplete flag on the "new" attr.
958 */
959 error = xfs_attr_leaf_flipflags(args);
960 if (error)
961 return(error);
962
963 /*
964 * Dismantle the "old" attribute/value pair by removing
965 * a "remote" value (if it exists).
966 */
967 args->index = args->index2;
968 args->blkno = args->blkno2;
969 args->rmtblkno = args->rmtblkno2;
970 args->rmtblkcnt = args->rmtblkcnt2;
971 if (args->rmtblkno) {
972 error = xfs_attr_rmtval_remove(args);
973 if (error)
974 return(error);
975 }
976
977 /*
978 * Read in the block containing the "old" attr, then
979 * remove the "old" attr from that block (neat, huh!)
980 */
981 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
982 &bp, XFS_ATTR_FORK);
983 if (error)
984 return(error);
985 ASSERT(bp != NULL);
986 (void)xfs_attr_leaf_remove(bp, args);
987
988 /*
989 * If the result is small enough, shrink it all into the inode.
990 */
991 if (xfs_attr_shortform_allfit(bp, dp)) {
992 XFS_BMAP_INIT(args->flist, args->firstblock);
993 error = xfs_attr_leaf_to_shortform(bp, args);
994 /* bp is gone due to xfs_da_shrink_inode */
995 if (!error) {
996 error = xfs_bmap_finish(&args->trans,
997 args->flist,
998 *args->firstblock,
999 &committed);
1000 }
1001 if (error) {
1002 ASSERT(committed);
1003 args->trans = NULL;
1004 xfs_bmap_cancel(args->flist);
1005 return(error);
1006 }
1007
1008 /*
1009 * bmap_finish() may have committed the last trans
1010 * and started a new one. We need the inode to be
1011 * in all transactions.
1012 */
1013 if (committed) {
1014 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1015 xfs_trans_ihold(args->trans, dp);
1016 }
1017 } else
1018 xfs_da_buf_done(bp);
1019
1020 /*
1021 * Commit the remove and start the next trans in series.
1022 */
1023 error = xfs_attr_rolltrans(&args->trans, dp);
1024
1025 } else if (args->rmtblkno > 0) {
1026 /*
1027 * Added a "remote" value, just clear the incomplete flag.
1028 */
1029 error = xfs_attr_leaf_clearflag(args);
1030 }
1031 return(error);
1032}
1033
1034/*
1035 * Remove a name from the leaf attribute list structure
1036 *
1037 * This leaf block cannot have a "remote" value; we only call this routine
1038 * if bmap_one_block() says there is only one block (i.e. no remote blks).
1039 */
1040STATIC int
1041xfs_attr_leaf_removename(xfs_da_args_t *args)
1042{
1043 xfs_inode_t *dp;
1044 xfs_dabuf_t *bp;
1045 int committed;
1046 int error;
1047
1048 /*
1049 * Remove the attribute.
1050 */
1051 dp = args->dp;
1052 args->blkno = 0;
1053 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
1054 XFS_ATTR_FORK);
1055 if (error) {
1056 return(error);
1057 }
1058
1059 ASSERT(bp != NULL);
1060 error = xfs_attr_leaf_lookup_int(bp, args);
1061 if (error == ENOATTR) {
1062 xfs_da_brelse(args->trans, bp);
1063 return(error);
1064 }
1065
1066 (void)xfs_attr_leaf_remove(bp, args);
1067
1068 /*
1069 * If the result is small enough, shrink it all into the inode.
1070 */
1071 if (xfs_attr_shortform_allfit(bp, dp)) {
1072 XFS_BMAP_INIT(args->flist, args->firstblock);
1073 error = xfs_attr_leaf_to_shortform(bp, args);
1074 /* bp is gone due to xfs_da_shrink_inode */
1075 if (!error) {
1076 error = xfs_bmap_finish(&args->trans, args->flist,
1077 *args->firstblock, &committed);
1078 }
1079 if (error) {
1080 ASSERT(committed);
1081 args->trans = NULL;
1082 xfs_bmap_cancel(args->flist);
1083 return(error);
1084 }
1085
1086 /*
1087 * bmap_finish() may have committed the last trans and started
1088 * a new one. We need the inode to be in all transactions.
1089 */
1090 if (committed) {
1091 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1092 xfs_trans_ihold(args->trans, dp);
1093 }
1094 } else
1095 xfs_da_buf_done(bp);
1096 return(0);
1097}
1098
1099/*
1100 * Look up a name in a leaf attribute list structure.
1101 *
1102 * This leaf block cannot have a "remote" value; we only call this routine
1103 * if bmap_one_block() says there is only one block (i.e. no remote blks).
1104 */
1105int
1106xfs_attr_leaf_get(xfs_da_args_t *args)
1107{
1108 xfs_dabuf_t *bp;
1109 int error;
1110
1111 args->blkno = 0;
1112 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
1113 XFS_ATTR_FORK);
1114 if (error)
1115 return(error);
1116 ASSERT(bp != NULL);
1117
1118 error = xfs_attr_leaf_lookup_int(bp, args);
1119 if (error != EEXIST) {
1120 xfs_da_brelse(args->trans, bp);
1121 return(error);
1122 }
1123 error = xfs_attr_leaf_getvalue(bp, args);
1124 xfs_da_brelse(args->trans, bp);
1125 if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
1126 error = xfs_attr_rmtval_get(args);
1127 }
1128 return(error);
1129}
1130
1131/*
1132 * Copy out attribute entries for attr_list(), for leaf attribute lists.
1133 */
1134STATIC int
1135xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1136{
1137 xfs_attr_leafblock_t *leaf;
1138 int error;
1139 xfs_dabuf_t *bp;
1140
1141 context->cursor->blkno = 0;
1142 error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
1143 if (error)
1144 return(error);
1145 ASSERT(bp != NULL);
1146 leaf = bp->data;
1147 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1148 != XFS_ATTR_LEAF_MAGIC)) {
1149 XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
1150 context->dp->i_mount, leaf);
1151 xfs_da_brelse(NULL, bp);
1152 return(XFS_ERROR(EFSCORRUPTED));
1153 }
1154
1155 (void)xfs_attr_leaf_list_int(bp, context);
1156 xfs_da_brelse(NULL, bp);
1157 return(0);
1158}
1159
1160
1161/*========================================================================
1162 * External routines when attribute list size > XFS_LBSIZE(mp).
1163 *========================================================================*/
1164
1165/*
1166 * Add a name to a Btree-format attribute list.
1167 *
1168 * This will involve walking down the Btree, and may involve splitting
1169 * leaf nodes and even splitting intermediate nodes up to and including
1170 * the root node (a special case of an intermediate node).
1171 *
1172 * "Remote" attribute values confuse the issue and atomic rename operations
1173 * add a whole extra layer of confusion on top of that.
1174 */
1175STATIC int
1176xfs_attr_node_addname(xfs_da_args_t *args)
1177{
1178 xfs_da_state_t *state;
1179 xfs_da_state_blk_t *blk;
1180 xfs_inode_t *dp;
1181 xfs_mount_t *mp;
1182 int committed, retval, error;
1183
1184 /*
1185 * Fill in bucket of arguments/results/context to carry around.
1186 */
1187 dp = args->dp;
1188 mp = dp->i_mount;
1189restart:
1190 state = xfs_da_state_alloc();
1191 state->args = args;
1192 state->mp = mp;
1193 state->blocksize = state->mp->m_sb.sb_blocksize;
1194 state->node_ents = state->mp->m_attr_node_ents;
1195
1196 /*
1197 * Search to see if name already exists, and get back a pointer
1198 * to where it should go.
1199 */
1200 error = xfs_da_node_lookup_int(state, &retval);
1201 if (error)
1202 goto out;
1203 blk = &state->path.blk[ state->path.active-1 ];
1204 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1205 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
1206 goto out;
1207 } else if (retval == EEXIST) {
1208 if (args->flags & ATTR_CREATE)
1209 goto out;
1210 args->rename = 1; /* atomic rename op */
1211 args->blkno2 = args->blkno; /* set 2nd entry info */
1212 args->index2 = args->index;
1213 args->rmtblkno2 = args->rmtblkno;
1214 args->rmtblkcnt2 = args->rmtblkcnt;
1215 args->rmtblkno = 0;
1216 args->rmtblkcnt = 0;
1217 }
1218
1219 retval = xfs_attr_leaf_add(blk->bp, state->args);
1220 if (retval == ENOSPC) {
1221 if (state->path.active == 1) {
1222 /*
1223 * It's really a single leaf node, but it had
1224 * out-of-line values, so it looked like it *might*
1225 * have been a b-tree.
1226 */
1227 xfs_da_state_free(state);
1228 XFS_BMAP_INIT(args->flist, args->firstblock);
1229 error = xfs_attr_leaf_to_node(args);
1230 if (!error) {
1231 error = xfs_bmap_finish(&args->trans,
1232 args->flist,
1233 *args->firstblock,
1234 &committed);
1235 }
1236 if (error) {
1237 ASSERT(committed);
1238 args->trans = NULL;
1239 xfs_bmap_cancel(args->flist);
1240 goto out;
1241 }
1242
1243 /*
1244 * bmap_finish() may have committed the last trans
1245 * and started a new one. We need the inode to be
1246 * in all transactions.
1247 */
1248 if (committed) {
1249 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1250 xfs_trans_ihold(args->trans, dp);
1251 }
1252
1253 /*
1254 * Commit the node conversion and start the next
1255 * trans in the chain.
1256 */
1257 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1258 goto out;
1259
1260 goto restart;
1261 }
1262
1263 /*
1264 * Split as many Btree elements as required.
1265 * This code tracks the new and old attr's location
1266 * in the index/blkno/rmtblkno/rmtblkcnt fields and
1267 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
1268 */
1269 XFS_BMAP_INIT(args->flist, args->firstblock);
1270 error = xfs_da_split(state);
1271 if (!error) {
1272 error = xfs_bmap_finish(&args->trans, args->flist,
1273 *args->firstblock, &committed);
1274 }
1275 if (error) {
1276 ASSERT(committed);
1277 args->trans = NULL;
1278 xfs_bmap_cancel(args->flist);
1279 goto out;
1280 }
1281
1282 /*
1283 * bmap_finish() may have committed the last trans and started
1284 * a new one. We need the inode to be in all transactions.
1285 */
1286 if (committed) {
1287 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1288 xfs_trans_ihold(args->trans, dp);
1289 }
1290 } else {
1291 /*
1292 * Addition succeeded, update Btree hashvals.
1293 */
1294 xfs_da_fixhashpath(state, &state->path);
1295 }
1296
1297 /*
1298 * Kill the state structure, we're done with it and need to
1299 * allow the buffers to come back later.
1300 */
1301 xfs_da_state_free(state);
1302 state = NULL;
1303
1304 /*
1305 * Commit the leaf addition or btree split and start the next
1306 * trans in the chain.
1307 */
1308 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1309 goto out;
1310
1311 /*
1312 * If there was an out-of-line value, allocate the blocks we
1313 * identified for its storage and copy the value. This is done
1314 * after we create the attribute so that we don't overflow the
1315 * maximum size of a transaction and/or hit a deadlock.
1316 */
1317 if (args->rmtblkno > 0) {
1318 error = xfs_attr_rmtval_set(args);
1319 if (error)
1320 return(error);
1321 }
1322
1323 /*
1324 * If this is an atomic rename operation, we must "flip" the
1325 * incomplete flags on the "new" and "old" attribute/value pairs
1326 * so that one disappears and one appears atomically. Then we
1327 * must remove the "old" attribute/value pair.
1328 */
1329 if (args->rename) {
1330 /*
1331 * In a separate transaction, set the incomplete flag on the
1332 * "old" attr and clear the incomplete flag on the "new" attr.
1333 */
1334 error = xfs_attr_leaf_flipflags(args);
1335 if (error)
1336 goto out;
1337
1338 /*
1339 * Dismantle the "old" attribute/value pair by removing
1340 * a "remote" value (if it exists).
1341 */
1342 args->index = args->index2;
1343 args->blkno = args->blkno2;
1344 args->rmtblkno = args->rmtblkno2;
1345 args->rmtblkcnt = args->rmtblkcnt2;
1346 if (args->rmtblkno) {
1347 error = xfs_attr_rmtval_remove(args);
1348 if (error)
1349 return(error);
1350 }
1351
1352 /*
1353 * Re-find the "old" attribute entry after any split ops.
1354 * The INCOMPLETE flag means that we will find the "old"
1355 * attr, not the "new" one.
1356 */
1357 args->flags |= XFS_ATTR_INCOMPLETE;
1358 state = xfs_da_state_alloc();
1359 state->args = args;
1360 state->mp = mp;
1361 state->blocksize = state->mp->m_sb.sb_blocksize;
1362 state->node_ents = state->mp->m_attr_node_ents;
1363 state->inleaf = 0;
1364 error = xfs_da_node_lookup_int(state, &retval);
1365 if (error)
1366 goto out;
1367
1368 /*
1369 * Remove the name and update the hashvals in the tree.
1370 */
1371 blk = &state->path.blk[ state->path.active-1 ];
1372 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1373 error = xfs_attr_leaf_remove(blk->bp, args);
1374 xfs_da_fixhashpath(state, &state->path);
1375
1376 /*
1377 * Check to see if the tree needs to be collapsed.
1378 */
1379 if (retval && (state->path.active > 1)) {
1380 XFS_BMAP_INIT(args->flist, args->firstblock);
1381 error = xfs_da_join(state);
1382 if (!error) {
1383 error = xfs_bmap_finish(&args->trans,
1384 args->flist,
1385 *args->firstblock,
1386 &committed);
1387 }
1388 if (error) {
1389 ASSERT(committed);
1390 args->trans = NULL;
1391 xfs_bmap_cancel(args->flist);
1392 goto out;
1393 }
1394
1395 /*
1396 * bmap_finish() may have committed the last trans
1397 * and started a new one. We need the inode to be
1398 * in all transactions.
1399 */
1400 if (committed) {
1401 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1402 xfs_trans_ihold(args->trans, dp);
1403 }
1404 }
1405
1406 /*
1407 * Commit and start the next trans in the chain.
1408 */
1409 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1410 goto out;
1411
1412 } else if (args->rmtblkno > 0) {
1413 /*
1414 * Added a "remote" value, just clear the incomplete flag.
1415 */
1416 error = xfs_attr_leaf_clearflag(args);
1417 if (error)
1418 goto out;
1419 }
1420 retval = error = 0;
1421
1422out:
1423 if (state)
1424 xfs_da_state_free(state);
1425 if (error)
1426 return(error);
1427 return(retval);
1428}
1429
1430/*
1431 * Remove a name from a B-tree attribute list.
1432 *
1433 * This will involve walking down the Btree, and may involve joining
1434 * leaf nodes and even joining intermediate nodes up to and including
1435 * the root node (a special case of an intermediate node).
1436 */
1437STATIC int
1438xfs_attr_node_removename(xfs_da_args_t *args)
1439{
1440 xfs_da_state_t *state;
1441 xfs_da_state_blk_t *blk;
1442 xfs_inode_t *dp;
1443 xfs_dabuf_t *bp;
1444 int retval, error, committed;
1445
1446 /*
1447 * Tie a string around our finger to remind us where we are.
1448 */
1449 dp = args->dp;
1450 state = xfs_da_state_alloc();
1451 state->args = args;
1452 state->mp = dp->i_mount;
1453 state->blocksize = state->mp->m_sb.sb_blocksize;
1454 state->node_ents = state->mp->m_attr_node_ents;
1455
1456 /*
1457 * Search to see if name exists, and get back a pointer to it.
1458 */
1459 error = xfs_da_node_lookup_int(state, &retval);
1460 if (error || (retval != EEXIST)) {
1461 if (error == 0)
1462 error = retval;
1463 goto out;
1464 }
1465
1466 /*
1467 * If there is an out-of-line value, de-allocate the blocks.
1468 * This is done before we remove the attribute so that we don't
1469 * overflow the maximum size of a transaction and/or hit a deadlock.
1470 */
1471 blk = &state->path.blk[ state->path.active-1 ];
1472 ASSERT(blk->bp != NULL);
1473 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1474 if (args->rmtblkno > 0) {
1475 /*
1476 * Fill in disk block numbers in the state structure
1477 * so that we can get the buffers back after we commit
1478 * several transactions in the following calls.
1479 */
1480 error = xfs_attr_fillstate(state);
1481 if (error)
1482 goto out;
1483
1484 /*
1485 * Mark the attribute as INCOMPLETE, then bunmapi() the
1486 * remote value.
1487 */
1488 error = xfs_attr_leaf_setflag(args);
1489 if (error)
1490 goto out;
1491 error = xfs_attr_rmtval_remove(args);
1492 if (error)
1493 goto out;
1494
1495 /*
1496 * Refill the state structure with buffers, the prior calls
1497 * released our buffers.
1498 */
1499 error = xfs_attr_refillstate(state);
1500 if (error)
1501 goto out;
1502 }
1503
1504 /*
1505 * Remove the name and update the hashvals in the tree.
1506 */
1507 blk = &state->path.blk[ state->path.active-1 ];
1508 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1509 retval = xfs_attr_leaf_remove(blk->bp, args);
1510 xfs_da_fixhashpath(state, &state->path);
1511
1512 /*
1513 * Check to see if the tree needs to be collapsed.
1514 */
1515 if (retval && (state->path.active > 1)) {
1516 XFS_BMAP_INIT(args->flist, args->firstblock);
1517 error = xfs_da_join(state);
1518 if (!error) {
1519 error = xfs_bmap_finish(&args->trans, args->flist,
1520 *args->firstblock, &committed);
1521 }
1522 if (error) {
1523 ASSERT(committed);
1524 args->trans = NULL;
1525 xfs_bmap_cancel(args->flist);
1526 goto out;
1527 }
1528
1529 /*
1530 * bmap_finish() may have committed the last trans and started
1531 * a new one. We need the inode to be in all transactions.
1532 */
1533 if (committed) {
1534 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1535 xfs_trans_ihold(args->trans, dp);
1536 }
1537
1538 /*
1539 * Commit the Btree join operation and start a new trans.
1540 */
1541 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
1542 goto out;
1543 }
1544
1545 /*
1546 * If the result is small enough, push it all into the inode.
1547 */
1548 if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
1549 /*
1550 * Have to get rid of the copy of this dabuf in the state.
1551 */
1552 ASSERT(state->path.active == 1);
1553 ASSERT(state->path.blk[0].bp);
1554 xfs_da_buf_done(state->path.blk[0].bp);
1555 state->path.blk[0].bp = NULL;
1556
1557 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
1558 XFS_ATTR_FORK);
1559 if (error)
1560 goto out;
1561 ASSERT(INT_GET(((xfs_attr_leafblock_t *)
1562 bp->data)->hdr.info.magic, ARCH_CONVERT)
1563 == XFS_ATTR_LEAF_MAGIC);
1564
1565 if (xfs_attr_shortform_allfit(bp, dp)) {
1566 XFS_BMAP_INIT(args->flist, args->firstblock);
1567 error = xfs_attr_leaf_to_shortform(bp, args);
1568 /* bp is gone due to xfs_da_shrink_inode */
1569 if (!error) {
1570 error = xfs_bmap_finish(&args->trans,
1571 args->flist,
1572 *args->firstblock,
1573 &committed);
1574 }
1575 if (error) {
1576 ASSERT(committed);
1577 args->trans = NULL;
1578 xfs_bmap_cancel(args->flist);
1579 goto out;
1580 }
1581
1582 /*
1583 * bmap_finish() may have committed the last trans
1584 * and started a new one. We need the inode to be
1585 * in all transactions.
1586 */
1587 if (committed) {
1588 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
1589 xfs_trans_ihold(args->trans, dp);
1590 }
1591 } else
1592 xfs_da_brelse(args->trans, bp);
1593 }
1594 error = 0;
1595
1596out:
1597 xfs_da_state_free(state);
1598 return(error);
1599}
1600
1601/*
1602 * Fill in the disk block numbers in the state structure for the buffers
1603 * that are attached to the state structure.
1604 * This is done so that we can quickly reattach ourselves to those buffers
1605 * after some set of transaction commits has released these buffers.
1606 */
1607STATIC int
1608xfs_attr_fillstate(xfs_da_state_t *state)
1609{
1610 xfs_da_state_path_t *path;
1611 xfs_da_state_blk_t *blk;
1612 int level;
1613
1614 /*
1615 * Roll down the "path" in the state structure, storing the on-disk
1616 * block number for those buffers in the "path".
1617 */
1618 path = &state->path;
1619 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1620 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1621 if (blk->bp) {
1622 blk->disk_blkno = xfs_da_blkno(blk->bp);
1623 xfs_da_buf_done(blk->bp);
1624 blk->bp = NULL;
1625 } else {
1626 blk->disk_blkno = 0;
1627 }
1628 }
1629
1630 /*
1631 * Roll down the "altpath" in the state structure, storing the on-disk
1632 * block number for those buffers in the "altpath".
1633 */
1634 path = &state->altpath;
1635 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1636 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1637 if (blk->bp) {
1638 blk->disk_blkno = xfs_da_blkno(blk->bp);
1639 xfs_da_buf_done(blk->bp);
1640 blk->bp = NULL;
1641 } else {
1642 blk->disk_blkno = 0;
1643 }
1644 }
1645
1646 return(0);
1647}
1648
1649/*
1650 * Reattach the buffers to the state structure based on the disk block
1651 * numbers stored in the state structure.
1652 * This is done after some set of transaction commits has released those
1653 * buffers from our grip.
1654 */
1655STATIC int
1656xfs_attr_refillstate(xfs_da_state_t *state)
1657{
1658 xfs_da_state_path_t *path;
1659 xfs_da_state_blk_t *blk;
1660 int level, error;
1661
1662 /*
1663 * Roll down the "path" in the state structure, reattaching buffers
1664 * from the on-disk block numbers stored in the "path".
1665 */
1666 path = &state->path;
1667 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1668 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1669 if (blk->disk_blkno) {
1670 error = xfs_da_read_buf(state->args->trans,
1671 state->args->dp,
1672 blk->blkno, blk->disk_blkno,
1673 &blk->bp, XFS_ATTR_FORK);
1674 if (error)
1675 return(error);
1676 } else {
1677 blk->bp = NULL;
1678 }
1679 }
1680
1681 /*
1682 * Roll down the "altpath" in the state structure, reattaching buffers
1683 * from the on-disk block numbers stored in the "altpath".
1684 */
1685 path = &state->altpath;
1686 ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1687 for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
1688 if (blk->disk_blkno) {
1689 error = xfs_da_read_buf(state->args->trans,
1690 state->args->dp,
1691 blk->blkno, blk->disk_blkno,
1692 &blk->bp, XFS_ATTR_FORK);
1693 if (error)
1694 return(error);
1695 } else {
1696 blk->bp = NULL;
1697 }
1698 }
1699
1700 return(0);
1701}
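/*
 * Editorial sketch (assumption, not part of the original source):
 * fillstate/refillstate are meant to bracket a transaction commit,
 * since committing releases the dabuf's held in the state.  A caller
 * would do, roughly (error handling omitted):
 *
 *	error = xfs_attr_fillstate(state);		drop bp's, keep disk blknos
 *	error = xfs_attr_rolltrans(&args->trans, dp);	commit, start new trans
 *	error = xfs_attr_refillstate(state);		re-read bp's from disk blknos
 *
 * so that no stale buffer pointers survive the commit.
 */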
1702
1703/*
1704 * Look up a name in a node attribute list.
1705 *
1706 * This routine gets called for any attribute fork that has more than one
1707 * block, i.e. both true Btree attr lists and single-leaf-blocks with
1708 * "remote" values taking up more blocks.
1709 */
1710int
1711xfs_attr_node_get(xfs_da_args_t *args)
1712{
1713 xfs_da_state_t *state;
1714 xfs_da_state_blk_t *blk;
1715 int error, retval;
1716 int i;
1717
1718 state = xfs_da_state_alloc();
1719 state->args = args;
1720 state->mp = args->dp->i_mount;
1721 state->blocksize = state->mp->m_sb.sb_blocksize;
1722 state->node_ents = state->mp->m_attr_node_ents;
1723
1724 /*
1725 * Search to see if name exists, and get back a pointer to it.
1726 */
1727 error = xfs_da_node_lookup_int(state, &retval);
1728 if (error) {
1729 retval = error;
1730 } else if (retval == EEXIST) {
1731 blk = &state->path.blk[ state->path.active-1 ];
1732 ASSERT(blk->bp != NULL);
1733 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
1734
1735 /*
1736 * Get the value, local or "remote"
1737 */
1738 retval = xfs_attr_leaf_getvalue(blk->bp, args);
1739 if (!retval && (args->rmtblkno > 0)
1740 && !(args->flags & ATTR_KERNOVAL)) {
1741 retval = xfs_attr_rmtval_get(args);
1742 }
1743 }
1744
1745 /*
1746 * If not in a transaction, we have to release all the buffers.
1747 */
1748 for (i = 0; i < state->path.active; i++) {
1749 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1750 state->path.blk[i].bp = NULL;
1751 }
1752
1753 xfs_da_state_free(state);
1754 return(retval);
1755}
1756
1757STATIC int /* error */
1758xfs_attr_node_list(xfs_attr_list_context_t *context)
1759{
1760 attrlist_cursor_kern_t *cursor;
1761 xfs_attr_leafblock_t *leaf;
1762 xfs_da_intnode_t *node;
1763 xfs_da_node_entry_t *btree;
1764 int error, i;
1765 xfs_dabuf_t *bp;
1766
1767 cursor = context->cursor;
1768 cursor->initted = 1;
1769
1770 /*
1771 * Do all sorts of validation on the passed-in cursor structure.
1772 * If anything is amiss, ignore the cursor and look up the hashval
1773 * starting from the btree root.
1774 */
1775 bp = NULL;
1776 if (cursor->blkno > 0) {
1777 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
1778 &bp, XFS_ATTR_FORK);
1779 if ((error != 0) && (error != EFSCORRUPTED))
1780 return(error);
1781 if (bp) {
1782 node = bp->data;
1783 switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
1784 case XFS_DA_NODE_MAGIC:
1785 xfs_attr_trace_l_cn("wrong blk", context, node);
1786 xfs_da_brelse(NULL, bp);
1787 bp = NULL;
1788 break;
1789 case XFS_ATTR_LEAF_MAGIC:
1790 leaf = bp->data;
1791 if (cursor->hashval >
1792 INT_GET(leaf->entries[
1793 INT_GET(leaf->hdr.count,
1794 ARCH_CONVERT)-1].hashval,
1795 ARCH_CONVERT)) {
1796 xfs_attr_trace_l_cl("wrong blk",
1797 context, leaf);
1798 xfs_da_brelse(NULL, bp);
1799 bp = NULL;
1800 } else if (cursor->hashval <=
1801 INT_GET(leaf->entries[0].hashval,
1802 ARCH_CONVERT)) {
1803 xfs_attr_trace_l_cl("maybe wrong blk",
1804 context, leaf);
1805 xfs_da_brelse(NULL, bp);
1806 bp = NULL;
1807 }
1808 break;
1809 default:
1810 xfs_attr_trace_l_c("wrong blk - ??", context);
1811 xfs_da_brelse(NULL, bp);
1812 bp = NULL;
1813 }
1814 }
1815 }
1816
1817 /*
1818 * We did not find what we expected given the cursor's contents,
1819 * so we start from the top and work down based on the hash value.
1820 * Note that the start of a node block is the same as that of a leaf block.
1821 */
1822 if (bp == NULL) {
1823 cursor->blkno = 0;
1824 for (;;) {
1825 error = xfs_da_read_buf(NULL, context->dp,
1826 cursor->blkno, -1, &bp,
1827 XFS_ATTR_FORK);
1828 if (error)
1829 return(error);
1830 if (unlikely(bp == NULL)) {
1831 XFS_ERROR_REPORT("xfs_attr_node_list(2)",
1832 XFS_ERRLEVEL_LOW,
1833 context->dp->i_mount);
1834 return(XFS_ERROR(EFSCORRUPTED));
1835 }
1836 node = bp->data;
1837 if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
1838 == XFS_ATTR_LEAF_MAGIC)
1839 break;
1840 if (unlikely(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
1841 != XFS_DA_NODE_MAGIC)) {
1842 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
1843 XFS_ERRLEVEL_LOW,
1844 context->dp->i_mount,
1845 node);
1846 xfs_da_brelse(NULL, bp);
1847 return(XFS_ERROR(EFSCORRUPTED));
1848 }
1849 btree = node->btree;
1850 for (i = 0;
1851 i < INT_GET(node->hdr.count, ARCH_CONVERT);
1852 btree++, i++) {
1853 if (cursor->hashval
1854 <= INT_GET(btree->hashval,
1855 ARCH_CONVERT)) {
1856 cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
1857 xfs_attr_trace_l_cb("descending",
1858 context, btree);
1859 break;
1860 }
1861 }
1862 if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
1863 xfs_da_brelse(NULL, bp);
1864 return(0);
1865 }
1866 xfs_da_brelse(NULL, bp);
1867 }
1868 }
1869 ASSERT(bp != NULL);
1870
1871 /*
1872 * Roll upward through the blocks, processing each leaf block in
1873 * order. As long as there is space in the result buffer, keep
1874 * adding the information.
1875 */
1876 for (;;) {
1877 leaf = bp->data;
1878 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1879 != XFS_ATTR_LEAF_MAGIC)) {
1880 XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
1881 XFS_ERRLEVEL_LOW,
1882 context->dp->i_mount, leaf);
1883 xfs_da_brelse(NULL, bp);
1884 return(XFS_ERROR(EFSCORRUPTED));
1885 }
1886 error = xfs_attr_leaf_list_int(bp, context);
1887 if (error || !leaf->hdr.info.forw)
1888 break; /* not really an error, buffer full or EOF */
1889 cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
1890 xfs_da_brelse(NULL, bp);
1891 error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
1892 &bp, XFS_ATTR_FORK);
1893 if (error)
1894 return(error);
1895 if (unlikely((bp == NULL))) {
1896 XFS_ERROR_REPORT("xfs_attr_node_list(5)",
1897 XFS_ERRLEVEL_LOW,
1898 context->dp->i_mount);
1899 return(XFS_ERROR(EFSCORRUPTED));
1900 }
1901 }
1902 xfs_da_brelse(NULL, bp);
1903 return(0);
1904}
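/*
 * Editorial sketch (assumption, not part of the original source): the
 * cursor updated above is what lets a listing resume where it stopped.
 * User level keeps calling attr_list() with the same cursor while the
 * returned attrlist_t says there is more, roughly:
 *
 *	do {
 *		error = attr_list(path, buf, bufsize, flags, &cursor);
 *		...consume ((attrlist_t *)buf)->al_count entries...
 *	} while (!error && ((attrlist_t *)buf)->al_more);
 */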
1905
1906
1907/*========================================================================
1908 * External routines for manipulating out-of-line attribute values.
1909 *========================================================================*/
1910
1911/*
1912 * Read the value associated with an attribute from the out-of-line buffer
1913 * that we stored it in.
1914 */
1915STATIC int
1916xfs_attr_rmtval_get(xfs_da_args_t *args)
1917{
1918 xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
1919 xfs_mount_t *mp;
1920 xfs_daddr_t dblkno;
1921 xfs_caddr_t dst;
1922 xfs_buf_t *bp;
1923 int nmap, error, tmp, valuelen, blkcnt, i;
1924 xfs_dablk_t lblkno;
1925
1926 ASSERT(!(args->flags & ATTR_KERNOVAL));
1927
1928 mp = args->dp->i_mount;
1929 dst = args->value;
1930 valuelen = args->valuelen;
1931 lblkno = args->rmtblkno;
1932 while (valuelen > 0) {
1933 nmap = ATTR_RMTVALUE_MAPSIZE;
1934 error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
1935 args->rmtblkcnt,
1936 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
1937 NULL, 0, map, &nmap, NULL);
1938 if (error)
1939 return(error);
1940 ASSERT(nmap >= 1);
1941
1942 for (i = 0; (i < nmap) && (valuelen > 0); i++) {
1943 ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
1944 (map[i].br_startblock != HOLESTARTBLOCK));
1945 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
1946 blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
1947 error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
1948 blkcnt, XFS_BUF_LOCK, &bp);
1949 if (error)
1950 return(error);
1951
1952 tmp = (valuelen < XFS_BUF_SIZE(bp))
1953 ? valuelen : XFS_BUF_SIZE(bp);
1954 xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
1955 xfs_buf_relse(bp);
1956 dst += tmp;
1957 valuelen -= tmp;
1958
1959 lblkno += map[i].br_blockcount;
1960 }
1961 }
1962 ASSERT(valuelen == 0);
1963 return(0);
1964}
1965
1966/*
1967 * Write the value associated with an attribute into the out-of-line buffer
1968 * that we have defined for it.
1969 */
1970STATIC int
1971xfs_attr_rmtval_set(xfs_da_args_t *args)
1972{
1973 xfs_mount_t *mp;
1974 xfs_fileoff_t lfileoff;
1975 xfs_inode_t *dp;
1976 xfs_bmbt_irec_t map;
1977 xfs_daddr_t dblkno;
1978 xfs_caddr_t src;
1979 xfs_buf_t *bp;
1980 xfs_dablk_t lblkno;
1981 int blkcnt, valuelen, nmap, error, tmp, committed;
1982
1983 dp = args->dp;
1984 mp = dp->i_mount;
1985 src = args->value;
1986
1987 /*
1988 * Find a "hole" in the attribute address space large enough for
1989 * us to drop the new attribute's value into.
1990 */
1991 blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
1992 lfileoff = 0;
1993 error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
1994 XFS_ATTR_FORK);
1995 if (error) {
1996 return(error);
1997 }
1998 args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
1999 args->rmtblkcnt = blkcnt;
2000
2001 /*
2002 * Roll through the "value", allocating blocks on disk as required.
2003 */
2004 while (blkcnt > 0) {
2005 /*
2006 * Allocate a single extent, up to the size of the value.
2007 */
2008 XFS_BMAP_INIT(args->flist, args->firstblock);
2009 nmap = 1;
2010 error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
2011 blkcnt,
2012 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
2013 XFS_BMAPI_WRITE,
2014 args->firstblock, args->total, &map, &nmap,
2015 args->flist);
2016 if (!error) {
2017 error = xfs_bmap_finish(&args->trans, args->flist,
2018 *args->firstblock, &committed);
2019 }
2020 if (error) {
2021 ASSERT(committed);
2022 args->trans = NULL;
2023 xfs_bmap_cancel(args->flist);
2024 return(error);
2025 }
2026
2027 /*
2028 * bmap_finish() may have committed the last trans and started
2029 * a new one. We need the inode to be in all transactions.
2030 */
2031 if (committed) {
2032 xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
2033 xfs_trans_ihold(args->trans, dp);
2034 }
2035
2036 ASSERT(nmap == 1);
2037 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2038 (map.br_startblock != HOLESTARTBLOCK));
2039 lblkno += map.br_blockcount;
2040 blkcnt -= map.br_blockcount;
2041
2042 /*
2043 * Start the next trans in the chain.
2044 */
2045 if ((error = xfs_attr_rolltrans(&args->trans, dp)))
2046 return (error);
2047 }
2048
2049 /*
2050 * Roll through the "value", copying the attribute value to the
2051 * already-allocated blocks. Blocks are written synchronously
2052 * so that we can know they are all on disk before we turn off
2053 * the INCOMPLETE flag.
2054 */
2055 lblkno = args->rmtblkno;
2056 valuelen = args->valuelen;
2057 while (valuelen > 0) {
2058 /*
2059 * Try to remember where we decided to put the value.
2060 */
2061 XFS_BMAP_INIT(args->flist, args->firstblock);
2062 nmap = 1;
2063 error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
2064 args->rmtblkcnt,
2065 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2066 args->firstblock, 0, &map, &nmap, NULL);
2067 if (error) {
2068 return(error);
2069 }
2070 ASSERT(nmap == 1);
2071 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2072 (map.br_startblock != HOLESTARTBLOCK));
2073
2074 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
2075 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2076
2077 bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
2078 blkcnt, XFS_BUF_LOCK);
2079 ASSERT(bp);
2080 ASSERT(!XFS_BUF_GETERROR(bp));
2081
2082 tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
2083 XFS_BUF_SIZE(bp);
2084 xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
2085 if (tmp < XFS_BUF_SIZE(bp))
2086 xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
2087 if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
2088 return (error);
2089 }
2090 src += tmp;
2091 valuelen -= tmp;
2092
2093 lblkno += map.br_blockcount;
2094 }
2095 ASSERT(valuelen == 0);
2096 return(0);
2097}
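/*
 * Editorial worked example (assumption, not part of the original
 * source): for an 8192 byte value on a 4096 byte block filesystem,
 * XFS_B_TO_FSB(mp, 8192) == 2, so args->rmtblkcnt == 2.  If bmapi
 * returns one 2-block mapping, the loop above does a single 8192 byte
 * synchronous xfs_bwrite(); if the extent was split into two 1-block
 * mappings, it does two 4096 byte writes.  Either way all blocks are
 * on disk before the caller clears the INCOMPLETE flag.
 */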
2098
2099/*
2100 * Remove the value associated with an attribute by deleting the
2101 * out-of-line buffer that it is stored on.
2102 */
2103STATIC int
2104xfs_attr_rmtval_remove(xfs_da_args_t *args)
2105{
2106 xfs_mount_t *mp;
2107 xfs_bmbt_irec_t map;
2108 xfs_buf_t *bp;
2109 xfs_daddr_t dblkno;
2110 xfs_dablk_t lblkno;
2111 int valuelen, blkcnt, nmap, error, done, committed;
2112
2113 mp = args->dp->i_mount;
2114
2115 /*
2116 * Roll through the "value", invalidating the attribute value's
2117 * blocks.
2118 */
2119 lblkno = args->rmtblkno;
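	/* note (editorial): "valuelen" counts fs blocks here, not bytes */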
2120 valuelen = args->rmtblkcnt;
2121 while (valuelen > 0) {
2122 /*
2123 * Look up where the value was stored so its buffers can be invalidated.
2124 */
2125 XFS_BMAP_INIT(args->flist, args->firstblock);
2126 nmap = 1;
2127 error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
2128 args->rmtblkcnt,
2129 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2130 args->firstblock, 0, &map, &nmap,
2131 args->flist);
2132 if (error) {
2133 return(error);
2134 }
2135 ASSERT(nmap == 1);
2136 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2137 (map.br_startblock != HOLESTARTBLOCK));
2138
2139 dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
2140 blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2141
2142 /*
2143 * If the "remote" value is in the cache, remove it.
2144 */
2145 bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt,
2146 XFS_INCORE_TRYLOCK);
2147 if (bp) {
2148 XFS_BUF_STALE(bp);
2149 XFS_BUF_UNDELAYWRITE(bp);
2150 xfs_buf_relse(bp);
2151 bp = NULL;
2152 }
2153
2154 valuelen -= map.br_blockcount;
2155
2156 lblkno += map.br_blockcount;
2157 }
2158
2159 /*
2160 * Keep de-allocating extents until the remote-value region is gone.
2161 */
2162 lblkno = args->rmtblkno;
2163 blkcnt = args->rmtblkcnt;
2164 done = 0;
2165 while (!done) {
2166 XFS_BMAP_INIT(args->flist, args->firstblock);
2167 error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
2168 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2169 1, args->firstblock, args->flist, &done);
2170 if (!error) {
2171 error = xfs_bmap_finish(&args->trans, args->flist,
2172 *args->firstblock, &committed);
2173 }
2174 if (error) {
2175 ASSERT(committed);
2176 args->trans = NULL;
2177 xfs_bmap_cancel(args->flist);
2178 return(error);
2179 }
2180
2181 /*
2182 * bmap_finish() may have committed the last trans and started
2183 * a new one. We need the inode to be in all transactions.
2184 */
2185 if (committed) {
2186 xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
2187 xfs_trans_ihold(args->trans, args->dp);
2188 }
2189
2190 /*
2191 * Close out trans and start the next one in the chain.
2192 */
2193 if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
2194 return (error);
2195 }
2196 return(0);
2197}
2198
2199#if defined(XFS_ATTR_TRACE)
2200/*
2201 * Add a trace buffer entry for an attr_list context structure.
2202 */
2203void
2204xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
2205{
2206 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
2207 (__psunsigned_t)context->dp,
2208 (__psunsigned_t)context->cursor->hashval,
2209 (__psunsigned_t)context->cursor->blkno,
2210 (__psunsigned_t)context->cursor->offset,
2211 (__psunsigned_t)context->alist,
2212 (__psunsigned_t)context->bufsize,
2213 (__psunsigned_t)context->count,
2214 (__psunsigned_t)context->firstu,
2215 (__psunsigned_t)
2216 ((context->count > 0) &&
2217 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2218 ? (ATTR_ENTRY(context->alist,
2219 context->count-1)->a_valuelen)
2220 : 0,
2221 (__psunsigned_t)context->dupcnt,
2222 (__psunsigned_t)context->flags,
2223 (__psunsigned_t)NULL,
2224 (__psunsigned_t)NULL,
2225 (__psunsigned_t)NULL);
2226}
2227
2228/*
2229 * Add a trace buffer entry for a context structure and a Btree node.
2230 */
2231void
2232xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
2233 struct xfs_da_intnode *node)
2234{
2235 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
2236 (__psunsigned_t)context->dp,
2237 (__psunsigned_t)context->cursor->hashval,
2238 (__psunsigned_t)context->cursor->blkno,
2239 (__psunsigned_t)context->cursor->offset,
2240 (__psunsigned_t)context->alist,
2241 (__psunsigned_t)context->bufsize,
2242 (__psunsigned_t)context->count,
2243 (__psunsigned_t)context->firstu,
2244 (__psunsigned_t)
2245 ((context->count > 0) &&
2246 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2247 ? (ATTR_ENTRY(context->alist,
2248 context->count-1)->a_valuelen)
2249 : 0,
2250 (__psunsigned_t)context->dupcnt,
2251 (__psunsigned_t)context->flags,
2252 (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
2253 (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
2254 (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2255}
2256
2257/*
2258 * Add a trace buffer entry for a context structure and a Btree element.
2259 */
2260void
2261xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
2262 struct xfs_da_node_entry *btree)
2263{
2264 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
2265 (__psunsigned_t)context->dp,
2266 (__psunsigned_t)context->cursor->hashval,
2267 (__psunsigned_t)context->cursor->blkno,
2268 (__psunsigned_t)context->cursor->offset,
2269 (__psunsigned_t)context->alist,
2270 (__psunsigned_t)context->bufsize,
2271 (__psunsigned_t)context->count,
2272 (__psunsigned_t)context->firstu,
2273 (__psunsigned_t)
2274 ((context->count > 0) &&
2275 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2276 ? (ATTR_ENTRY(context->alist,
2277 context->count-1)->a_valuelen)
2278 : 0,
2279 (__psunsigned_t)context->dupcnt,
2280 (__psunsigned_t)context->flags,
2281 (__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT),
2282 (__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT),
2283 (__psunsigned_t)NULL);
2284}
2285
2286/*
2287 * Add a trace buffer entry for a context structure and a leaf block.
2288 */
2289void
2290xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
2291 struct xfs_attr_leafblock *leaf)
2292{
2293 xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
2294 (__psunsigned_t)context->dp,
2295 (__psunsigned_t)context->cursor->hashval,
2296 (__psunsigned_t)context->cursor->blkno,
2297 (__psunsigned_t)context->cursor->offset,
2298 (__psunsigned_t)context->alist,
2299 (__psunsigned_t)context->bufsize,
2300 (__psunsigned_t)context->count,
2301 (__psunsigned_t)context->firstu,
2302 (__psunsigned_t)
2303 ((context->count > 0) &&
2304 !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
2305 ? (ATTR_ENTRY(context->alist,
2306 context->count-1)->a_valuelen)
2307 : 0,
2308 (__psunsigned_t)context->dupcnt,
2309 (__psunsigned_t)context->flags,
2310 (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
2311 (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
2312 (__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2313}
2314
2315/*
2316 * Add a trace buffer entry for the arguments given to the routine,
2317 * generic form.
2318 */
2319void
2320xfs_attr_trace_enter(int type, char *where,
2321 __psunsigned_t a2, __psunsigned_t a3,
2322 __psunsigned_t a4, __psunsigned_t a5,
2323 __psunsigned_t a6, __psunsigned_t a7,
2324 __psunsigned_t a8, __psunsigned_t a9,
2325 __psunsigned_t a10, __psunsigned_t a11,
2326 __psunsigned_t a12, __psunsigned_t a13,
2327 __psunsigned_t a14, __psunsigned_t a15)
2328{
2329 ASSERT(xfs_attr_trace_buf);
2330 ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
2331 (void *)where,
2332 (void *)a2, (void *)a3, (void *)a4,
2333 (void *)a5, (void *)a6, (void *)a7,
2334 (void *)a8, (void *)a9, (void *)a10,
2335 (void *)a11, (void *)a12, (void *)a13,
2336 (void *)a14, (void *)a15);
2337}
2338#endif /* XFS_ATTR_TRACE */
2339
2340
2341/*========================================================================
2342 * System (pseudo) namespace attribute interface routines.
2343 *========================================================================*/
2344
2345STATIC int
2346posix_acl_access_set(
2347 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2348{
2349 return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
2350}
2351
2352STATIC int
2353posix_acl_access_remove(
2354 struct vnode *vp, char *name, int xflags)
2355{
2356 return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
2357}
2358
2359STATIC int
2360posix_acl_access_get(
2361 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2362{
2363 return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
2364}
2365
2366STATIC int
2367posix_acl_access_exists(
2368 vnode_t *vp)
2369{
2370 return xfs_acl_vhasacl_access(vp);
2371}
2372
2373STATIC int
2374posix_acl_default_set(
2375 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2376{
2377 return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
2378}
2379
2380STATIC int
2381posix_acl_default_get(
2382 vnode_t *vp, char *name, void *data, size_t size, int xflags)
2383{
2384 return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
2385}
2386
2387STATIC int
2388posix_acl_default_remove(
2389 struct vnode *vp, char *name, int xflags)
2390{
2391 return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
2392}
2393
2394STATIC int
2395posix_acl_default_exists(
2396 vnode_t *vp)
2397{
2398 return xfs_acl_vhasacl_default(vp);
2399}
2400
2401struct attrnames posix_acl_access = {
2402 .attr_name = "posix_acl_access",
2403 .attr_namelen = sizeof("posix_acl_access") - 1,
2404 .attr_get = posix_acl_access_get,
2405 .attr_set = posix_acl_access_set,
2406 .attr_remove = posix_acl_access_remove,
2407 .attr_exists = posix_acl_access_exists,
2408};
2409
2410struct attrnames posix_acl_default = {
2411 .attr_name = "posix_acl_default",
2412 .attr_namelen = sizeof("posix_acl_default") - 1,
2413 .attr_get = posix_acl_default_get,
2414 .attr_set = posix_acl_default_set,
2415 .attr_remove = posix_acl_default_remove,
2416 .attr_exists = posix_acl_default_exists,
2417};
2418
2419struct attrnames *attr_system_names[] =
2420 { &posix_acl_access, &posix_acl_default };
2421
2422
2423/*========================================================================
2424 * Namespace-prefix-style attribute name interface routines.
2425 *========================================================================*/
2426
2427STATIC int
2428attr_generic_set(
2429 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2430{
2431 int error;
2432
2433 VOP_ATTR_SET(vp, name, data, size, xflags, NULL, error);
2434 return -error;
2435}
2436
2437STATIC int
2438attr_generic_get(
2439 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2440{
2441 int error, asize = size;
2442
2443 VOP_ATTR_GET(vp, name, data, &asize, xflags, NULL, error);
2444 if (!error)
2445 return asize;
2446 return -error;
2447}
2448
2449STATIC int
2450attr_generic_remove(
2451 struct vnode *vp, char *name, int xflags)
2452{
2453 int error;
2454
2455 VOP_ATTR_REMOVE(vp, name, xflags, NULL, error);
2456 return -error;
2457}
2458
2459STATIC int
2460attr_generic_listadd(
2461 attrnames_t *prefix,
2462 attrnames_t *namesp,
2463 void *data,
2464 size_t size,
2465 ssize_t *result)
2466{
2467 char *p = data + *result;
2468
2469 *result += prefix->attr_namelen;
2470 *result += namesp->attr_namelen + 1;
2471 if (!size)
2472 return 0;
2473 if (*result > size)
2474 return -ERANGE;
2475 strcpy(p, prefix->attr_name);
2476 p += prefix->attr_namelen;
2477 strcpy(p, namesp->attr_name);
2478 p += namesp->attr_namelen + 1;
2479 return 0;
2480}
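/*
 * Editorial worked example (not part of the original source): for the
 * system namespace, listing posix_acl_access appends the bytes
 * "system.posix_acl_access\0" to the output and advances *result by
 * attr_system.attr_namelen (7) + posix_acl_access.attr_namelen (16)
 * + 1 == 24 bytes.  With size == 0, only the required length is
 * accumulated, letting callers size their buffer first.
 */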
2481
2482STATIC int
2483attr_system_list(
2484 struct vnode *vp,
2485 void *data,
2486 size_t size,
2487 ssize_t *result)
2488{
2489 attrnames_t *namesp;
2490 int i, error = 0;
2491
2492 for (i = 0; i < ATTR_SYSCOUNT; i++) {
2493 namesp = attr_system_names[i];
2494 if (!namesp->attr_exists || !namesp->attr_exists(vp))
2495 continue;
2496 error = attr_generic_listadd(&attr_system, namesp,
2497 data, size, result);
2498 if (error)
2499 break;
2500 }
2501 return error;
2502}
2503
2504int
2505attr_generic_list(
2506 struct vnode *vp, void *data, size_t size, int xflags, ssize_t *result)
2507{
2508 attrlist_cursor_kern_t cursor = { 0 };
2509 int error;
2510
2511 VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
2512 if (error > 0)
2513 return -error;
2514 *result = -error;
2515 return attr_system_list(vp, data, size, result);
2516}
2517
2518attrnames_t *
2519attr_lookup_namespace(
2520 char *name,
2521 struct attrnames **names,
2522 int nnames)
2523{
2524 int i;
2525
2526 for (i = 0; i < nnames; i++)
2527 if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen))
2528 return names[i];
2529 return NULL;
2530}
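/*
 * Editorial example (not part of the original source): with the tables
 * defined below,
 *
 *	attr_lookup_namespace("user.foo", attr_namespaces, ATTR_NAMECOUNT)
 *
 * matches on the "user." prefix and returns &attr_user; a name with no
 * recognized prefix returns NULL.
 */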
2531
2532/*
2533 * Some checks to prevent people abusing EAs to get over quota:
2534 * - Don't allow modifying user EAs on devices/symlinks;
2535 * - Don't allow modifying user EAs if sticky bit set;
2536 */
2537STATIC int
2538attr_user_capable(
2539 struct vnode *vp,
2540 cred_t *cred)
2541{
2542 struct inode *inode = LINVFS_GET_IP(vp);
2543
2544 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
2545 return -EPERM;
2546 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
2547 !capable(CAP_SYS_ADMIN))
2548 return -EPERM;
2549 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
2550 (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER))
2551 return -EPERM;
2552 return 0;
2553}
2554
2555STATIC int
2556attr_trusted_capable(
2557 struct vnode *vp,
2558 cred_t *cred)
2559{
2560 struct inode *inode = LINVFS_GET_IP(vp);
2561
2562 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
2563 return -EPERM;
2564 if (!capable(CAP_SYS_ADMIN))
2565 return -EPERM;
2566 return 0;
2567}
2568
2569STATIC int
2570attr_secure_capable(
2571 struct vnode *vp,
2572 cred_t *cred)
2573{
2574 return -ENOSECURITY;
2575}
2576
2577STATIC int
2578attr_system_set(
2579 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2580{
2581 attrnames_t *namesp;
2582 int error;
2583
2584 if (xflags & ATTR_CREATE)
2585 return -EINVAL;
2586
2587 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2588 if (!namesp)
2589 return -EOPNOTSUPP;
2590 error = namesp->attr_set(vp, name, data, size, xflags);
2591 if (!error)
2592 error = vn_revalidate(vp);
2593 return error;
2594}
2595
2596STATIC int
2597attr_system_get(
2598 struct vnode *vp, char *name, void *data, size_t size, int xflags)
2599{
2600 attrnames_t *namesp;
2601
2602 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2603 if (!namesp)
2604 return -EOPNOTSUPP;
2605 return namesp->attr_get(vp, name, data, size, xflags);
2606}
2607
2608STATIC int
2609attr_system_remove(
2610 struct vnode *vp, char *name, int xflags)
2611{
2612 attrnames_t *namesp;
2613
2614 namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
2615 if (!namesp)
2616 return -EOPNOTSUPP;
2617 return namesp->attr_remove(vp, name, xflags);
2618}
2619
2620struct attrnames attr_system = {
2621 .attr_name = "system.",
2622 .attr_namelen = sizeof("system.") - 1,
2623 .attr_flag = ATTR_SYSTEM,
2624 .attr_get = attr_system_get,
2625 .attr_set = attr_system_set,
2626 .attr_remove = attr_system_remove,
2627 .attr_capable = (attrcapable_t)fs_noerr,
2628};
2629
2630struct attrnames attr_trusted = {
2631 .attr_name = "trusted.",
2632 .attr_namelen = sizeof("trusted.") - 1,
2633 .attr_flag = ATTR_ROOT,
2634 .attr_get = attr_generic_get,
2635 .attr_set = attr_generic_set,
2636 .attr_remove = attr_generic_remove,
2637 .attr_capable = attr_trusted_capable,
2638};
2639
2640struct attrnames attr_secure = {
2641 .attr_name = "security.",
2642 .attr_namelen = sizeof("security.") - 1,
2643 .attr_flag = ATTR_SECURE,
2644 .attr_get = attr_generic_get,
2645 .attr_set = attr_generic_set,
2646 .attr_remove = attr_generic_remove,
2647 .attr_capable = attr_secure_capable,
2648};
2649
2650struct attrnames attr_user = {
2651 .attr_name = "user.",
2652 .attr_namelen = sizeof("user.") - 1,
2653 .attr_get = attr_generic_get,
2654 .attr_set = attr_generic_set,
2655 .attr_remove = attr_generic_remove,
2656 .attr_capable = attr_user_capable,
2657};
2658
2659struct attrnames *attr_namespaces[] =
2660 { &attr_system, &attr_trusted, &attr_secure, &attr_user };
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
new file mode 100644
index 000000000000..67cd0f5ac1a7
--- /dev/null
+++ b/fs/xfs/xfs_attr.h
@@ -0,0 +1,193 @@
1/*
2 * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_H__
33#define __XFS_ATTR_H__
34
35/*
36 * xfs_attr.h
37 *
38 * Large attribute lists are structured around Btrees where all the data
39 * elements are in the leaf nodes. Attribute names are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of an attribute name may not be unique, we may have duplicate keys.
42 * The internal links in the Btree are logical block offsets into the file.
43 *
44 * Small attribute lists use a different format and are packed as tightly
45 * as possible so as to fit into the literal area of the inode.
46 */
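/*
 * Editorial note (assumption, not part of the original header): the
 * Btree key for an attribute is derived as
 *
 *	args.hashval = xfs_da_hashname(name, namelen);
 *
 * and since different names can hash to the same value, lookups compare
 * the stored names among equal-hashval entries (the attrlist cursor
 * below keeps an "offset in list of equal-hashvals" for the same
 * reason).
 */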
47
48/*========================================================================
49 * External interfaces
50 *========================================================================*/
51
52struct cred;
53struct vnode;
54
55typedef int (*attrset_t)(struct vnode *, char *, void *, size_t, int);
56typedef int (*attrget_t)(struct vnode *, char *, void *, size_t, int);
57typedef int (*attrremove_t)(struct vnode *, char *, int);
58typedef int (*attrexists_t)(struct vnode *);
59typedef int (*attrcapable_t)(struct vnode *, struct cred *);
60
61typedef struct attrnames {
62 char * attr_name;
63 unsigned int attr_namelen;
64 unsigned int attr_flag;
65 attrget_t attr_get;
66 attrset_t attr_set;
67 attrremove_t attr_remove;
68 attrexists_t attr_exists;
69 attrcapable_t attr_capable;
70} attrnames_t;
71
72#define ATTR_NAMECOUNT 4
73extern struct attrnames attr_user;
74extern struct attrnames attr_secure;
75extern struct attrnames attr_system;
76extern struct attrnames attr_trusted;
77extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
78
79#define ATTR_SYSCOUNT 2
80extern struct attrnames posix_acl_access;
81extern struct attrnames posix_acl_default;
82extern struct attrnames *attr_system_names[ATTR_SYSCOUNT];
83
84extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
85extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *);
86
87#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
88#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
89#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */
90#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
91#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
92#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
93#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */
94
95#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */
96#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
97#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
98#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */
99
100#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */
101#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */
102#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS)
103
104/*
105 * The maximum size (into the kernel or returned from the kernel) of an
106 * attribute value or the buffer used for an attr_list() call. Larger
107 * sizes will result in an ERANGE return code.
108 */
109#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
110
111/*
112 * Define how lists of attribute names are returned to the user from
113 * the attr_list() call. A large, 32bit aligned, buffer is passed in
114 * along with its size. We put an array of offsets at the top that each
115 * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
116 */
117typedef struct attrlist {
118 __s32 al_count; /* number of entries in attrlist */
119 __s32 al_more; /* T/F: more attrs (do call again) */
120 __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */
121} attrlist_t;
122
123/*
124 * Show the interesting info about one attribute. This is what the
125 * al_offset[i] entry points to.
126 */
127typedef struct attrlist_ent { /* data from attr_list() */
128 __u32 a_valuelen; /* number bytes in value of attr */
129 char a_name[1]; /* attr name (NULL terminated) */
130} attrlist_ent_t;
131
132/*
133 * Given a pointer to the (char*) buffer containing the attr_list() result,
134 * and an index, return a pointer to the indicated attribute in the buffer.
135 */
136#define ATTR_ENTRY(buffer, index) \
137 ((attrlist_ent_t *) \
138 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
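/*
 * Editorial usage sketch (not part of the original header): walking an
 * attr_list() result buffer with ATTR_ENTRY.
 *
 *	attrlist_t *al = (attrlist_t *)buffer;
 *	attrlist_ent_t *aep;
 *	int i;
 *
 *	for (i = 0; i < al->al_count; i++) {
 *		aep = ATTR_ENTRY(buffer, i);
 *		use(aep->a_name, aep->a_valuelen);
 *	}
 *
 * "buffer" and use() are placeholders; entries are packed from the
 * bottom of the buffer while al_offset[] grows from the top.
 */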
139
140/*
141 * Multi-attribute operation vector.
142 */
143typedef struct attr_multiop {
144 int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */
145 int am_error; /* [out arg] result of this sub-op (an errno) */
146 char *am_attrname; /* attribute name to work with */
147 char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */
148 int am_length; /* [in/out arg] length of value */
149 int am_flags; /* bitwise OR of attr API flags defined above */
150} attr_multiop_t;
151
152#define ATTR_OP_GET 1 /* return the indicated attr's value */
153#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
154#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
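/*
 * Editorial example (assumption; the multi-op entry point that consumes
 * this vector is not in this file): a batch that sets one attribute and
 * removes another could be described as
 *
 *	attr_multiop_t ops[2] = {
 *		{ .am_opcode = ATTR_OP_SET, .am_attrname = "user.a",
 *		  .am_attrvalue = val, .am_length = vallen },
 *		{ .am_opcode = ATTR_OP_REMOVE, .am_attrname = "user.b" },
 *	};
 *
 * with each op's result reported back through its am_error field.
 */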
155
156/*
157 * Kernel-internal version of the attrlist cursor.
158 */
159typedef struct attrlist_cursor_kern {
160 __u32 hashval; /* hash value of next entry to add */
161 __u32 blkno; /* block containing entry (suggestion) */
162 __u32 offset; /* offset in list of equal-hashvals */
163 __u16 pad1; /* padding to match user-level */
164 __u8 pad2; /* padding to match user-level */
165 __u8 initted; /* T/F: cursor has been initialized */
166} attrlist_cursor_kern_t;
167
168
169/*========================================================================
170 * Function prototypes for the kernel.
171 *========================================================================*/
172
173struct xfs_inode;
174struct attrlist_cursor_kern;
175struct xfs_da_args;
176
177/*
178 * Overall external interface routines.
179 */
180int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *);
181int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *);
182int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *);
183int xfs_attr_list(bhv_desc_t *, char *, int, int,
184 struct attrlist_cursor_kern *, struct cred *);
185int xfs_attr_inactive(struct xfs_inode *dp);
186
187int xfs_attr_node_get(struct xfs_da_args *);
188int xfs_attr_leaf_get(struct xfs_da_args *);
189int xfs_attr_shortform_getvalue(struct xfs_da_args *);
190int xfs_attr_fetch(struct xfs_inode *, char *, int,
191 char *, int *, int, struct cred *);
192
193#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..b11256e58bf4
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -0,0 +1,3050 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32/*
33 * xfs_attr_leaf.c
34 *
35 * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_ag.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_alloc.h"
55#include "xfs_btree.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode_item.h"
61#include "xfs_inode.h"
62#include "xfs_bmap.h"
63#include "xfs_da_btree.h"
64#include "xfs_attr.h"
65#include "xfs_attr_leaf.h"
66#include "xfs_error.h"
67#include "xfs_bit.h"
68
69/*
70 * xfs_attr_leaf.c
71 *
72 * Routines to implement leaf blocks of attributes as Btrees of hashed names.
73 */
74
75/*========================================================================
76 * Function prototypes for the kernel.
77 *========================================================================*/
78
79/*
80 * Routines used for growing the Btree.
81 */
82STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
83 int freemap_index);
84STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
85STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
86 xfs_da_state_blk_t *blk1,
87 xfs_da_state_blk_t *blk2);
88STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
89 xfs_da_state_blk_t *leaf_blk_1,
90 xfs_da_state_blk_t *leaf_blk_2,
91 int *number_entries_in_blk1,
92 int *number_usedbytes_in_blk1);
93
94/*
95 * Utility routines.
96 */
97STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
98 int src_start,
99 xfs_attr_leafblock_t *dst_leaf,
100 int dst_start, int move_count,
101 xfs_mount_t *mp);
102
103
104/*========================================================================
105 * External routines when attribute fork size < XFS_LITINO(mp).
106 *========================================================================*/
107
108/*
109 * Create the initial contents of a shortform attribute list.
110 */
111int
112xfs_attr_shortform_create(xfs_da_args_t *args)
113{
114 xfs_attr_sf_hdr_t *hdr;
115 xfs_inode_t *dp;
116 xfs_ifork_t *ifp;
117
118 dp = args->dp;
119 ASSERT(dp != NULL);
120 ifp = dp->i_afp;
121 ASSERT(ifp != NULL);
122 ASSERT(ifp->if_bytes == 0);
123 if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
124 ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */
125 dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
126 ifp->if_flags |= XFS_IFINLINE;
127 } else {
128 ASSERT(ifp->if_flags & XFS_IFINLINE);
129 }
130 xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
131 hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
132 hdr->count = 0;
133 INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr));
134 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
135 return(0);
136}
137
138/*
139 * Add a name/value pair to the shortform attribute list.
140 * Overflow from the inode has already been checked for.
141 */
142int
143xfs_attr_shortform_add(xfs_da_args_t *args)
144{
145 xfs_attr_shortform_t *sf;
146 xfs_attr_sf_entry_t *sfe;
147 int i, offset, size;
148 xfs_inode_t *dp;
149 xfs_ifork_t *ifp;
150
151 dp = args->dp;
152 ifp = dp->i_afp;
153 ASSERT(ifp->if_flags & XFS_IFINLINE);
154 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
155 sfe = &sf->list[0];
156 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
157 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
158 if (sfe->namelen != args->namelen)
159 continue;
160 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
161 continue;
162 if (((args->flags & ATTR_SECURE) != 0) !=
163 ((sfe->flags & XFS_ATTR_SECURE) != 0))
164 continue;
165 if (((args->flags & ATTR_ROOT) != 0) !=
166 ((sfe->flags & XFS_ATTR_ROOT) != 0))
167 continue;
168 return(XFS_ERROR(EEXIST));
169 }
170
171 offset = (char *)sfe - (char *)sf;
172 size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
173 xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
174 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
175 sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
176
177 sfe->namelen = args->namelen;
178 INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen);
179 sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
180 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
181 memcpy(sfe->nameval, args->name, args->namelen);
182 memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
183 INT_MOD(sf->hdr.count, ARCH_CONVERT, 1);
184 INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size);
185 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
186
187 return(0);
188}
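/*
 * Editorial worked example (not part of the original source): a
 * shortform entry costs sizeof(struct xfs_attr_sf_entry)-1 + namelen +
 * valuelen bytes (the same accounting xfs_attr_shortform_allfit() uses
 * below), so adding name "foo" (namelen 3) with a 10 byte value grows
 * the fork by 3 + 3 + 10 == 16 bytes and bumps both hdr.count and
 * hdr.totsize accordingly.
 */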
189
190/*
191 * Remove a name from the shortform attribute list structure.
192 */
193int
194xfs_attr_shortform_remove(xfs_da_args_t *args)
195{
196 xfs_attr_shortform_t *sf;
197 xfs_attr_sf_entry_t *sfe;
198 int base, size=0, end, totsize, i;
199 xfs_inode_t *dp;
200
201 /*
202 * Remove the attribute.
203 */
204 dp = args->dp;
205 base = sizeof(xfs_attr_sf_hdr_t);
206 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
207 sfe = &sf->list[0];
208 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
209 sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
210 base += size, i++) {
211 size = XFS_ATTR_SF_ENTSIZE(sfe);
212 if (sfe->namelen != args->namelen)
213 continue;
214 if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
215 continue;
216 if (((args->flags & ATTR_SECURE) != 0) !=
217 ((sfe->flags & XFS_ATTR_SECURE) != 0))
218 continue;
219 if (((args->flags & ATTR_ROOT) != 0) !=
220 ((sfe->flags & XFS_ATTR_ROOT) != 0))
221 continue;
222 break;
223 }
224 if (i == INT_GET(sf->hdr.count, ARCH_CONVERT))
225 return(XFS_ERROR(ENOATTR));
226
227 end = base + size;
228 totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
229 if (end != totsize) {
230 memmove(&((char *)sf)[base], &((char *)sf)[end],
231 totsize - end);
232 }
233 INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
234 INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size);
235 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
236 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
237
238 return(0);
239}
240
241/*
242 * Look up a name in a shortform attribute list structure.
243 */
244/*ARGSUSED*/
245int
246xfs_attr_shortform_lookup(xfs_da_args_t *args)
247{
248 xfs_attr_shortform_t *sf;
249 xfs_attr_sf_entry_t *sfe;
250 int i;
251 xfs_ifork_t *ifp;
252
253 ifp = args->dp->i_afp;
254 ASSERT(ifp->if_flags & XFS_IFINLINE);
255 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
256 sfe = &sf->list[0];
257 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
258 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
259 if (sfe->namelen != args->namelen)
260 continue;
261 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
262 continue;
263 if (((args->flags & ATTR_SECURE) != 0) !=
264 ((sfe->flags & XFS_ATTR_SECURE) != 0))
265 continue;
266 if (((args->flags & ATTR_ROOT) != 0) !=
267 ((sfe->flags & XFS_ATTR_ROOT) != 0))
268 continue;
269 return(XFS_ERROR(EEXIST));
270 }
271 return(XFS_ERROR(ENOATTR));
272}
273
274/*
275 * Look up a name in a shortform attribute list structure and return its value.
276 */
277/*ARGSUSED*/
278int
279xfs_attr_shortform_getvalue(xfs_da_args_t *args)
280{
281 xfs_attr_shortform_t *sf;
282 xfs_attr_sf_entry_t *sfe;
283 int i;
284
285 ASSERT(args->dp->i_afp->if_flags & XFS_IFINLINE);
286 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
287 sfe = &sf->list[0];
288 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
289 sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
290 if (sfe->namelen != args->namelen)
291 continue;
292 if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
293 continue;
294 if (((args->flags & ATTR_SECURE) != 0) !=
295 ((sfe->flags & XFS_ATTR_SECURE) != 0))
296 continue;
297 if (((args->flags & ATTR_ROOT) != 0) !=
298 ((sfe->flags & XFS_ATTR_ROOT) != 0))
299 continue;
300 if (args->flags & ATTR_KERNOVAL) {
301 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
302 return(XFS_ERROR(EEXIST));
303 }
304 if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) {
305 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
306 return(XFS_ERROR(ERANGE));
307 }
308 args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
309 memcpy(args->value, &sfe->nameval[args->namelen],
310 args->valuelen);
311 return(XFS_ERROR(EEXIST));
312 }
313 return(XFS_ERROR(ENOATTR));
314}
315
316/*
317 * Convert from using the shortform to the leaf.
318 */
319int
320xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
321{
322 xfs_inode_t *dp;
323 xfs_attr_shortform_t *sf;
324 xfs_attr_sf_entry_t *sfe;
325 xfs_da_args_t nargs;
326 char *tmpbuffer;
327 int error, i, size;
328 xfs_dablk_t blkno;
329 xfs_dabuf_t *bp;
330 xfs_ifork_t *ifp;
331
332 dp = args->dp;
333 ifp = dp->i_afp;
334 sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
335 size = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
336 tmpbuffer = kmem_alloc(size, KM_SLEEP);
337 ASSERT(tmpbuffer != NULL);
338 memcpy(tmpbuffer, ifp->if_u1.if_data, size);
339 sf = (xfs_attr_shortform_t *)tmpbuffer;
340
341 xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
342 bp = NULL;
343 error = xfs_da_grow_inode(args, &blkno);
344 if (error) {
345 /*
346 * If we hit an IO error in the middle of the transaction inside
347 * grow_inode(), we may have inconsistent data. Bail out.
348 */
349 if (error == EIO)
350 goto out;
351 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
352 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
353 goto out;
354 }
355
356 ASSERT(blkno == 0);
357 error = xfs_attr_leaf_create(args, blkno, &bp);
358 if (error) {
359 error = xfs_da_shrink_inode(args, 0, bp);
360 bp = NULL;
361 if (error)
362 goto out;
363 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
364 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
365 goto out;
366 }
367
368 memset((char *)&nargs, 0, sizeof(nargs));
369 nargs.dp = dp;
370 nargs.firstblock = args->firstblock;
371 nargs.flist = args->flist;
372 nargs.total = args->total;
373 nargs.whichfork = XFS_ATTR_FORK;
374 nargs.trans = args->trans;
375 nargs.oknoent = 1;
376
377 sfe = &sf->list[0];
378 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
379 nargs.name = (char *)sfe->nameval;
380 nargs.namelen = sfe->namelen;
381 nargs.value = (char *)&sfe->nameval[nargs.namelen];
382 nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
383 nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
384 sfe->namelen);
385 nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
386 ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
387 error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
388 ASSERT(error == ENOATTR);
389 error = xfs_attr_leaf_add(bp, &nargs);
390 ASSERT(error != ENOSPC);
391 if (error)
392 goto out;
393 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
394 }
395 error = 0;
396
397out:
398 if(bp)
399 xfs_da_buf_done(bp);
400 kmem_free(tmpbuffer, size);
401 return(error);
402}
403
404STATIC int
405xfs_attr_shortform_compare(const void *a, const void *b)
406{
407 xfs_attr_sf_sort_t *sa, *sb;
408
409 sa = (xfs_attr_sf_sort_t *)a;
410 sb = (xfs_attr_sf_sort_t *)b;
411 if (INT_GET(sa->hash, ARCH_CONVERT)
412 < INT_GET(sb->hash, ARCH_CONVERT)) {
413 return(-1);
414 } else if (INT_GET(sa->hash, ARCH_CONVERT)
415 > INT_GET(sb->hash, ARCH_CONVERT)) {
416 return(1);
417 } else {
418 return(sa->entno - sb->entno);
419 }
420}
421
422/*
423 * Copy out entries of shortform attribute lists for attr_list().
424 * Shortform attribute lists are not stored in hashval sorted order.
425 * If the output buffer is not large enough to hold them all, then
426 * we have to calculate each entry's hashval and sort them before
427 * we can begin returning them to the user.
428 */
429/*ARGSUSED*/
430int
431xfs_attr_shortform_list(xfs_attr_list_context_t *context)
432{
433 attrlist_cursor_kern_t *cursor;
434 xfs_attr_sf_sort_t *sbuf, *sbp;
435 xfs_attr_shortform_t *sf;
436 xfs_attr_sf_entry_t *sfe;
437 xfs_inode_t *dp;
438 int sbsize, nsbuf, count, i;
439
440 ASSERT(context != NULL);
441 dp = context->dp;
442 ASSERT(dp != NULL);
443 ASSERT(dp->i_afp != NULL);
444 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
445 ASSERT(sf != NULL);
446 if (!sf->hdr.count)
447 return(0);
448 cursor = context->cursor;
449 ASSERT(cursor != NULL);
450
451 xfs_attr_trace_l_c("sf start", context);
452
453 /*
454 * If the buffer is large enough, do not bother with sorting.
455 * Note the generous fudge factor of 16 overhead bytes per entry.
456 */
457 if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16)
458 < context->bufsize) {
459 for (i = 0, sfe = &sf->list[0];
460 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
461 attrnames_t *namesp;
462
463 if (((context->flags & ATTR_SECURE) != 0) !=
464 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
465 !(context->flags & ATTR_KERNORMALS)) {
466 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
467 continue;
468 }
469 if (((context->flags & ATTR_ROOT) != 0) !=
470 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
471 !(context->flags & ATTR_KERNROOTLS)) {
472 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
473 continue;
474 }
475 namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure:
476 ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted :
477 &attr_user);
478 if (context->flags & ATTR_KERNOVAL) {
479 ASSERT(context->flags & ATTR_KERNAMELS);
480 context->count += namesp->attr_namelen +
481 INT_GET(sfe->namelen, ARCH_CONVERT) + 1;
482 }
483 else {
484 if (xfs_attr_put_listent(context, namesp,
485 (char *)sfe->nameval,
486 (int)sfe->namelen,
487 (int)INT_GET(sfe->valuelen,
488 ARCH_CONVERT)))
489 break;
490 }
491 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
492 }
493 xfs_attr_trace_l_c("sf big-gulp", context);
494 return(0);
495 }
496
497 /*
498 * It didn't all fit, so we have to sort everything on hashval.
499 */
500 sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf);
501 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
502
503 /*
504 * Scan the attribute list for the rest of the entries, storing
505 * the relevant info from only those that match into a buffer.
506 */
507 nsbuf = 0;
508 for (i = 0, sfe = &sf->list[0];
509 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
510 if (unlikely(
511 ((char *)sfe < (char *)sf) ||
512 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
513 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
514 XFS_ERRLEVEL_LOW,
515 context->dp->i_mount, sfe);
516 xfs_attr_trace_l_c("sf corrupted", context);
517 kmem_free(sbuf, sbsize);
518 return XFS_ERROR(EFSCORRUPTED);
519 }
520 if (((context->flags & ATTR_SECURE) != 0) !=
521 ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
522 !(context->flags & ATTR_KERNORMALS)) {
523 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
524 continue;
525 }
526 if (((context->flags & ATTR_ROOT) != 0) !=
527 ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
528 !(context->flags & ATTR_KERNROOTLS)) {
529 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
530 continue;
531 }
532 sbp->entno = i;
533 INT_SET(sbp->hash, ARCH_CONVERT,
534 xfs_da_hashname((char *)sfe->nameval, sfe->namelen));
535 sbp->name = (char *)sfe->nameval;
536 sbp->namelen = sfe->namelen;
537 /* These are bytes, and both on-disk, don't endian-flip */
538 sbp->valuelen = sfe->valuelen;
539 sbp->flags = sfe->flags;
540 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
541 sbp++;
542 nsbuf++;
543 }
544
545 /*
546 * Sort the entries on hash then entno.
547 */
548 qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
549
550 /*
551 * Re-find our place IN THE SORTED LIST.
552 */
553 count = 0;
554 cursor->initted = 1;
555 cursor->blkno = 0;
556 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
557 if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) {
558 if (cursor->offset == count) {
559 break;
560 }
561 count++;
562 } else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) {
563 break;
564 }
565 }
566 if (i == nsbuf) {
567 kmem_free(sbuf, sbsize);
568 xfs_attr_trace_l_c("blk end", context);
569 return(0);
570 }
571
572 /*
573 * Loop putting entries into the user buffer.
574 */
575 for ( ; i < nsbuf; i++, sbp++) {
576 attrnames_t *namesp;
577
578 namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure :
579 ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted :
580 &attr_user);
581
582 if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) {
583 cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT);
584 cursor->offset = 0;
585 }
586 if (context->flags & ATTR_KERNOVAL) {
587 ASSERT(context->flags & ATTR_KERNAMELS);
588 context->count += namesp->attr_namelen +
589 sbp->namelen + 1;
590 } else {
591 if (xfs_attr_put_listent(context, namesp,
592 sbp->name, sbp->namelen,
593 INT_GET(sbp->valuelen, ARCH_CONVERT)))
594 break;
595 }
596 cursor->offset++;
597 }
598
599 kmem_free(sbuf, sbsize);
600 xfs_attr_trace_l_c("sf E-O-F", context);
601 return(0);
602}
603
604/*
605 * Check a leaf attribute block to see if all the entries would fit into
606 * a shortform attribute list.
607 */
608int
609xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
610{
611 xfs_attr_leafblock_t *leaf;
612 xfs_attr_leaf_entry_t *entry;
613 xfs_attr_leaf_name_local_t *name_loc;
614 int bytes, i;
615
616 leaf = bp->data;
617 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
618 == XFS_ATTR_LEAF_MAGIC);
619
620 entry = &leaf->entries[0];
621 bytes = sizeof(struct xfs_attr_sf_hdr);
622 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
623 if (entry->flags & XFS_ATTR_INCOMPLETE)
624 continue; /* don't copy partial entries */
625 if (!(entry->flags & XFS_ATTR_LOCAL))
626 return(0);
627 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
628 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
629 return(0);
630 if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX)
631 return(0);
632 bytes += sizeof(struct xfs_attr_sf_entry)-1
633 + name_loc->namelen
634 + INT_GET(name_loc->valuelen, ARCH_CONVERT);
635 }
636 return( bytes < XFS_IFORK_ASIZE(dp) );
637}
638
639/*
640 * Convert a leaf attribute list to a shortform attribute list.
641 */
642int
643xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args)
644{
645 xfs_attr_leafblock_t *leaf;
646 xfs_attr_leaf_entry_t *entry;
647 xfs_attr_leaf_name_local_t *name_loc;
648 xfs_da_args_t nargs;
649 xfs_inode_t *dp;
650 char *tmpbuffer;
651 int error, i;
652
653 dp = args->dp;
654 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
655 ASSERT(tmpbuffer != NULL);
656
657 ASSERT(bp != NULL);
658 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
659 leaf = (xfs_attr_leafblock_t *)tmpbuffer;
660 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
661 == XFS_ATTR_LEAF_MAGIC);
662 memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
663
664 /*
665 * Clean out the prior contents of the attribute list.
666 */
667 error = xfs_da_shrink_inode(args, 0, bp);
668 if (error)
669 goto out;
670 error = xfs_attr_shortform_create(args);
671 if (error)
672 goto out;
673
674 /*
675 * Copy the attributes
676 */
677 memset((char *)&nargs, 0, sizeof(nargs));
678 nargs.dp = dp;
679 nargs.firstblock = args->firstblock;
680 nargs.flist = args->flist;
681 nargs.total = args->total;
682 nargs.whichfork = XFS_ATTR_FORK;
683 nargs.trans = args->trans;
684 nargs.oknoent = 1;
685 entry = &leaf->entries[0];
686 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
687 if (entry->flags & XFS_ATTR_INCOMPLETE)
688 continue; /* don't copy partial entries */
689 if (!entry->nameidx)
690 continue;
691 ASSERT(entry->flags & XFS_ATTR_LOCAL);
692 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
693 nargs.name = (char *)name_loc->nameval;
694 nargs.namelen = name_loc->namelen;
695 nargs.value = (char *)&name_loc->nameval[nargs.namelen];
696 nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
697 nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
698 nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
699 ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
700 xfs_attr_shortform_add(&nargs);
701 }
702 error = 0;
703
704out:
705 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
706 return(error);
707}
708
709/*
710 * Convert from using a single leaf to a root node and a leaf.
711 */
712int
713xfs_attr_leaf_to_node(xfs_da_args_t *args)
714{
715 xfs_attr_leafblock_t *leaf;
716 xfs_da_intnode_t *node;
717 xfs_inode_t *dp;
718 xfs_dabuf_t *bp1, *bp2;
719 xfs_dablk_t blkno;
720 int error;
721
722 dp = args->dp;
723 bp1 = bp2 = NULL;
724 error = xfs_da_grow_inode(args, &blkno);
725 if (error)
726 goto out;
727 error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
728 XFS_ATTR_FORK);
729 if (error)
730 goto out;
731 ASSERT(bp1 != NULL);
732 bp2 = NULL;
733 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
734 XFS_ATTR_FORK);
735 if (error)
736 goto out;
737 ASSERT(bp2 != NULL);
738 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
739 xfs_da_buf_done(bp1);
740 bp1 = NULL;
741 xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
742
743 /*
744 * Set up the new root node.
745 */
746 error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
747 if (error)
748 goto out;
749 node = bp1->data;
750 leaf = bp2->data;
751 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
752 == XFS_ATTR_LEAF_MAGIC);
753 /* both on-disk, don't endian-flip twice */
754 node->btree[0].hashval =
755 leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
756 INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
757 INT_SET(node->hdr.count, ARCH_CONVERT, 1);
758 xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
759 error = 0;
760out:
761 if (bp1)
762 xfs_da_buf_done(bp1);
763 if (bp2)
764 xfs_da_buf_done(bp2);
765 return(error);
766}
767
768
769/*========================================================================
770 * Routines used for growing the Btree.
771 *========================================================================*/
772
773/*
774 * Create the initial contents of a leaf attribute list
775 * or a leaf in a node attribute list.
776 */
777int
778xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
779{
780 xfs_attr_leafblock_t *leaf;
781 xfs_attr_leaf_hdr_t *hdr;
782 xfs_inode_t *dp;
783 xfs_dabuf_t *bp;
784 int error;
785
786 dp = args->dp;
787 ASSERT(dp != NULL);
788 error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
789 XFS_ATTR_FORK);
790 if (error)
791 return(error);
792 ASSERT(bp != NULL);
793 leaf = bp->data;
794 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
795 hdr = &leaf->hdr;
796 INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
797 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
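	/*
	 * firstused is a 16-bit on-disk field, so a 64KB block size
	 * truncates to zero in the INT_SET above; the check below then
	 * backs it off to a representable value.
	 */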
798 if (!hdr->firstused) {
799 INT_SET(hdr->firstused, ARCH_CONVERT,
800 XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
801 }
802
803 INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
804 sizeof(xfs_attr_leaf_hdr_t));
805 INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
806 INT_GET(hdr->firstused, ARCH_CONVERT)
807 - INT_GET(hdr->freemap[0].base,
808 ARCH_CONVERT));
809
810 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
811
812 *bpp = bp;
813 return(0);
814}
815
816/*
817 * Split the leaf node, rebalance, then add the new entry.
818 */
819int
820xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
821 xfs_da_state_blk_t *newblk)
822{
823 xfs_dablk_t blkno;
824 int error;
825
826 /*
827 * Allocate space for a new leaf node.
828 */
829 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
830 error = xfs_da_grow_inode(state->args, &blkno);
831 if (error)
832 return(error);
833 error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
834 if (error)
835 return(error);
836 newblk->blkno = blkno;
837 newblk->magic = XFS_ATTR_LEAF_MAGIC;
838
839 /*
840 * Rebalance the entries across the two leaves.
841 * NOTE: rebalance() currently depends on the 2nd block being empty.
842 */
843 xfs_attr_leaf_rebalance(state, oldblk, newblk);
844 error = xfs_da_blk_link(state, oldblk, newblk);
845 if (error)
846 return(error);
847
848 /*
849	 * Save info on the "old" attribute for "atomic rename" ops; leaf_add()
850	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
851	 * "new" attr's info. We will need the "old" info to remove it later.
852 *
853 * Insert the "new" entry in the correct block.
854 */
855 if (state->inleaf)
856 error = xfs_attr_leaf_add(oldblk->bp, state->args);
857 else
858 error = xfs_attr_leaf_add(newblk->bp, state->args);
859
860 /*
861 * Update last hashval in each block since we added the name.
862 */
863 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
864 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
865 return(error);
866}
867
868/*
869 * Add a name to the leaf attribute list structure.
870 */
871int
872xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
873{
874 xfs_attr_leafblock_t *leaf;
875 xfs_attr_leaf_hdr_t *hdr;
876 xfs_attr_leaf_map_t *map;
877 int tablesize, entsize, sum, tmp, i;
878
879 leaf = bp->data;
880 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
881 == XFS_ATTR_LEAF_MAGIC);
882 ASSERT((args->index >= 0)
883 && (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
884 hdr = &leaf->hdr;
885 entsize = xfs_attr_leaf_newentsize(args,
886 args->trans->t_mountp->m_sb.sb_blocksize, NULL);
887
888 /*
889 * Search through freemap for first-fit on new name length.
890 * (may need to figure in size of entry struct too)
891 */
892 tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
893 * sizeof(xfs_attr_leaf_entry_t)
894 + sizeof(xfs_attr_leaf_hdr_t);
895 map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
896 for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
897 if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
898 sum += INT_GET(map->size, ARCH_CONVERT);
899 continue;
900 }
901 if (!map->size)
902 continue; /* no space in this map */
903 tmp = entsize;
904 if (INT_GET(map->base, ARCH_CONVERT)
905 < INT_GET(hdr->firstused, ARCH_CONVERT))
906 tmp += sizeof(xfs_attr_leaf_entry_t);
907 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
908 tmp = xfs_attr_leaf_add_work(bp, args, i);
909 return(tmp);
910 }
911 sum += INT_GET(map->size, ARCH_CONVERT);
912 }
913
914 /*
915 * If there are no holes in the address space of the block,
916 * and we don't have enough freespace, then compaction will do us
917 * no good and we should just give up.
918 */
919 if (!hdr->holes && (sum < entsize))
920 return(XFS_ERROR(ENOSPC));
921
922 /*
923 * Compact the entries to coalesce free space.
924 * This may change the hdr->count via dropping INCOMPLETE entries.
925 */
926 xfs_attr_leaf_compact(args->trans, bp);
927
928 /*
929 * After compaction, the block is guaranteed to have only one
930 * free region, in freemap[0]. If it is not big enough, give up.
931 */
932 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
933 < (entsize + sizeof(xfs_attr_leaf_entry_t)))
934 return(XFS_ERROR(ENOSPC));
935
936 return(xfs_attr_leaf_add_work(bp, args, 0));
937}
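
/*
 * Illustrative sketch (not part of the original source): the freemap
 * scan in xfs_attr_leaf_add() is a first-fit search over the (at most
 * XFS_ATTR_LEAF_MAPSIZE) free regions in the leaf header, walking from
 * the highest-numbered map down.  A region fits when it can hold the
 * name/value bytes plus, if the region borders the entry table (base
 * below firstused), one more entry slot.  Simplified stand-alone
 * version, plain ints in place of the on-disk fields and without the
 * overflowing-table special case:
 */
struct attr_sketch_map {
	int	base;			/* offset of free region in block */
	int	size;			/* bytes in free region */
};

static int				/* map index, or -1 for no fit */
attr_sketch_firstfit(const struct attr_sketch_map *map, int nmaps,
		     int firstused, int entsize, int slotsize)
{
	int need, i;

	for (i = nmaps - 1; i >= 0; i--) {
		if (!map[i].size)
			continue;	/* no space in this map */
		need = entsize;
		if (map[i].base < firstused)
			need += slotsize;	/* entry slot comes out too */
		if (map[i].size >= need)
			return i;
	}
	return -1;			/* caller compacts or gets ENOSPC */
}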
938
939/*
940 * Add a name to a leaf attribute list structure.
941 */
942STATIC int
943xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
944{
945 xfs_attr_leafblock_t *leaf;
946 xfs_attr_leaf_hdr_t *hdr;
947 xfs_attr_leaf_entry_t *entry;
948 xfs_attr_leaf_name_local_t *name_loc;
949 xfs_attr_leaf_name_remote_t *name_rmt;
950 xfs_attr_leaf_map_t *map;
951 xfs_mount_t *mp;
952 int tmp, i;
953
954 leaf = bp->data;
955 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
956 == XFS_ATTR_LEAF_MAGIC);
957 hdr = &leaf->hdr;
958 ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
959 ASSERT((args->index >= 0)
960 && (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
961
962 /*
963 * Force open some space in the entry array and fill it in.
964 */
965 entry = &leaf->entries[args->index];
966 if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
967 tmp = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
968 tmp *= sizeof(xfs_attr_leaf_entry_t);
969 memmove((char *)(entry+1), (char *)entry, tmp);
970 xfs_da_log_buf(args->trans, bp,
971 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
972 }
973 INT_MOD(hdr->count, ARCH_CONVERT, 1);
974
975 /*
976 * Allocate space for the new string (at the end of the run).
977 */
978 map = &hdr->freemap[mapindex];
979 mp = args->trans->t_mountp;
980 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
981 ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
982 ASSERT(INT_GET(map->size, ARCH_CONVERT)
983 >= xfs_attr_leaf_newentsize(args,
984 mp->m_sb.sb_blocksize, NULL));
985 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
986 ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
987 INT_MOD(map->size, ARCH_CONVERT,
988 -xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp));
989 INT_SET(entry->nameidx, ARCH_CONVERT,
990 INT_GET(map->base, ARCH_CONVERT)
991 + INT_GET(map->size, ARCH_CONVERT));
992 INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
993 entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
994 entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
995 ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
996 if (args->rename) {
997 entry->flags |= XFS_ATTR_INCOMPLETE;
998 if ((args->blkno2 == args->blkno) &&
999 (args->index2 <= args->index)) {
1000 args->index2++;
1001 }
1002 }
1003 xfs_da_log_buf(args->trans, bp,
1004 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
1005 ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
1006 >= INT_GET((entry-1)->hashval,
1007 ARCH_CONVERT)));
1008 ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
1009 (INT_GET(entry->hashval, ARCH_CONVERT)
1010 <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
1011
1012 /*
1013 * Copy the attribute name and value into the new space.
1014 *
1015 * For "remote" attribute values, simply note that we need to
1016 * allocate space for the "remote" value. We can't actually
1017 * allocate the extents in this transaction, and we can't decide
1018 * which blocks they should be as we might allocate more blocks
1019 * as part of this transaction (a split operation for example).
1020 */
1021 if (entry->flags & XFS_ATTR_LOCAL) {
1022 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
1023 name_loc->namelen = args->namelen;
1024 INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
1025 memcpy((char *)name_loc->nameval, args->name, args->namelen);
1026 memcpy((char *)&name_loc->nameval[args->namelen], args->value,
1027 INT_GET(name_loc->valuelen, ARCH_CONVERT));
1028 } else {
1029 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
1030 name_rmt->namelen = args->namelen;
1031 memcpy((char *)name_rmt->name, args->name, args->namelen);
1032 entry->flags |= XFS_ATTR_INCOMPLETE;
1033 /* just in case */
1034 name_rmt->valuelen = 0;
1035 name_rmt->valueblk = 0;
1036 args->rmtblkno = 1;
1037 args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
1038 }
1039 xfs_da_log_buf(args->trans, bp,
1040 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
1041 xfs_attr_leaf_entsize(leaf, args->index)));
1042
1043 /*
1044 * Update the control info for this leaf node
1045 */
1046 if (INT_GET(entry->nameidx, ARCH_CONVERT)
1047 < INT_GET(hdr->firstused, ARCH_CONVERT)) {
1048 /* both on-disk, don't endian-flip twice */
1049 hdr->firstused = entry->nameidx;
1050 }
1051 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
1052 >= ((INT_GET(hdr->count, ARCH_CONVERT)
1053 * sizeof(*entry))+sizeof(*hdr)));
1054 tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
1055 * sizeof(xfs_attr_leaf_entry_t)
1056 + sizeof(xfs_attr_leaf_hdr_t);
1057 map = &hdr->freemap[0];
1058 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
1059 if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
1060 INT_MOD(map->base, ARCH_CONVERT,
1061 sizeof(xfs_attr_leaf_entry_t));
1062 INT_MOD(map->size, ARCH_CONVERT,
1063 -sizeof(xfs_attr_leaf_entry_t));
1064 }
1065 }
1066 INT_MOD(hdr->usedbytes, ARCH_CONVERT,
1067 xfs_attr_leaf_entsize(leaf, args->index));
1068 xfs_da_log_buf(args->trans, bp,
1069 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1070 return(0);
1071}
1072
1073/*
1074 * Garbage collect a leaf attribute list block by copying it to a new buffer.
1075 */
1076STATIC void
1077xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
1078{
1079 xfs_attr_leafblock_t *leaf_s, *leaf_d;
1080 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
1081 xfs_mount_t *mp;
1082 char *tmpbuffer;
1083
1084 mp = trans->t_mountp;
1085 tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
1086 ASSERT(tmpbuffer != NULL);
1087 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp));
1088 memset(bp->data, 0, XFS_LBSIZE(mp));
1089
1090 /*
1091 * Copy basic information
1092 */
1093 leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
1094 leaf_d = bp->data;
1095 hdr_s = &leaf_s->hdr;
1096 hdr_d = &leaf_d->hdr;
1097 hdr_d->info = hdr_s->info; /* struct copy */
1098 INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
1099 /* handle truncation gracefully */
1100 if (!hdr_d->firstused) {
1101 INT_SET(hdr_d->firstused, ARCH_CONVERT,
1102 XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
1103 }
1104 hdr_d->usedbytes = 0;
1105 hdr_d->count = 0;
1106 hdr_d->holes = 0;
1107 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
1108 sizeof(xfs_attr_leaf_hdr_t));
1109 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
1110 INT_GET(hdr_d->firstused, ARCH_CONVERT)
1111 - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1112
1113 /*
1114	 * Copy all entries in the same (sorted) order,
1115 * but allocate name/value pairs packed and in sequence.
1116 */
1117 xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
1118 (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
1119
1120 xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
1121
1122 kmem_free(tmpbuffer, XFS_LBSIZE(mp));
1123}
1124
1125/*
1126 * Redistribute the attribute list entries between two leaf nodes,
1127 * taking into account the size of the new entry.
1128 *
1129 * NOTE: if new block is empty, then it will get the upper half of the
1130 * old block. At present, all (one) callers pass in an empty second block.
1131 *
1132 * This code adjusts the args->index/blkno and args->index2/blkno2 fields
1133 * to match what it is doing in splitting the attribute leaf block. Those
1134 * values are used in "atomic rename" operations on attributes. Note that
1135 * the "new" and "old" values can end up in different blocks.
1136 */
1137STATIC void
1138xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1139 xfs_da_state_blk_t *blk2)
1140{
1141 xfs_da_args_t *args;
1142 xfs_da_state_blk_t *tmp_blk;
1143 xfs_attr_leafblock_t *leaf1, *leaf2;
1144 xfs_attr_leaf_hdr_t *hdr1, *hdr2;
1145 int count, totallen, max, space, swap;
1146
1147 /*
1148 * Set up environment.
1149 */
1150 ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
1151 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
1152 leaf1 = blk1->bp->data;
1153 leaf2 = blk2->bp->data;
1154 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
1155 == XFS_ATTR_LEAF_MAGIC);
1156 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
1157 == XFS_ATTR_LEAF_MAGIC);
1158 args = state->args;
1159
1160 /*
1161 * Check ordering of blocks, reverse if it makes things simpler.
1162 *
1163 * NOTE: Given that all (current) callers pass in an empty
1164 * second block, this code should never set "swap".
1165 */
1166 swap = 0;
1167 if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
1168 tmp_blk = blk1;
1169 blk1 = blk2;
1170 blk2 = tmp_blk;
1171 leaf1 = blk1->bp->data;
1172 leaf2 = blk2->bp->data;
1173 swap = 1;
1174 }
1175 hdr1 = &leaf1->hdr;
1176 hdr2 = &leaf2->hdr;
1177
1178 /*
1179 * Examine entries until we reduce the absolute difference in
1180 * byte usage between the two blocks to a minimum. Then get
1181 * the direction to copy and the number of elements to move.
1182 *
1183 * "inleaf" is true if the new entry should be inserted into blk1.
1184 * If "swap" is also true, then reverse the sense of "inleaf".
1185 */
1186 state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
1187 &count, &totallen);
1188 if (swap)
1189 state->inleaf = !state->inleaf;
1190
1191 /*
1192 * Move any entries required from leaf to leaf:
1193 */
1194 if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
1195 /*
1196 * Figure the total bytes to be added to the destination leaf.
1197 */
1198		/* number of entries being moved */
1199 count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
1200 space = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
1201 space += count * sizeof(xfs_attr_leaf_entry_t);
1202
1203 /*
1204 * leaf2 is the destination, compact it if it looks tight.
1205 */
1206 max = INT_GET(hdr2->firstused, ARCH_CONVERT)
1207 - sizeof(xfs_attr_leaf_hdr_t);
1208 max -= INT_GET(hdr2->count, ARCH_CONVERT)
1209 * sizeof(xfs_attr_leaf_entry_t);
1210 if (space > max) {
1211 xfs_attr_leaf_compact(args->trans, blk2->bp);
1212 }
1213
1214 /*
1215 * Move high entries from leaf1 to low end of leaf2.
1216 */
1217 xfs_attr_leaf_moveents(leaf1,
1218 INT_GET(hdr1->count, ARCH_CONVERT)-count,
1219 leaf2, 0, count, state->mp);
1220
1221 xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
1222 xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
1223 } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
1224 /*
1225 * I assert that since all callers pass in an empty
1226 * second buffer, this code should never execute.
1227 */
1228
1229 /*
1230 * Figure the total bytes to be added to the destination leaf.
1231 */
1232		/* number of entries being moved */
1233 count -= INT_GET(hdr1->count, ARCH_CONVERT);
1234 space = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
1235 space += count * sizeof(xfs_attr_leaf_entry_t);
1236
1237 /*
1238 * leaf1 is the destination, compact it if it looks tight.
1239 */
1240 max = INT_GET(hdr1->firstused, ARCH_CONVERT)
1241 - sizeof(xfs_attr_leaf_hdr_t);
1242 max -= INT_GET(hdr1->count, ARCH_CONVERT)
1243 * sizeof(xfs_attr_leaf_entry_t);
1244 if (space > max) {
1245 xfs_attr_leaf_compact(args->trans, blk1->bp);
1246 }
1247
1248 /*
1249 * Move low entries from leaf2 to high end of leaf1.
1250 */
1251 xfs_attr_leaf_moveents(leaf2, 0, leaf1,
1252 (int)INT_GET(hdr1->count, ARCH_CONVERT), count,
1253 state->mp);
1254
1255 xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
1256 xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
1257 }
1258
1259 /*
1260 * Copy out last hashval in each block for B-tree code.
1261 */
1262 blk1->hashval =
1263 INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
1264 ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
1265 blk2->hashval =
1266 INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
1267 ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
1268
1269 /*
1270 * Adjust the expected index for insertion.
1271 * NOTE: this code depends on the (current) situation that the
1272 * second block was originally empty.
1273 *
1274 * If the insertion point moved to the 2nd block, we must adjust
1275 * the index. We must also track the entry just following the
1276	 * new entry for use in an "atomic rename" operation; that entry
1277 * is always the "old" entry and the "new" entry is what we are
1278 * inserting. The index/blkno fields refer to the "old" entry,
1279 * while the index2/blkno2 fields refer to the "new" entry.
1280 */
1281 if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
1282 ASSERT(state->inleaf == 0);
1283 blk2->index = blk1->index
1284 - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1285 args->index = args->index2 = blk2->index;
1286 args->blkno = args->blkno2 = blk2->blkno;
1287 } else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
1288 if (state->inleaf) {
1289 args->index = blk1->index;
1290 args->blkno = blk1->blkno;
1291 args->index2 = 0;
1292 args->blkno2 = blk2->blkno;
1293 } else {
1294 blk2->index = blk1->index
1295 - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1296 args->index = args->index2 = blk2->index;
1297 args->blkno = args->blkno2 = blk2->blkno;
1298 }
1299 } else {
1300 ASSERT(state->inleaf == 1);
1301 args->index = args->index2 = blk1->index;
1302 args->blkno = args->blkno2 = blk1->blkno;
1303 }
1304}
1305
1306/*
1307 * Examine entries until we reduce the absolute difference in
1308 * byte usage between the two blocks to a minimum.
1309 * GROT: Is this really necessary? With other than a 512 byte blocksize,
1310 * GROT: there will always be enough room in either block for a new entry.
1311 * GROT: Do a double-split for this case?
1312 */
1313STATIC int
1314xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
1315 xfs_da_state_blk_t *blk1,
1316 xfs_da_state_blk_t *blk2,
1317 int *countarg, int *usedbytesarg)
1318{
1319 xfs_attr_leafblock_t *leaf1, *leaf2;
1320 xfs_attr_leaf_hdr_t *hdr1, *hdr2;
1321 xfs_attr_leaf_entry_t *entry;
1322 int count, max, index, totallen, half;
1323 int lastdelta, foundit, tmp;
1324
1325 /*
1326 * Set up environment.
1327 */
1328 leaf1 = blk1->bp->data;
1329 leaf2 = blk2->bp->data;
1330 hdr1 = &leaf1->hdr;
1331 hdr2 = &leaf2->hdr;
1332 foundit = 0;
1333 totallen = 0;
1334
1335 /*
1336 * Examine entries until we reduce the absolute difference in
1337 * byte usage between the two blocks to a minimum.
1338 */
1339 max = INT_GET(hdr1->count, ARCH_CONVERT)
1340 + INT_GET(hdr2->count, ARCH_CONVERT);
1341 half = (max+1) * sizeof(*entry);
1342 half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
1343 + INT_GET(hdr2->usedbytes, ARCH_CONVERT)
1344 + xfs_attr_leaf_newentsize(state->args,
1345 state->blocksize, NULL);
1346 half /= 2;
1347 lastdelta = state->blocksize;
1348 entry = &leaf1->entries[0];
1349 for (count = index = 0; count < max; entry++, index++, count++) {
1350
1351#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
1352 /*
1353 * The new entry is in the first block, account for it.
1354 */
1355 if (count == blk1->index) {
1356 tmp = totallen + sizeof(*entry) +
1357 xfs_attr_leaf_newentsize(state->args,
1358 state->blocksize,
1359 NULL);
1360 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1361 break;
1362 lastdelta = XFS_ATTR_ABS(half - tmp);
1363 totallen = tmp;
1364 foundit = 1;
1365 }
1366
1367 /*
1368 * Wrap around into the second block if necessary.
1369 */
1370 if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
1371 leaf1 = leaf2;
1372 entry = &leaf1->entries[0];
1373 index = 0;
1374 }
1375
1376 /*
1377 * Figure out if next leaf entry would be too much.
1378 */
1379 tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
1380 index);
1381 if (XFS_ATTR_ABS(half - tmp) > lastdelta)
1382 break;
1383 lastdelta = XFS_ATTR_ABS(half - tmp);
1384 totallen = tmp;
1385#undef XFS_ATTR_ABS
1386 }
1387
1388 /*
1389	 * Calculate the number of usedbytes that will end up in the lower block.
1390	 * If the new entry is not in the lower block, fix up the count.
1391 */
1392 totallen -= count * sizeof(*entry);
1393 if (foundit) {
1394 totallen -= sizeof(*entry) +
1395 xfs_attr_leaf_newentsize(state->args,
1396 state->blocksize,
1397 NULL);
1398 }
1399
1400 *countarg = count;
1401 *usedbytesarg = totallen;
1402 return(foundit);
1403}
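
/*
 * Illustrative sketch (not part of the original source): figure_balance
 * above walks the combined entry list keeping a running byte total and
 * stops at the first point where taking one more entry would move the
 * total further from "half" than the previous step did; because the
 * running total is monotone, that first uphill step marks the minimum.
 * The same idea over a plain array of entry sizes:
 */
static int				/* entries that stay in block 1 */
attr_sketch_split_point(const int *entsize, int nents)
{
	int sum, half, running, tmp, delta, lastdelta, count;

	sum = 0;
	for (count = 0; count < nents; count++)
		sum += entsize[count];
	half = sum / 2;

	running = 0;
	lastdelta = sum;		/* worse than any real delta */
	for (count = 0; count < nents; count++) {
		tmp = running + entsize[count];
		delta = (tmp > half) ? tmp - half : half - tmp;
		if (delta > lastdelta)
			break;		/* previous split was the minimum */
		lastdelta = delta;
		running = tmp;
	}
	return count;
}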
1404
1405/*========================================================================
1406 * Routines used for shrinking the Btree.
1407 *========================================================================*/
1408
1409/*
1410 * Check a leaf block and its neighbors to see if the block should be
1411 * collapsed into one or the other neighbor. Always keep the block
1412 * with the smaller block number.
1413 * If the current block is over 50% full, don't try to join it; return 0.
1414 * If the block is empty, fill in the state structure and return 2.
1415 * If it can be collapsed, fill in the state structure and return 1.
1416 * If nothing can be done, return 0.
1417 *
1418 * GROT: allow for INCOMPLETE entries in calculation.
1419 */
1420int
1421xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
1422{
1423 xfs_attr_leafblock_t *leaf;
1424 xfs_da_state_blk_t *blk;
1425 xfs_da_blkinfo_t *info;
1426 int count, bytes, forward, error, retval, i;
1427 xfs_dablk_t blkno;
1428 xfs_dabuf_t *bp;
1429
1430 /*
1431 * Check for the degenerate case of the block being over 50% full.
1432 * If so, it's not worth even looking to see if we might be able
1433 * to coalesce with a sibling.
1434 */
1435 blk = &state->path.blk[ state->path.active-1 ];
1436 info = blk->bp->data;
1437 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1438 leaf = (xfs_attr_leafblock_t *)info;
1439 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1440 bytes = sizeof(xfs_attr_leaf_hdr_t) +
1441 count * sizeof(xfs_attr_leaf_entry_t) +
1442 INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1443 if (bytes > (state->blocksize >> 1)) {
1444 *action = 0; /* blk over 50%, don't try to join */
1445 return(0);
1446 }
1447
1448 /*
1449 * Check for the degenerate case of the block being empty.
1450 * If the block is empty, we'll simply delete it, no need to
1451	 * coalesce it with a sibling block. We choose (arbitrarily)
1452 * to merge with the forward block unless it is NULL.
1453 */
1454 if (count == 0) {
1455 /*
1456 * Make altpath point to the block we want to keep and
1457 * path point to the block we want to drop (this one).
1458 */
1459 forward = info->forw;
1460 memcpy(&state->altpath, &state->path, sizeof(state->path));
1461 error = xfs_da_path_shift(state, &state->altpath, forward,
1462 0, &retval);
1463 if (error)
1464 return(error);
1465 if (retval) {
1466 *action = 0;
1467 } else {
1468 *action = 2;
1469 }
1470 return(0);
1471 }
1472
1473 /*
1474 * Examine each sibling block to see if we can coalesce with
1475 * at least 25% free space to spare. We need to figure out
1476 * whether to merge with the forward or the backward block.
1477 * We prefer coalescing with the lower numbered sibling so as
1478 * to shrink an attribute list over time.
1479 */
1480 /* start with smaller blk num */
1481 forward = (INT_GET(info->forw, ARCH_CONVERT)
1482 < INT_GET(info->back, ARCH_CONVERT));
1483 for (i = 0; i < 2; forward = !forward, i++) {
1484 if (forward)
1485 blkno = INT_GET(info->forw, ARCH_CONVERT);
1486 else
1487 blkno = INT_GET(info->back, ARCH_CONVERT);
1488 if (blkno == 0)
1489 continue;
1490 error = xfs_da_read_buf(state->args->trans, state->args->dp,
1491 blkno, -1, &bp, XFS_ATTR_FORK);
1492 if (error)
1493 return(error);
1494 ASSERT(bp != NULL);
1495
1496 leaf = (xfs_attr_leafblock_t *)info;
1497 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1498 bytes = state->blocksize - (state->blocksize>>2);
1499 bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1500 leaf = bp->data;
1501 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1502 == XFS_ATTR_LEAF_MAGIC);
1503 count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
1504 bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1505 bytes -= count * sizeof(xfs_attr_leaf_entry_t);
1506 bytes -= sizeof(xfs_attr_leaf_hdr_t);
1507 xfs_da_brelse(state->args->trans, bp);
1508 if (bytes >= 0)
1509 break; /* fits with at least 25% to spare */
1510 }
1511 if (i >= 2) {
1512 *action = 0;
1513 return(0);
1514 }
1515
1516 /*
1517 * Make altpath point to the block we want to keep (the lower
1518 * numbered block) and path point to the block we want to drop.
1519 */
1520 memcpy(&state->altpath, &state->path, sizeof(state->path));
1521 if (blkno < blk->blkno) {
1522 error = xfs_da_path_shift(state, &state->altpath, forward,
1523 0, &retval);
1524 } else {
1525 error = xfs_da_path_shift(state, &state->path, forward,
1526 0, &retval);
1527 }
1528 if (error)
1529 return(error);
1530 if (retval) {
1531 *action = 0;
1532 } else {
1533 *action = 1;
1534 }
1535 return(0);
1536}
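
/*
 * Illustrative sketch (not part of the original source): the sibling
 * check above asks whether both blocks' entries fit in 75% of a single
 * block, i.e. whether a merge would leave at least 25% free.  As plain
 * arithmetic, with "slotsize" standing in for
 * sizeof(xfs_attr_leaf_entry_t) and "hdrsize" for the one surviving
 * leaf header:
 */
static int				/* nonzero if a merge would fit */
attr_sketch_can_merge(int blocksize, int hdrsize, int slotsize,
		      int count1, int used1, int count2, int used2)
{
	int bytes;

	bytes = blocksize - (blocksize >> 2);	/* keep 25% in reserve */
	bytes -= used1 + used2;			/* name/value bytes */
	bytes -= (count1 + count2) * slotsize;	/* entry table slots */
	bytes -= hdrsize;
	return bytes >= 0;
}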
1537
1538/*
1539 * Remove a name from the leaf attribute list structure.
1540 *
1541 * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
1542 * If two leaves are 37% full, when combined they will leave 25% free.
1543 */
1544int
1545xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
1546{
1547 xfs_attr_leafblock_t *leaf;
1548 xfs_attr_leaf_hdr_t *hdr;
1549 xfs_attr_leaf_map_t *map;
1550 xfs_attr_leaf_entry_t *entry;
1551 int before, after, smallest, entsize;
1552 int tablesize, tmp, i;
1553 xfs_mount_t *mp;
1554
1555 leaf = bp->data;
1556 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1557 == XFS_ATTR_LEAF_MAGIC);
1558 hdr = &leaf->hdr;
1559 mp = args->trans->t_mountp;
1560 ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
1561 && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1562 ASSERT((args->index >= 0)
1563 && (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
1564 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
1565 >= ((INT_GET(hdr->count, ARCH_CONVERT)
1566 * sizeof(*entry))+sizeof(*hdr)));
1567 entry = &leaf->entries[args->index];
1568 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1569 >= INT_GET(hdr->firstused, ARCH_CONVERT));
1570 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1571
1572 /*
1573 * Scan through free region table:
1574 * check for adjacency of free'd entry with an existing one,
1575 * find smallest free region in case we need to replace it,
1576 * adjust any map that borders the entry table,
1577 */
1578 tablesize = INT_GET(hdr->count, ARCH_CONVERT)
1579 * sizeof(xfs_attr_leaf_entry_t)
1580 + sizeof(xfs_attr_leaf_hdr_t);
1581 map = &hdr->freemap[0];
1582 tmp = INT_GET(map->size, ARCH_CONVERT);
1583 before = after = -1;
1584 smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
1585 entsize = xfs_attr_leaf_entsize(leaf, args->index);
1586 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
1587 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
1588 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
1589 if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
1590 INT_MOD(map->base, ARCH_CONVERT,
1591 -sizeof(xfs_attr_leaf_entry_t));
1592 INT_MOD(map->size, ARCH_CONVERT,
1593 sizeof(xfs_attr_leaf_entry_t));
1594 }
1595
1596 if ((INT_GET(map->base, ARCH_CONVERT)
1597 + INT_GET(map->size, ARCH_CONVERT))
1598 == INT_GET(entry->nameidx, ARCH_CONVERT)) {
1599 before = i;
1600 } else if (INT_GET(map->base, ARCH_CONVERT)
1601 == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
1602 after = i;
1603 } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
1604 tmp = INT_GET(map->size, ARCH_CONVERT);
1605 smallest = i;
1606 }
1607 }
1608
1609 /*
1610 * Coalesce adjacent freemap regions,
1611 * or replace the smallest region.
1612 */
1613 if ((before >= 0) || (after >= 0)) {
1614 if ((before >= 0) && (after >= 0)) {
1615 map = &hdr->freemap[before];
1616 INT_MOD(map->size, ARCH_CONVERT, entsize);
1617 INT_MOD(map->size, ARCH_CONVERT,
1618 INT_GET(hdr->freemap[after].size,
1619 ARCH_CONVERT));
1620 hdr->freemap[after].base = 0;
1621 hdr->freemap[after].size = 0;
1622 } else if (before >= 0) {
1623 map = &hdr->freemap[before];
1624 INT_MOD(map->size, ARCH_CONVERT, entsize);
1625 } else {
1626 map = &hdr->freemap[after];
1627 /* both on-disk, don't endian flip twice */
1628 map->base = entry->nameidx;
1629 INT_MOD(map->size, ARCH_CONVERT, entsize);
1630 }
1631 } else {
1632 /*
1633 * Replace smallest region (if it is smaller than free'd entry)
1634 */
1635 map = &hdr->freemap[smallest];
1636 if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
1637 INT_SET(map->base, ARCH_CONVERT,
1638 INT_GET(entry->nameidx, ARCH_CONVERT));
1639 INT_SET(map->size, ARCH_CONVERT, entsize);
1640 }
1641 }
1642
1643 /*
1644 * Did we remove the first entry?
1645 */
1646 if (INT_GET(entry->nameidx, ARCH_CONVERT)
1647 == INT_GET(hdr->firstused, ARCH_CONVERT))
1648 smallest = 1;
1649 else
1650 smallest = 0;
1651
1652 /*
1653 * Compress the remaining entries and zero out the removed stuff.
1654 */
1655 memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
1656 INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize);
1657 xfs_da_log_buf(args->trans, bp,
1658 XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
1659 entsize));
1660
1661 tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index)
1662 * sizeof(xfs_attr_leaf_entry_t);
1663 memmove((char *)entry, (char *)(entry+1), tmp);
1664 INT_MOD(hdr->count, ARCH_CONVERT, -1);
1665 xfs_da_log_buf(args->trans, bp,
1666 XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
1667 entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
1668 memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t));
1669
1670 /*
1671 * If we removed the first entry, re-find the first used byte
1672 * in the name area. Note that if the entry was the "firstused",
1673 * then we don't have a "hole" in our block resulting from
1674 * removing the name.
1675 */
1676 if (smallest) {
1677 tmp = XFS_LBSIZE(mp);
1678 entry = &leaf->entries[0];
1679 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1;
1680 i >= 0; entry++, i--) {
1681 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1682 >= INT_GET(hdr->firstused, ARCH_CONVERT));
1683 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
1684 < XFS_LBSIZE(mp));
1685 if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
1686 tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
1687 }
1688 INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
1689 if (!hdr->firstused) {
1690 INT_SET(hdr->firstused, ARCH_CONVERT,
1691 tmp - XFS_ATTR_LEAF_NAME_ALIGN);
1692 }
1693 } else {
1694 hdr->holes = 1; /* mark as needing compaction */
1695 }
1696 xfs_da_log_buf(args->trans, bp,
1697 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1698
1699 /*
1700	 * Check if the leaf is less than 37% full; if so, the caller
1701	 * may want to "join" the leaf with a sibling.
1702 */
1703 tmp = sizeof(xfs_attr_leaf_hdr_t);
1704 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT)
1705 * sizeof(xfs_attr_leaf_entry_t);
1706 tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
1707 return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
1708}
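
/*
 * Arithmetic note (not in the original source): the 37% threshold is
 * what makes the 25%-free merge test in xfs_attr_leaf_toosmall() work
 * out -- two leaves each under 37% full combine to under 74% of one
 * block, which still clears the 75% ceiling that a merge with 25% to
 * spare requires.
 */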
1709
1710/*
1711 * Move all the attribute list entries from drop_leaf into save_leaf.
1712 */
1713void
1714xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1715 xfs_da_state_blk_t *save_blk)
1716{
1717 xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
1718 xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
1719 xfs_mount_t *mp;
1720 char *tmpbuffer;
1721
1722 /*
1723 * Set up environment.
1724 */
1725 mp = state->mp;
1726 ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
1727 ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1728 drop_leaf = drop_blk->bp->data;
1729 save_leaf = save_blk->bp->data;
1730 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT)
1731 == XFS_ATTR_LEAF_MAGIC);
1732 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT)
1733 == XFS_ATTR_LEAF_MAGIC);
1734 drop_hdr = &drop_leaf->hdr;
1735 save_hdr = &save_leaf->hdr;
1736
1737 /*
1738 * Save last hashval from dying block for later Btree fixup.
1739 */
1740 drop_blk->hashval =
1741 INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count,
1742 ARCH_CONVERT)-1].hashval,
1743 ARCH_CONVERT);
1744
1745 /*
1746 * Check if we need a temp buffer, or can we do it in place.
1747 * Note that we don't check "leaf" for holes because we will
1748	 * always be dropping it; toosmall() decided that for us already.
1749 */
1750 if (save_hdr->holes == 0) {
1751 /*
1752 * dest leaf has no holes, so we add there. May need
1753 * to make some room in the entry array.
1754 */
1755 if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
1756 xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
1757 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1758 } else {
1759 xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
1760 INT_GET(save_hdr->count, ARCH_CONVERT),
1761 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1762 mp);
1763 }
1764 } else {
1765 /*
1766 * Destination has holes, so we make a temporary copy
1767 * of the leaf and add them both to that.
1768 */
1769 tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
1770 ASSERT(tmpbuffer != NULL);
1771 memset(tmpbuffer, 0, state->blocksize);
1772 tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
1773 tmp_hdr = &tmp_leaf->hdr;
1774 tmp_hdr->info = save_hdr->info; /* struct copy */
1775 tmp_hdr->count = 0;
1776 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
1777 if (!tmp_hdr->firstused) {
1778 INT_SET(tmp_hdr->firstused, ARCH_CONVERT,
1779 state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
1780 }
1781 tmp_hdr->usedbytes = 0;
1782 if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
1783 xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
1784 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1785 mp);
1786 xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
1787 INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1788 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
1789 mp);
1790 } else {
1791 xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
1792 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
1793 mp);
1794 xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
1795 INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1796 (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
1797 mp);
1798 }
1799 memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
1800 kmem_free(tmpbuffer, state->blocksize);
1801 }
1802
1803 xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
1804 state->blocksize - 1);
1805
1806 /*
1807 * Copy out last hashval in each block for B-tree code.
1808 */
1809 save_blk->hashval =
1810 INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count,
1811 ARCH_CONVERT)-1].hashval,
1812 ARCH_CONVERT);
1813}
1814
1815/*========================================================================
1816 * Routines used for finding things in the Btree.
1817 *========================================================================*/
1818
1819/*
1820 * Look up a name in a leaf attribute list structure.
1821 * This is the internal routine, it uses the caller's buffer.
1822 *
1823 * Note that duplicate keys are allowed, but we only check within the
1824 * current leaf node. The Btree code must check in adjacent leaf nodes.
1825 *
1826 * Return in args->index the index into the entry[] array of either
1827 * the found entry, or where the entry should have been (insert before
1828 * that entry).
1829 *
1830 * Don't change the args->value unless we find the attribute.
1831 */
1832int
1833xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
1834{
1835 xfs_attr_leafblock_t *leaf;
1836 xfs_attr_leaf_entry_t *entry;
1837 xfs_attr_leaf_name_local_t *name_loc;
1838 xfs_attr_leaf_name_remote_t *name_rmt;
1839 int probe, span;
1840 xfs_dahash_t hashval;
1841
1842 leaf = bp->data;
1843 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1844 == XFS_ATTR_LEAF_MAGIC);
1845 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
1846 < (XFS_LBSIZE(args->dp->i_mount)/8));
1847
1848 /*
1849 * Binary search. (note: small blocks will skip this loop)
1850 */
1851 hashval = args->hashval;
1852 probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
1853 for (entry = &leaf->entries[probe]; span > 4;
1854 entry = &leaf->entries[probe]) {
1855 span /= 2;
1856 if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
1857 probe += span;
1858 else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
1859 probe -= span;
1860 else
1861 break;
1862 }
1863 ASSERT((probe >= 0) &&
1864 (!leaf->hdr.count
1865 || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
1866 ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT)
1867 == hashval));
1868
1869 /*
1870 * Since we may have duplicate hashval's, find the first matching
1871 * hashval in the leaf.
1872 */
1873 while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT)
1874 >= hashval)) {
1875 entry--;
1876 probe--;
1877 }
1878 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
1879 && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
1880 entry++;
1881 probe++;
1882 }
1883 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT))
1884 || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
1885 args->index = probe;
1886 return(XFS_ERROR(ENOATTR));
1887 }
1888
1889 /*
1890 * Duplicate keys may be present, so search all of them for a match.
1891 */
1892 for ( ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
1893 && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
1894 entry++, probe++) {
1895/*
1896 * GROT: Add code to remove incomplete entries.
1897 */
1898 /*
1899 * If we are looking for INCOMPLETE entries, show only those.
1900 * If we are looking for complete entries, show only those.
1901 */
1902 if ((args->flags & XFS_ATTR_INCOMPLETE) !=
1903 (entry->flags & XFS_ATTR_INCOMPLETE)) {
1904 continue;
1905 }
1906 if (entry->flags & XFS_ATTR_LOCAL) {
1907 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
1908 if (name_loc->namelen != args->namelen)
1909 continue;
1910 if (memcmp(args->name, (char *)name_loc->nameval,
1911 args->namelen) != 0)
1912 continue;
1913 if (((args->flags & ATTR_SECURE) != 0) !=
1914 ((entry->flags & XFS_ATTR_SECURE) != 0))
1915 continue;
1916 if (((args->flags & ATTR_ROOT) != 0) !=
1917 ((entry->flags & XFS_ATTR_ROOT) != 0))
1918 continue;
1919 args->index = probe;
1920 return(XFS_ERROR(EEXIST));
1921 } else {
1922 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
1923 if (name_rmt->namelen != args->namelen)
1924 continue;
1925 if (memcmp(args->name, (char *)name_rmt->name,
1926 args->namelen) != 0)
1927 continue;
1928 if (((args->flags & ATTR_SECURE) != 0) !=
1929 ((entry->flags & XFS_ATTR_SECURE) != 0))
1930 continue;
1931 if (((args->flags & ATTR_ROOT) != 0) !=
1932 ((entry->flags & XFS_ATTR_ROOT) != 0))
1933 continue;
1934 args->index = probe;
1935 args->rmtblkno
1936 = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
1937 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
1938 INT_GET(name_rmt->valuelen,
1939 ARCH_CONVERT));
1940 return(XFS_ERROR(EEXIST));
1941 }
1942 }
1943 args->index = probe;
1944 return(XFS_ERROR(ENOATTR));
1945}
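
/*
 * Illustrative sketch (not part of the original source): the lookup
 * above is a binary search on the hashval-sorted entry table, followed
 * by a linear walk back to the first entry with the wanted hashval,
 * since duplicate hashvals are allowed.  Stand-alone version over a
 * plain sorted int array:
 */
static int				/* index of first match, or -1 */
attr_sketch_find_first(const int *hashval, int count, int want)
{
	int probe, span;

	if (count == 0)
		return -1;
	probe = span = count / 2;
	while (span > 4) {		/* small tables skip this loop */
		span /= 2;
		if (hashval[probe] < want)
			probe += span;
		else if (hashval[probe] > want)
			probe -= span;
		else
			break;
	}
	while (probe > 0 && hashval[probe] >= want)
		probe--;		/* back up over duplicates */
	while (probe < count && hashval[probe] < want)
		probe++;		/* forward to first candidate */
	if (probe == count || hashval[probe] != want)
		return -1;
	return probe;
}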
1946
1947/*
1948 * Get the value associated with an attribute name from a leaf attribute
1949 * list structure.
1950 */
1951int
1952xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
1953{
1954 int valuelen;
1955 xfs_attr_leafblock_t *leaf;
1956 xfs_attr_leaf_entry_t *entry;
1957 xfs_attr_leaf_name_local_t *name_loc;
1958 xfs_attr_leaf_name_remote_t *name_rmt;
1959
1960 leaf = bp->data;
1961 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
1962 == XFS_ATTR_LEAF_MAGIC);
1963 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
1964 < (XFS_LBSIZE(args->dp->i_mount)/8));
1965 ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT)));
1966
1967 entry = &leaf->entries[args->index];
1968 if (entry->flags & XFS_ATTR_LOCAL) {
1969 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
1970 ASSERT(name_loc->namelen == args->namelen);
1971 ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
1972 valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
1973 if (args->flags & ATTR_KERNOVAL) {
1974 args->valuelen = valuelen;
1975 return(0);
1976 }
1977 if (args->valuelen < valuelen) {
1978 args->valuelen = valuelen;
1979 return(XFS_ERROR(ERANGE));
1980 }
1981 args->valuelen = valuelen;
1982 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
1983 } else {
1984 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
1985 ASSERT(name_rmt->namelen == args->namelen);
1986 ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
1987 valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT);
1988 args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
1989 args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
1990 if (args->flags & ATTR_KERNOVAL) {
1991 args->valuelen = valuelen;
1992 return(0);
1993 }
1994 if (args->valuelen < valuelen) {
1995 args->valuelen = valuelen;
1996 return(XFS_ERROR(ERANGE));
1997 }
1998 args->valuelen = valuelen;
1999 }
2000 return(0);
2001}
2002
2003/*========================================================================
2004 * Utility routines.
2005 *========================================================================*/
2006
2007/*
2008 * Move the indicated entries from one leaf to another.
2009 * NOTE: this routine modifies both source and destination leaves.
2010 */
2011/*ARGSUSED*/
2012STATIC void
2013xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
2014 xfs_attr_leafblock_t *leaf_d, int start_d,
2015 int count, xfs_mount_t *mp)
2016{
2017 xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
2018 xfs_attr_leaf_entry_t *entry_s, *entry_d;
2019 int desti, tmp, i;
2020
2021 /*
2022 * Check for nothing to do.
2023 */
2024 if (count == 0)
2025 return;
2026
2027 /*
2028 * Set up environment.
2029 */
2030 ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
2031 == XFS_ATTR_LEAF_MAGIC);
2032 ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
2033 == XFS_ATTR_LEAF_MAGIC);
2034 hdr_s = &leaf_s->hdr;
2035 hdr_d = &leaf_d->hdr;
2036 ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
2037 && (INT_GET(hdr_s->count, ARCH_CONVERT)
2038 < (XFS_LBSIZE(mp)/8)));
2039 ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
2040 ((INT_GET(hdr_s->count, ARCH_CONVERT)
2041 * sizeof(*entry_s))+sizeof(*hdr_s)));
2042 ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
2043 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
2044 ((INT_GET(hdr_d->count, ARCH_CONVERT)
2045 * sizeof(*entry_d))+sizeof(*hdr_d)));
2046
2047 ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
2048 ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
2049 ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
2050
2051 /*
2052 * Move the entries in the destination leaf up to make a hole?
2053 */
2054 if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
2055 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
2056 tmp *= sizeof(xfs_attr_leaf_entry_t);
2057 entry_s = &leaf_d->entries[start_d];
2058 entry_d = &leaf_d->entries[start_d + count];
2059 memmove((char *)entry_d, (char *)entry_s, tmp);
2060 }
2061
2062 /*
2063	 * Copy all entries in the same (sorted) order,
2064 * but allocate attribute info packed and in sequence.
2065 */
2066 entry_s = &leaf_s->entries[start_s];
2067 entry_d = &leaf_d->entries[start_d];
2068 desti = start_d;
2069 for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
2070 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
2071 >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
2072 tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
2073#ifdef GROT
2074 /*
2075 * Code to drop INCOMPLETE entries. Difficult to use as we
2076 * may also need to change the insertion index. Code turned
2077 * off for 6.2, should be revisited later.
2078 */
2079 if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
2080 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
2081 INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
2082 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
2083 entry_d--; /* to compensate for ++ in loop hdr */
2084 desti--;
2085 if ((start_s + i) < offset)
2086 result++; /* insertion index adjustment */
2087 } else {
2088#endif /* GROT */
2089 INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
2090 /* both on-disk, don't endian flip twice */
2091 entry_d->hashval = entry_s->hashval;
2092 /* both on-disk, don't endian flip twice */
2093 entry_d->nameidx = hdr_d->firstused;
2094 entry_d->flags = entry_s->flags;
2095 ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
2096 <= XFS_LBSIZE(mp));
2097 memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
2098 XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
2099 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
2100 <= XFS_LBSIZE(mp));
2101 memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
2102 INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
2103 INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
2104 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
2105 INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
2106 tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
2107 * sizeof(xfs_attr_leaf_entry_t)
2108 + sizeof(xfs_attr_leaf_hdr_t);
2109 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
2110#ifdef GROT
2111 }
2112#endif /* GROT */
2113 }
2114
2115 /*
2116 * Zero out the entries we just copied.
2117 */
2118 if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
2119 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2120 entry_s = &leaf_s->entries[start_s];
2121 ASSERT(((char *)entry_s + tmp) <=
2122 ((char *)leaf_s + XFS_LBSIZE(mp)));
2123 memset((char *)entry_s, 0, tmp);
2124 } else {
2125 /*
2126 * Move the remaining entries down to fill the hole,
2127 * then zero the entries at the top.
2128 */
2129 tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
2130 tmp *= sizeof(xfs_attr_leaf_entry_t);
2131 entry_s = &leaf_s->entries[start_s + count];
2132 entry_d = &leaf_s->entries[start_s];
2133 memmove((char *)entry_d, (char *)entry_s, tmp);
2134
2135 tmp = count * sizeof(xfs_attr_leaf_entry_t);
2136 entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
2137 ARCH_CONVERT)];
2138 ASSERT(((char *)entry_s + tmp) <=
2139 ((char *)leaf_s + XFS_LBSIZE(mp)));
2140 memset((char *)entry_s, 0, tmp);
2141 }
2142
2143 /*
2144 * Fill in the freemap information
2145 */
2146 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
2147 sizeof(xfs_attr_leaf_hdr_t));
2148 INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
2149 INT_GET(hdr_d->count, ARCH_CONVERT)
2150 * sizeof(xfs_attr_leaf_entry_t));
2151 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
2152 INT_GET(hdr_d->firstused, ARCH_CONVERT)
2153 - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
2154 hdr_d->freemap[1].base = 0;
2155 hdr_d->freemap[2].base = 0;
2156 hdr_d->freemap[1].size = 0;
2157 hdr_d->freemap[2].size = 0;
2158 hdr_s->holes = 1; /* leaf may not be compact */
2159}
2160
2161/*
2162 * Compare two leaf blocks "order".
2163 * Return 0 unless leaf2 should go before leaf1.
2164 */
2165int
2166xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
2167{
2168 xfs_attr_leafblock_t *leaf1, *leaf2;
2169
2170 leaf1 = leaf1_bp->data;
2171 leaf2 = leaf2_bp->data;
2172 ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
2173 == XFS_ATTR_LEAF_MAGIC) &&
2174 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2175 == XFS_ATTR_LEAF_MAGIC));
2176 if ( (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0)
2177 && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0)
2178 && ( (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
2179 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT))
2180 || (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
2181 ARCH_CONVERT)-1].hashval, ARCH_CONVERT) <
2182 INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
2183 ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) {
2184 return(1);
2185 }
2186 return(0);
2187}
2188
2189/*
2190 * Pick up the last hashvalue from a leaf block.
2191 */
2192xfs_dahash_t
2193xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
2194{
2195 xfs_attr_leafblock_t *leaf;
2196
2197 leaf = bp->data;
2198 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2199 == XFS_ATTR_LEAF_MAGIC);
2200 if (count)
2201 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
2202 if (!leaf->hdr.count)
2203 return(0);
2204 return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count,
2205 ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
2206}
2207
2208/*
2209 * Calculate the number of bytes used to store the indicated attribute
2210 * (whether local or remote only calculate bytes in this block).
2211 */
2212int
2213xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
2214{
2215 xfs_attr_leaf_name_local_t *name_loc;
2216 xfs_attr_leaf_name_remote_t *name_rmt;
2217 int size;
2218
2219 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2220 == XFS_ATTR_LEAF_MAGIC);
2221 if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
2222 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
2223 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
2224 INT_GET(name_loc->valuelen,
2225 ARCH_CONVERT));
2226 } else {
2227 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
2228 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
2229 }
2230 return(size);
2231}
2232
2233/*
2234 * Calculate the number of bytes that would be required to store the new
2235 * attribute (whether local or remote only calculate bytes in this block).
2236 * This routine decides as a side effect whether the attribute will be
2237 * a "local" or a "remote" attribute.
2238 */
2239int
2240xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local)
2241{
2242 int size;
2243
2244 size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen);
2245 if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
2246 if (local) {
2247 *local = 1;
2248 }
2249 } else {
2250 size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen);
2251 if (local) {
2252 *local = 0;
2253 }
2254 }
2255 return(size);
2256}
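
/*
 * Illustrative sketch (not part of the original source): the decision
 * above prefers the local form whenever the whole name/value pair fits
 * under the local size cap for this block size; otherwise only the
 * in-block remote header is charged, since the value itself lives in
 * separately allocated blocks.  Callers use the "local" out-parameter
 * to tag the new entry (see the XFS_ATTR_LOCAL flag in leaf_add_work).
 */
static int				/* bytes charged in this block */
attr_sketch_newentsize(int local_size, int local_max, int remote_hdr_size,
		       int *is_local)
{
	if (local_size < local_max) {	/* whole pair fits in-block */
		if (is_local)
			*is_local = 1;
		return local_size;
	}
	if (is_local)			/* value goes to remote blocks */
		*is_local = 0;
	return remote_hdr_size;
}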
2257
2258/*
2259 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
2260 */
2261int
2262xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
2263{
2264 attrlist_cursor_kern_t *cursor;
2265 xfs_attr_leafblock_t *leaf;
2266 xfs_attr_leaf_entry_t *entry;
2267 xfs_attr_leaf_name_local_t *name_loc;
2268 xfs_attr_leaf_name_remote_t *name_rmt;
2269 int retval, i;
2270
2271 ASSERT(bp != NULL);
2272 leaf = bp->data;
2273 cursor = context->cursor;
2274 cursor->initted = 1;
2275
2276 xfs_attr_trace_l_cl("blk start", context, leaf);
2277
2278 /*
2279 * Re-find our place in the leaf block if this is a new syscall.
2280 */
2281 if (context->resynch) {
2282 entry = &leaf->entries[0];
2283 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
2284 entry++, i++) {
2285 if (INT_GET(entry->hashval, ARCH_CONVERT)
2286 == cursor->hashval) {
2287 if (cursor->offset == context->dupcnt) {
2288 context->dupcnt = 0;
2289 break;
2290 }
2291 context->dupcnt++;
2292 } else if (INT_GET(entry->hashval, ARCH_CONVERT)
2293 > cursor->hashval) {
2294 context->dupcnt = 0;
2295 break;
2296 }
2297 }
2298 if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
2299 xfs_attr_trace_l_c("not found", context);
2300 return(0);
2301 }
2302 } else {
2303 entry = &leaf->entries[0];
2304 i = 0;
2305 }
2306 context->resynch = 0;
2307
2308 /*
2309	 * We have found our place; start copying out the new attributes.
2310 */
2311 retval = 0;
2312 for ( ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
2313 && (retval == 0); entry++, i++) {
2314 attrnames_t *namesp;
2315
2316 if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
2317 cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
2318 cursor->offset = 0;
2319 }
2320
2321 if (entry->flags & XFS_ATTR_INCOMPLETE)
2322 continue; /* skip incomplete entries */
2323 if (((context->flags & ATTR_SECURE) != 0) !=
2324 ((entry->flags & XFS_ATTR_SECURE) != 0) &&
2325 !(context->flags & ATTR_KERNORMALS))
2326 continue; /* skip non-matching entries */
2327 if (((context->flags & ATTR_ROOT) != 0) !=
2328 ((entry->flags & XFS_ATTR_ROOT) != 0) &&
2329 !(context->flags & ATTR_KERNROOTLS))
2330 continue; /* skip non-matching entries */
2331
2332 namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :
2333 ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted :
2334 &attr_user);
2335
2336 if (entry->flags & XFS_ATTR_LOCAL) {
2337 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
2338 if (context->flags & ATTR_KERNOVAL) {
2339 ASSERT(context->flags & ATTR_KERNAMELS);
2340 context->count += namesp->attr_namelen +
2341 (int)name_loc->namelen + 1;
2342 } else {
2343 retval = xfs_attr_put_listent(context, namesp,
2344 (char *)name_loc->nameval,
2345 (int)name_loc->namelen,
2346 (int)INT_GET(name_loc->valuelen,
2347 ARCH_CONVERT));
2348 }
2349 } else {
2350 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2351 if (context->flags & ATTR_KERNOVAL) {
2352 ASSERT(context->flags & ATTR_KERNAMELS);
2353 context->count += namesp->attr_namelen +
2354 (int)name_rmt->namelen + 1;
2355 } else {
2356 retval = xfs_attr_put_listent(context, namesp,
2357 (char *)name_rmt->name,
2358 (int)name_rmt->namelen,
2359 (int)INT_GET(name_rmt->valuelen,
2360 ARCH_CONVERT));
2361 }
2362 }
2363 if (retval == 0) {
2364 cursor->offset++;
2365 }
2366 }
2367 xfs_attr_trace_l_cl("blk end", context, leaf);
2368 return(retval);
2369}
2370
2371#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
2372 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
2373#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
2374 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
2375 & ~(sizeof(u_int32_t)-1))
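
/*
 * Worked example (not in the original source): ATTR_ENTSIZE rounds the
 * entry bytes -- the offset of a_name, plus the name and its NUL -- up
 * to a multiple of sizeof(u_int32_t).  With sizeof(u_int32_t) == 4
 * spelled out:
 */
static int
attr_sketch_entsize(int entbasesize, int namelen)
{
	/* e.g. base 8, namelen 5: 8 + 5 + 1 = 14, rounded up to 16 */
	return (entbasesize + namelen + 1 + 3) & ~3;
}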
2376
2377/*
2378 * Format an attribute and copy it out to the user's buffer.
2379 * Take care to check values and protect against them changing later;
2380 * we may be reading them directly out of a user buffer.
2381 */
2382/*ARGSUSED*/
2383int
2384xfs_attr_put_listent(xfs_attr_list_context_t *context,
2385 attrnames_t *namesp, char *name, int namelen, int valuelen)
2386{
2387 attrlist_ent_t *aep;
2388 int arraytop;
2389
2390 ASSERT(!(context->flags & ATTR_KERNOVAL));
2391 if (context->flags & ATTR_KERNAMELS) {
2392 char *offset;
2393
2394 ASSERT(context->count >= 0);
2395
2396 arraytop = context->count + namesp->attr_namelen + namelen + 1;
2397 if (arraytop > context->firstu) {
2398 context->count = -1; /* insufficient space */
2399 return(1);
2400 }
2401 offset = (char *)context->alist + context->count;
2402 strncpy(offset, namesp->attr_name, namesp->attr_namelen);
2403 offset += namesp->attr_namelen;
2404 strncpy(offset, name, namelen); /* real name */
2405 offset += namelen;
2406 *offset = '\0';
2407 context->count += namesp->attr_namelen + namelen + 1;
2408 return(0);
2409 }
2410
2411 ASSERT(context->count >= 0);
2412 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
2413 ASSERT(context->firstu >= sizeof(*context->alist));
2414 ASSERT(context->firstu <= context->bufsize);
2415
2416 arraytop = sizeof(*context->alist) +
2417 context->count * sizeof(context->alist->al_offset[0]);
2418 context->firstu -= ATTR_ENTSIZE(namelen);
2419 if (context->firstu < arraytop) {
2420 xfs_attr_trace_l_c("buffer full", context);
2421 context->alist->al_more = 1;
2422 return(1);
2423 }
2424
2425 aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
2426 aep->a_valuelen = valuelen;
2427 memcpy(aep->a_name, name, namelen);
2428 aep->a_name[ namelen ] = 0;
2429 context->alist->al_offset[ context->count++ ] = context->firstu;
2430 context->alist->al_count = context->count;
2431 xfs_attr_trace_l_c("add", context);
2432 return(0);
2433}
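
#if 0	/* Editor's illustrative sketch, not part of the original source */
/*
 * How a consumer walks the attrlist buffer that xfs_attr_put_listent()
 * fills: al_offset[] grows up from the header while the entries
 * themselves are packed down from the end of the buffer (firstu).
 */
static void
example_walk_attrlist(char *buffer)
{
	attrlist_t	*al = (attrlist_t *)buffer;
	attrlist_ent_t	*aep;
	int		i;

	for (i = 0; i < al->al_count; i++) {
		aep = (attrlist_ent_t *)&buffer[ al->al_offset[i] ];
		/* aep->a_name and aep->a_valuelen are now usable */
	}
}
#endif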
2434
2435/*========================================================================
2436 * Manage the INCOMPLETE flag in a leaf entry
2437 *========================================================================*/
2438
2439/*
2440 * Clear the INCOMPLETE flag on an entry in a leaf block.
2441 */
2442int
2443xfs_attr_leaf_clearflag(xfs_da_args_t *args)
2444{
2445 xfs_attr_leafblock_t *leaf;
2446 xfs_attr_leaf_entry_t *entry;
2447 xfs_attr_leaf_name_remote_t *name_rmt;
2448 xfs_dabuf_t *bp;
2449 int error;
2450#ifdef DEBUG
2451 xfs_attr_leaf_name_local_t *name_loc;
2452 int namelen;
2453 char *name;
2454#endif /* DEBUG */
2455
2456 /*
2457 * Set up the operation.
2458 */
2459 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
2460 XFS_ATTR_FORK);
2461 if (error) {
2462 return(error);
2463 }
2464 ASSERT(bp != NULL);
2465
2466 leaf = bp->data;
2467 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2468 == XFS_ATTR_LEAF_MAGIC);
2469 ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
2470 ASSERT(args->index >= 0);
2471 entry = &leaf->entries[ args->index ];
2472 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
2473
2474#ifdef DEBUG
2475 if (entry->flags & XFS_ATTR_LOCAL) {
2476 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
2477 namelen = name_loc->namelen;
2478 name = (char *)name_loc->nameval;
2479 } else {
2480 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2481 namelen = name_rmt->namelen;
2482 name = (char *)name_rmt->name;
2483 }
2484 ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
2485 ASSERT(namelen == args->namelen);
2486 ASSERT(memcmp(name, args->name, namelen) == 0);
2487#endif /* DEBUG */
2488
2489 entry->flags &= ~XFS_ATTR_INCOMPLETE;
2490 xfs_da_log_buf(args->trans, bp,
2491 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2492
2493 if (args->rmtblkno) {
2494 ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
2495 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2496 INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
2497 INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
2498 xfs_da_log_buf(args->trans, bp,
2499 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2500 }
2501 xfs_da_buf_done(bp);
2502
2503 /*
2504 * Commit the flag value change and start the next trans in series.
2505 */
2506 error = xfs_attr_rolltrans(&args->trans, args->dp);
2507
2508 return(error);
2509}
2510
2511/*
2512 * Set the INCOMPLETE flag on an entry in a leaf block.
2513 */
2514int
2515xfs_attr_leaf_setflag(xfs_da_args_t *args)
2516{
2517 xfs_attr_leafblock_t *leaf;
2518 xfs_attr_leaf_entry_t *entry;
2519 xfs_attr_leaf_name_remote_t *name_rmt;
2520 xfs_dabuf_t *bp;
2521 int error;
2522
2523 /*
2524 * Set up the operation.
2525 */
2526 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
2527 XFS_ATTR_FORK);
2528 if (error) {
2529 return(error);
2530 }
2531 ASSERT(bp != NULL);
2532
2533 leaf = bp->data;
2534 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2535 == XFS_ATTR_LEAF_MAGIC);
2536 ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
2537 ASSERT(args->index >= 0);
2538 entry = &leaf->entries[ args->index ];
2539
2540 ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
2541 entry->flags |= XFS_ATTR_INCOMPLETE;
2542 xfs_da_log_buf(args->trans, bp,
2543 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
2544 if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
2545 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
2546 name_rmt->valueblk = 0;
2547 name_rmt->valuelen = 0;
2548 xfs_da_log_buf(args->trans, bp,
2549 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
2550 }
2551 xfs_da_buf_done(bp);
2552
2553 /*
2554 * Commit the flag value change and start the next trans in series.
2555 */
2556 error = xfs_attr_rolltrans(&args->trans, args->dp);
2557
2558 return(error);
2559}
2560
2561/*
2562 * In a single transaction, clear the INCOMPLETE flag on the leaf entry
2563 * given by args->blkno/index and set the INCOMPLETE flag on the leaf
2564 * entry given by args->blkno2/index2.
2565 *
2566 * Note that they could be in different blocks, or in the same block.
2567 */
2568int
2569xfs_attr_leaf_flipflags(xfs_da_args_t *args)
2570{
2571 xfs_attr_leafblock_t *leaf1, *leaf2;
2572 xfs_attr_leaf_entry_t *entry1, *entry2;
2573 xfs_attr_leaf_name_remote_t *name_rmt;
2574 xfs_dabuf_t *bp1, *bp2;
2575 int error;
2576#ifdef DEBUG
2577 xfs_attr_leaf_name_local_t *name_loc;
2578 int namelen1, namelen2;
2579 char *name1, *name2;
2580#endif /* DEBUG */
2581
2582 /*
2583 * Read the block containing the "old" attr
2584 */
2585 error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
2586 XFS_ATTR_FORK);
2587 if (error) {
2588 return(error);
2589 }
2590 ASSERT(bp1 != NULL);
2591
2592 /*
2593 * Read the block containing the "new" attr, if it is different
2594 */
2595 if (args->blkno2 != args->blkno) {
2596 error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
2597 -1, &bp2, XFS_ATTR_FORK);
2598 if (error) {
2599 return(error);
2600 }
2601 ASSERT(bp2 != NULL);
2602 } else {
2603 bp2 = bp1;
2604 }
2605
2606 leaf1 = bp1->data;
2607 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
2608 == XFS_ATTR_LEAF_MAGIC);
2609 ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
2610 ASSERT(args->index >= 0);
2611 entry1 = &leaf1->entries[ args->index ];
2612
2613 leaf2 = bp2->data;
2614 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2615 == XFS_ATTR_LEAF_MAGIC);
2616 ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
2617 ASSERT(args->index2 >= 0);
2618 entry2 = &leaf2->entries[ args->index2 ];
2619
2620#ifdef DEBUG
2621 if (entry1->flags & XFS_ATTR_LOCAL) {
2622 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
2623 namelen1 = name_loc->namelen;
2624 name1 = (char *)name_loc->nameval;
2625 } else {
2626 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
2627 namelen1 = name_rmt->namelen;
2628 name1 = (char *)name_rmt->name;
2629 }
2630 if (entry2->flags & XFS_ATTR_LOCAL) {
2631 name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
2632 namelen2 = name_loc->namelen;
2633 name2 = (char *)name_loc->nameval;
2634 } else {
2635 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
2636 namelen2 = name_rmt->namelen;
2637 name2 = (char *)name_rmt->name;
2638 }
2639 ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
2640 ASSERT(namelen1 == namelen2);
2641 ASSERT(memcmp(name1, name2, namelen1) == 0);
2642#endif /* DEBUG */
2643
2644 ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
2645 ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
2646
2647 entry1->flags &= ~XFS_ATTR_INCOMPLETE;
2648 xfs_da_log_buf(args->trans, bp1,
2649 XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
2650 if (args->rmtblkno) {
2651 ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
2652 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
2653 INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
2654 INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
2655 xfs_da_log_buf(args->trans, bp1,
2656 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
2657 }
2658
2659 entry2->flags |= XFS_ATTR_INCOMPLETE;
2660 xfs_da_log_buf(args->trans, bp2,
2661 XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
2662 if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
2663 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
2664 name_rmt->valueblk = 0;
2665 name_rmt->valuelen = 0;
2666 xfs_da_log_buf(args->trans, bp2,
2667 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
2668 }
2669 xfs_da_buf_done(bp1);
2670 if (bp1 != bp2)
2671 xfs_da_buf_done(bp2);
2672
2673 /*
2674 * Commit the flag value change and start the next trans in series.
2675 */
2676 error = xfs_attr_rolltrans(&args->trans, args->dp);
2677
2678 return(error);
2679}
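
/*
 * Editor's note (a sketch of the intended use, not original text): an
 * attribute replace adds the new entry with INCOMPLETE set, writes any
 * remote value blocks, then calls xfs_attr_leaf_flipflags() so the new
 * entry becomes visible in the same transaction that hides the old one,
 * and finally removes the old entry.  A crash at any point thus leaves
 * exactly one visible copy of the attribute.
 */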
2680
2681/*========================================================================
2682 * Indiscriminately delete the entire attribute fork
2683 *========================================================================*/
2684
2685/*
2686 * Recurse (gasp!) through the attribute nodes until we find leaves.
2687 * We're doing a depth-first traversal in order to invalidate everything.
2688 */
2689int
2690xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
2691{
2692 xfs_da_blkinfo_t *info;
2693 xfs_daddr_t blkno;
2694 xfs_dabuf_t *bp;
2695 int error;
2696
2697 /*
2698 * Read block 0 to see what we have to work with.
2699 * We only get here if we have extents; since we remove
2700 * the extents in reverse order, the extent containing
2701 * block 0 must still be there.
2702 */
2703 error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
2704 if (error)
2705 return(error);
2706 blkno = xfs_da_blkno(bp);
2707
2708 /*
2709 * Invalidate the tree, even if the "tree" is only a single leaf block.
2710 * This is a depth-first traversal!
2711 */
2712 info = bp->data;
2713 if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
2714 error = xfs_attr_node_inactive(trans, dp, bp, 1);
2715 } else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
2716 error = xfs_attr_leaf_inactive(trans, dp, bp);
2717 } else {
2718 error = XFS_ERROR(EIO);
2719 xfs_da_brelse(*trans, bp);
2720 }
2721 if (error)
2722 return(error);
2723
2724 /*
2725 * Invalidate the incore copy of the root block.
2726 */
2727 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
2728 if (error)
2729 return(error);
2730 xfs_da_binval(*trans, bp); /* remove from cache */
2731 /*
2732 * Commit the invalidate and start the next transaction.
2733 */
2734 error = xfs_attr_rolltrans(trans, dp);
2735
2736 return (error);
2737}
2738
2739/*
2740 * Recurse (gasp!) through the attribute nodes until we find leaves.
2741 * We're doing a depth-first traversal in order to invalidate everything.
2742 */
2743int
2744xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
2745 int level)
2746{
2747 xfs_da_blkinfo_t *info;
2748 xfs_da_intnode_t *node;
2749 xfs_dablk_t child_fsb;
2750 xfs_daddr_t parent_blkno, child_blkno;
2751 int error, count, i;
2752 xfs_dabuf_t *child_bp;
2753
2754 /*
2755 * Since this code is recursive (gasp!) we must protect ourselves.
2756 */
2757 if (level > XFS_DA_NODE_MAXDEPTH) {
2758 xfs_da_brelse(*trans, bp); /* no locks for later trans */
2759 return(XFS_ERROR(EIO));
2760 }
2761
2762 node = bp->data;
2763 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
2764 == XFS_DA_NODE_MAGIC);
2765 parent_blkno = xfs_da_blkno(bp); /* save for re-read later */
2766 count = INT_GET(node->hdr.count, ARCH_CONVERT);
2767 if (!count) {
2768 xfs_da_brelse(*trans, bp);
2769 return(0);
2770 }
2771 child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
2772 xfs_da_brelse(*trans, bp); /* no locks for later trans */
2773
2774 /*
2775 * If this is the node level just above the leaves, simply loop
2776 * over the leaves removing all of them. If this is higher up
2777 * in the tree, recurse downward.
2778 */
2779 for (i = 0; i < count; i++) {
2780 /*
2781 * Read the subsidiary block to see what we have to work with.
2782 * Don't do this in a transaction. This is a depth-first
2783 * traversal of the tree so we may deal with many blocks
2784 * before we come back to this one.
2785 */
2786 error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
2787 XFS_ATTR_FORK);
2788 if (error)
2789 return(error);
2790 if (child_bp) {
2791 /* save for re-read later */
2792 child_blkno = xfs_da_blkno(child_bp);
2793
2794 /*
2795 * Invalidate the subtree, by whatever means its block type requires.
2796 */
2797 info = child_bp->data;
2798 if (INT_GET(info->magic, ARCH_CONVERT)
2799 == XFS_DA_NODE_MAGIC) {
2800 error = xfs_attr_node_inactive(trans, dp,
2801 child_bp, level+1);
2802 } else if (INT_GET(info->magic, ARCH_CONVERT)
2803 == XFS_ATTR_LEAF_MAGIC) {
2804 error = xfs_attr_leaf_inactive(trans, dp,
2805 child_bp);
2806 } else {
2807 error = XFS_ERROR(EIO);
2808 xfs_da_brelse(*trans, child_bp);
2809 }
2810 if (error)
2811 return(error);
2812
2813 /*
2814 * Remove the subsidiary block from the cache
2815 * and from the log.
2816 */
2817 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
2818 &child_bp, XFS_ATTR_FORK);
2819 if (error)
2820 return(error);
2821 xfs_da_binval(*trans, child_bp);
2822 }
2823
2824 /*
2825 * If we're not done, re-read the parent to get the next
2826 * child block number.
2827 */
2828 if ((i+1) < count) {
2829 error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
2830 &bp, XFS_ATTR_FORK);
2831 if (error)
2832 return(error);
2833 child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
2834 xfs_da_brelse(*trans, bp);
2835 }
2836 /*
2837 * Atomically commit the invalidations done so far.
2838 */
2839 if ((error = xfs_attr_rolltrans(trans, dp)))
2840 return (error);
2841 }
2842
2843 return(0);
2844}
2845
2846/*
2847 * Invalidate all of the "remote" value regions pointed to by a particular
2848 * leaf block.
2849 * Note that we must release the lock on the buffer so that we are not
2850 * caught holding something that the logging code wants to flush to disk.
2851 */
2852int
2853xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
2854{
2855 xfs_attr_leafblock_t *leaf;
2856 xfs_attr_leaf_entry_t *entry;
2857 xfs_attr_leaf_name_remote_t *name_rmt;
2858 xfs_attr_inactive_list_t *list, *lp;
2859 int error, count, size, tmp, i;
2860
2861 leaf = bp->data;
2862 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
2863 == XFS_ATTR_LEAF_MAGIC);
2864
2865 /*
2866 * Count the number of "remote" value extents.
2867 */
2868 count = 0;
2869 entry = &leaf->entries[0];
2870 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
2871 if ( INT_GET(entry->nameidx, ARCH_CONVERT)
2872 && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2873 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2874 if (name_rmt->valueblk)
2875 count++;
2876 }
2877 }
2878
2879 /*
2880 * If there are no "remote" values, we're done.
2881 */
2882 if (count == 0) {
2883 xfs_da_brelse(*trans, bp);
2884 return(0);
2885 }
2886
2887 /*
2888 * Allocate storage for a list of all the "remote" value extents.
2889 */
2890 size = count * sizeof(xfs_attr_inactive_list_t);
2891 list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
2892
2893 /*
2894 * Identify each of the "remote" value extents.
2895 */
2896 lp = list;
2897 entry = &leaf->entries[0];
2898 for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
2899 if ( INT_GET(entry->nameidx, ARCH_CONVERT)
2900 && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2901 name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
2902 if (name_rmt->valueblk) {
2903 /* both on-disk, don't endian flip twice */
2904 lp->valueblk = name_rmt->valueblk;
2905 INT_SET(lp->valuelen, ARCH_CONVERT,
2906 XFS_B_TO_FSB(dp->i_mount,
2907 INT_GET(name_rmt->valuelen,
2908 ARCH_CONVERT)));
2909 lp++;
2910 }
2911 }
2912 }
2913 xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */
2914
2915 /*
2916 * Invalidate each of the "remote" value extents.
2917 */
2918 error = 0;
2919 for (lp = list, i = 0; i < count; i++, lp++) {
2920 tmp = xfs_attr_leaf_freextent(trans, dp,
2921 INT_GET(lp->valueblk,
2922 ARCH_CONVERT),
2923 INT_GET(lp->valuelen,
2924 ARCH_CONVERT));
2925 if (error == 0)
2926 error = tmp; /* save only the 1st errno */
2927 }
2928
2929 kmem_free((xfs_caddr_t)list, size);
2930 return(error);
2931}
2932
2933/*
2934 * Look at all the extents for this logical region,
2935 * invalidate any buffers that are incore/in transactions.
2936 */
2937int
2938xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
2939 xfs_dablk_t blkno, int blkcnt)
2940{
2941 xfs_bmbt_irec_t map;
2942 xfs_dablk_t tblkno;
2943 int tblkcnt, dblkcnt, nmap, error;
2944 xfs_daddr_t dblkno;
2945 xfs_buf_t *bp;
2946
2947 /*
2948 * Roll through the "value", invalidating the attribute value's
2949 * blocks.
2950 */
2951 tblkno = blkno;
2952 tblkcnt = blkcnt;
2953 while (tblkcnt > 0) {
2954 /*
2955 * Try to remember where we decided to put the value.
2956 */
2957 nmap = 1;
2958 error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
2959 XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
2960 NULL, 0, &map, &nmap, NULL);
2961 if (error) {
2962 return(error);
2963 }
2964 ASSERT(nmap == 1);
2965 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
2966
2967 /*
2968 * If it's a hole, these are already unmapped
2969 * so there's nothing to invalidate.
2970 */
2971 if (map.br_startblock != HOLESTARTBLOCK) {
2972
2973 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
2974 map.br_startblock);
2975 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
2976 map.br_blockcount);
2977 bp = xfs_trans_get_buf(*trans,
2978 dp->i_mount->m_ddev_targp,
2979 dblkno, dblkcnt, XFS_BUF_LOCK);
2980 xfs_trans_binval(*trans, bp);
2981 /*
2982 * Roll to next transaction.
2983 */
2984 if ((error = xfs_attr_rolltrans(trans, dp)))
2985 return (error);
2986 }
2987
2988 tblkno += map.br_blockcount;
2989 tblkcnt -= map.br_blockcount;
2990 }
2991
2992 return(0);
2993}
2994
2995
2996/*
2997 * Roll from one trans in the sequence of PERMANENT transactions to the next.
2998 */
2999int
3000xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
3001{
3002 xfs_trans_t *trans;
3003 unsigned int logres, count;
3004 int error;
3005
3006 /*
3007 * Ensure that the inode is always logged.
3008 */
3009 trans = *transp;
3010 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
3011
3012 /*
3013 * Copy the critical parameters from one trans to the next.
3014 */
3015 logres = trans->t_log_res;
3016 count = trans->t_log_count;
3017 *transp = xfs_trans_dup(trans);
3018
3019 /*
3020 * Commit the current transaction.
3021 * If this commit failed, then it'd just unlock those items that
3022 * are not marked ihold. That also means that a filesystem shutdown
3023 * is in progress. The caller takes the responsibility to cancel
3024 * the duplicate transaction that gets returned.
3025 */
3026 if ((error = xfs_trans_commit(trans, 0, NULL)))
3027 return (error);
3028
3029 trans = *transp;
3030
3031 /*
3032 * Reserve space in the log for the next transaction.
3033 * This also pushes items in the "AIL", the list of logged items,
3034 * out to disk if they are taking up space at the tail of the log
3035 * that we want to use. This requires that either nothing be locked
3036 * across this call, or that anything that is locked be logged in
3037 * the prior and the next transactions.
3038 */
3039 error = xfs_trans_reserve(trans, 0, logres, 0,
3040 XFS_TRANS_PERM_LOG_RES, count);
3041 /*
3042 * Ensure that the inode is in the new transaction and locked.
3043 */
3044 if (!error) {
3045 xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
3046 xfs_trans_ihold(trans, dp);
3047 }
3048 return (error);
3049
3050}
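
/*
 * Editor's sketch of the calling pattern (assumed, not original
 * source): long-running attribute operations do a bounded amount of
 * work per transaction and then roll, so each step commits while the
 * inode stays locked and joined:
 *
 *	while (more_work) {
 *		error = do_one_step(args);	(hypothetical helper)
 *		if (error)
 *			return error;
 *		error = xfs_attr_rolltrans(&args->trans, args->dp);
 *		if (error)
 *			return error;
 *	}
 */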
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
new file mode 100644
index 000000000000..b1480e0b3349
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -0,0 +1,308 @@
1/*
2 * Copyright (c) 2000, 2002-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_LEAF_H__
33#define __XFS_ATTR_LEAF_H__
34
35/*
36 * Attribute storage layout, internal structure, access macros, etc.
37 *
38 * Attribute lists are structured around Btrees where all the data
39 * elements are in the leaf nodes. Attribute names are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of an attribute name may not be unique, we may have duplicate keys. The
42 * internal links in the Btree are logical block offsets into the file.
43 */
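
/*
 * Editor's sketch of the name hash (illustrative only; the real
 * routine lives in the da-btree code as xfs_da_hashname()).  The hash
 * folds the name into 32 bits with a 7-bit rotate per byte:
 *
 *	hash = 0;
 *	for (i = 0; i < namelen; i++)
 *		hash = name[i] ^ rol32(hash, 7);
 */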
44
45struct attrlist;
46struct attrlist_cursor_kern;
47struct attrnames;
48struct xfs_dabuf;
49struct xfs_da_args;
50struct xfs_da_state;
51struct xfs_da_state_blk;
52struct xfs_inode;
53struct xfs_trans;
54
55/*========================================================================
56 * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
57 *========================================================================*/
58
59/*
60 * This is the structure of the leaf nodes in the Btree.
61 *
62 * Struct leaf_entry's are packed from the top. Name/values grow from the
63 * bottom but are not packed. The freemap contains run-length-encoded entries
64 * for the free bytes after the leaf_entry's, but only the N largest such;
65 * smaller runs are dropped. When the freemap doesn't show enough space
66 * for an allocation, we compact the name/value area and try again. If we
67 * still don't have enough space, then we have to split the block. The
68 * name/value structs (both local and remote versions) must be 32bit aligned.
69 *
70 * Since we have duplicate hash keys, for each key that matches, compare
71 * the actual name string. The root and intermediate node search always
72 * takes the first-in-the-block key match found, so we should only have
73 * to work "forw"ard. If none matches, continue with the "forw"ard leaf
74 * nodes until the hash key changes or the attribute name is found.
75 *
76 * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
77 * the leaf_entry. The namespaces are independent only because we also look
78 * at the namespace bit when we are looking for a matching attribute name.
79 *
80 * We also store an "incomplete" bit in the leaf_entry. It shows that an
81 * attribute is in the middle of being created and should not be shown to
82 * the user if we crash during the time that the bit is set. We clear the
83 * bit when we have finished setting up the attribute. We do this because
84 * we cannot create some large attributes inside a single transaction, and we
85 * need some indication that we weren't finished if we crash in the middle.
86 */
87#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
88
89typedef struct xfs_attr_leafblock {
90 struct xfs_attr_leaf_hdr { /* constant-structure header block */
91 xfs_da_blkinfo_t info; /* block type, links, etc. */
92 __uint16_t count; /* count of active leaf_entry's */
93 __uint16_t usedbytes; /* num bytes of names/values stored */
94 __uint16_t firstused; /* first used byte in name area */
95 __uint8_t holes; /* != 0 if blk needs compaction */
96 __uint8_t pad1;
97 struct xfs_attr_leaf_map { /* RLE map of free bytes */
98 __uint16_t base; /* base of free region */
99 __uint16_t size; /* length of free region */
100 } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
101 } hdr;
102 struct xfs_attr_leaf_entry { /* sorted on key, not name */
103 xfs_dahash_t hashval; /* hash value of name */
104 __uint16_t nameidx; /* index into buffer of name/value */
105 __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
106 __uint8_t pad2; /* unused pad byte */
107 } entries[1]; /* variable sized array */
108 struct xfs_attr_leaf_name_local {
109 __uint16_t valuelen; /* number of bytes in value */
110 __uint8_t namelen; /* length of name bytes */
111 __uint8_t nameval[1]; /* name/value bytes */
112 } namelist; /* grows from bottom of buf */
113 struct xfs_attr_leaf_name_remote {
114 xfs_dablk_t valueblk; /* block number of value bytes */
115 __uint32_t valuelen; /* number of bytes in value */
116 __uint8_t namelen; /* length of name bytes */
117 __uint8_t name[1]; /* name bytes */
118 } valuelist; /* grows from bottom of buf */
119} xfs_attr_leafblock_t;
120typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
121typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
122typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
123typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
124typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
125
126/*
127 * Flags used in the leaf_entry[i].flags field.
128 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
129 * on the system call, they are "or"ed together for various operations.
130 */
131#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
132#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
133#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
134#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
135#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
136#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
137#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
138#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
139
140/*
141 * Alignment for namelist and valuelist entries (since they are mixed,
142 * there can be only one alignment value)
143 */
144#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
145
146/*
147 * Cast typed pointers for "local" and "remote" name/value structs.
148 */
149#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
150xfs_attr_leaf_name_remote_t *
151xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx);
152#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \
153 xfs_attr_leaf_name_remote(leafp,idx)
154#else
155#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) /* remote name struct ptr */ \
156 ((xfs_attr_leaf_name_remote_t *) \
157 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
158#endif
159#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
160xfs_attr_leaf_name_local_t *
161xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx);
162#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \
163 xfs_attr_leaf_name_local(leafp,idx)
164#else
165#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) /* local name struct ptr */ \
166 ((xfs_attr_leaf_name_local_t *) \
167 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
168#endif
169#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME)
170char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx);
171#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx)
172#else
173#define XFS_ATTR_LEAF_NAME(leafp,idx) /* generic name struct ptr */ \
174 (&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
175#endif
176
177/*
178 * Calculate total bytes used (including trailing pad for alignment) for
179 * a "local" name/value structure, a "remote" name/value structure, and
180 * a pointer which might be either.
181 */
182#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
183int xfs_attr_leaf_entsize_remote(int nlen);
184#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \
185 xfs_attr_leaf_entsize_remote(nlen)
186#else
187#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) /* space for remote struct */ \
188 (((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
189 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
190#endif
191#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
192int xfs_attr_leaf_entsize_local(int nlen, int vlen);
193#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \
194 xfs_attr_leaf_entsize_local(nlen,vlen)
195#else
196#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) /* space for local struct */ \
197 (((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \
198 XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
199#endif
200#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
201int xfs_attr_leaf_entsize_local_max(int bsize);
202#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \
203 xfs_attr_leaf_entsize_local_max(bsize)
204#else
205#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) /* max local struct size */ \
206 (((bsize) >> 1) + ((bsize) >> 2))
207#endif
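
/*
 * Editor's worked example (assuming sizeof(xfs_attr_leaf_name_local_t)
 * == 4 and XFS_ATTR_LEAF_NAME_ALIGN == 4): a local attribute with a
 * 4-byte name and a 10-byte value consumes
 *
 *	XFS_ATTR_LEAF_ENTSIZE_LOCAL(4, 10) = (4 - 1 + 4 + 10 + 3) & ~3
 *					   = 20 bytes
 *
 * of the name/value area, plus one leaf_entry slot near the header.
 */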
208
209
210/*========================================================================
211 * Structure used to pass context around among the routines.
212 *========================================================================*/
213
214typedef struct xfs_attr_list_context {
215 struct xfs_inode *dp; /* inode */
216 struct attrlist_cursor_kern *cursor;/* position in list */
217 struct attrlist *alist; /* output buffer */
218 int count; /* num used entries */
219 int dupcnt; /* count dup hashvals seen */
220 int bufsize;/* total buffer size */
221 int firstu; /* first used byte in buffer */
222 int flags; /* from VOP call */
223 int resynch;/* T/F: resynch with cursor */
224} xfs_attr_list_context_t;
225
226/*
227 * Used to keep a list of "remote value" extents when unlinking an inode.
228 */
229typedef struct xfs_attr_inactive_list {
230 xfs_dablk_t valueblk; /* block number of value bytes */
231 int valuelen; /* number of bytes in value */
232} xfs_attr_inactive_list_t;
233
234
235/*========================================================================
236 * Function prototypes for the kernel.
237 *========================================================================*/
238
239/*
240 * Internal routines when dirsize < XFS_LITINO(mp).
241 */
242int xfs_attr_shortform_create(struct xfs_da_args *args);
243int xfs_attr_shortform_add(struct xfs_da_args *add);
244int xfs_attr_shortform_lookup(struct xfs_da_args *args);
245int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
246int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
247int xfs_attr_shortform_remove(struct xfs_da_args *remove);
248int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
249int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
250
251/*
252 * Internal routines when dirsize == XFS_LBSIZE(mp).
253 */
254int xfs_attr_leaf_to_node(struct xfs_da_args *args);
255int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
256 struct xfs_da_args *args);
257int xfs_attr_leaf_clearflag(struct xfs_da_args *args);
258int xfs_attr_leaf_setflag(struct xfs_da_args *args);
259int xfs_attr_leaf_flipflags(xfs_da_args_t *args);
260
261/*
262 * Routines used for growing the Btree.
263 */
264int xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
265 struct xfs_dabuf **bpp);
266int xfs_attr_leaf_split(struct xfs_da_state *state,
267 struct xfs_da_state_blk *oldblk,
268 struct xfs_da_state_blk *newblk);
269int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
270 struct xfs_da_args *args);
271int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
272int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
273 struct xfs_da_args *args);
274int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
275 struct xfs_da_args *args);
276int xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
277 struct xfs_attr_list_context *context);
278
279/*
280 * Routines used for shrinking the Btree.
281 */
282int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
283void xfs_attr_leaf_unbalance(struct xfs_da_state *state,
284 struct xfs_da_state_blk *drop_blk,
285 struct xfs_da_state_blk *save_blk);
286int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
287int xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
288 struct xfs_dabuf *bp, int level);
289int xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
290 struct xfs_dabuf *bp);
291int xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp,
292 xfs_dablk_t blkno, int blkcnt);
293
294/*
295 * Utility routines.
296 */
297xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
298int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
299 struct xfs_dabuf *leaf2_bp);
300int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize,
301 int *local);
302int xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index);
303int xfs_attr_put_listent(struct xfs_attr_list_context *context,
304 struct attrnames *, char *name, int namelen,
305 int valuelen);
306int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
307
308#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
new file mode 100644
index 000000000000..ef7d2942d306
--- /dev/null
+++ b/fs/xfs/xfs_attr_sf.h
@@ -0,0 +1,149 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ATTR_SF_H__
33#define __XFS_ATTR_SF_H__
34
35/*
36 * Attribute storage when stored inside the inode.
37 *
38 * Small attribute lists are packed as tightly as possible so as
39 * to fit into the literal area of the inode.
40 */
41
42struct xfs_inode;
43
44/*
45 * Entries are packed toward the top as tightly as possible.
46 */
47typedef struct xfs_attr_shortform {
48 struct xfs_attr_sf_hdr { /* constant-structure header block */
49 __uint16_t totsize; /* total bytes in shortform list */
50 __uint8_t count; /* count of active entries */
51 } hdr;
52 struct xfs_attr_sf_entry {
53 __uint8_t namelen; /* actual length of name (no NULL) */
54 __uint8_t valuelen; /* actual length of value (no NULL) */
55 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
56 __uint8_t nameval[1]; /* name & value bytes concatenated */
57 } list[1]; /* variable sized array */
58} xfs_attr_shortform_t;
59typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
60typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
61
62/*
63 * We generate this, then sort it; attr_list() must return things in hash-order.
64 */
65typedef struct xfs_attr_sf_sort {
66 __uint8_t entno; /* entry number in original list */
67 __uint8_t namelen; /* length of name value (no null) */
68 __uint8_t valuelen; /* length of value */
69 __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
70 xfs_dahash_t hash; /* this entry's hash value */
71 char *name; /* name value, pointer into buffer */
72} xfs_attr_sf_sort_t;
73
74#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
75int xfs_attr_sf_entsize_byname(int nlen, int vlen);
76#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) \
77 xfs_attr_sf_entsize_byname(nlen,vlen)
78#else
79#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \
80 ((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))
81#endif
82#define XFS_ATTR_SF_ENTSIZE_MAX /* max space for name&value */ \
83 ((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
84#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE)
85int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep);
86#define XFS_ATTR_SF_ENTSIZE(sfep) xfs_attr_sf_entsize(sfep)
87#else
88#define XFS_ATTR_SF_ENTSIZE(sfep) /* space an entry uses */ \
89 ((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY)
92xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep);
93#define XFS_ATTR_SF_NEXTENTRY(sfep) xfs_attr_sf_nextentry(sfep)
94#else
95#define XFS_ATTR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
96 ((xfs_attr_sf_entry_t *) \
97 ((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
98#endif
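
#if 0	/* Editor's illustrative sketch, not part of the original source */
/*
 * Walking a shortform attribute list: entries are variable-sized and
 * simply concatenated, so advance with XFS_ATTR_SF_NEXTENTRY() rather
 * than array indexing.
 */
static void
example_walk_shortform(struct xfs_inode *dp)
{
	xfs_attr_shortform_t	*sf;
	xfs_attr_sf_entry_t	*sfe;
	int			i;

	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
	sfe = &sf->list[0];
	for (i = 0; i < sf->hdr.count; i++) {
		/* sfe->nameval: namelen name bytes, then valuelen value bytes */
		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
	}
}
#endif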
99#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE)
100int xfs_attr_sf_totsize(struct xfs_inode *dp);
101#define XFS_ATTR_SF_TOTSIZE(dp) xfs_attr_sf_totsize(dp)
102#else
103#define XFS_ATTR_SF_TOTSIZE(dp) /* total space in use */ \
104 (INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
105#endif
106
107#if defined(XFS_ATTR_TRACE)
108/*
109 * Kernel tracing support for attribute lists
110 */
111struct xfs_attr_list_context;
112struct xfs_da_intnode;
113struct xfs_da_node_entry;
114struct xfs_attr_leafblock;
115
116#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */
117extern ktrace_t *xfs_attr_trace_buf;
118
119/*
120 * Trace record types.
121 */
122#define XFS_ATTR_KTRACE_L_C 1 /* context */
123#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */
124#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */
125#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */
126
127void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
128void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
129 struct xfs_da_intnode *node);
130void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
131 struct xfs_da_node_entry *btree);
132void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
133 struct xfs_attr_leafblock *leaf);
134void xfs_attr_trace_enter(int type, char *where,
135 __psunsigned_t a2, __psunsigned_t a3,
136 __psunsigned_t a4, __psunsigned_t a5,
137 __psunsigned_t a6, __psunsigned_t a7,
138 __psunsigned_t a8, __psunsigned_t a9,
139 __psunsigned_t a10, __psunsigned_t a11,
140 __psunsigned_t a12, __psunsigned_t a13,
141 __psunsigned_t a14, __psunsigned_t a15);
142#else
143#define xfs_attr_trace_l_c(w,c)
144#define xfs_attr_trace_l_cn(w,c,n)
145#define xfs_attr_trace_l_cb(w,c,b)
146#define xfs_attr_trace_l_cl(w,c,l)
147#endif /* XFS_ATTR_TRACE */
148
149#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
new file mode 100644
index 000000000000..16088e175ecc
--- /dev/null
+++ b/fs/xfs/xfs_behavior.c
@@ -0,0 +1,218 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 *
32 */
33#include "xfs.h"
34
35/*
36 * Source file used to associate/disassociate behaviors with virtualized
37 * objects. See xfs_behavior.h for more information about behaviors, etc.
38 *
39 * The implementation is split between functions in this file and macros
40 * in xfs_behavior.h.
41 */
42
43/*
44 * Insert a new behavior descriptor into a behavior chain.
45 *
46 * The behavior chain is ordered based on the 'position' number which
47 * lives in the first field of the ops vector (higher numbers first).
48 *
49 * Attempts to insert duplicate ops result in an EINVAL return code.
50 * Otherwise, return 0 to indicate success.
51 */
52int
53bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp)
54{
55 bhv_desc_t *curdesc, *prev;
56 int position;
57
58 /*
59 * Validate the position value of the new behavior.
60 */
61 position = BHV_POSITION(bdp);
62 ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP);
63
64 /*
65 * Find location to insert behavior. Check for duplicates.
66 */
67 prev = NULL;
68 for (curdesc = bhp->bh_first;
69 curdesc != NULL;
70 curdesc = curdesc->bd_next) {
71
72 /* Check for duplication. */
73 if (curdesc->bd_ops == bdp->bd_ops) {
74 ASSERT(0);
75 return EINVAL;
76 }
77
78 /* Find correct position */
79 if (position >= BHV_POSITION(curdesc)) {
80 ASSERT(position != BHV_POSITION(curdesc));
81 break; /* found it */
82 }
83
84 prev = curdesc;
85 }
86
87 if (prev == NULL) {
88 /* insert at front of chain */
89 bdp->bd_next = bhp->bh_first;
90 bhp->bh_first = bdp;
91 } else {
92 /* insert after prev */
93 bdp->bd_next = prev->bd_next;
94 prev->bd_next = bdp;
95 }
96
97 return 0;
98}
99
100/*
101 * Remove a behavior descriptor from a position in a behavior chain;
102 * the position is guaranteed not to be the first position.
103 * Should only be called by the bhv_remove() macro.
104 */
105void
106bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
107{
108 bhv_desc_t *curdesc, *prev;
109
110 ASSERT(bhp->bh_first != NULL);
111 ASSERT(bhp->bh_first->bd_next != NULL);
112
113 prev = bhp->bh_first;
114 for (curdesc = bhp->bh_first->bd_next;
115 curdesc != NULL;
116 curdesc = curdesc->bd_next) {
117
118 if (curdesc == bdp)
119 break; /* found it */
120 prev = curdesc;
121 }
122
123 ASSERT(curdesc == bdp);
124 prev->bd_next = bdp->bd_next; /* remove from after prev */
125}
126
127/*
128 * Look for a specific ops vector on the specified behavior chain.
129 * Return the associated behavior descriptor, or NULL if not found.
130 */
131bhv_desc_t *
132bhv_lookup(bhv_head_t *bhp, void *ops)
133{
134 bhv_desc_t *curdesc;
135
136 for (curdesc = bhp->bh_first;
137 curdesc != NULL;
138 curdesc = curdesc->bd_next) {
139
140 if (curdesc->bd_ops == ops)
141 return curdesc;
142 }
143
144 return NULL;
145}
146
147/*
148 * Look for the first behavior within a specified range of positions.
149 * Return the associated behavior descriptor, or NULL if none is found.
150 */
151bhv_desc_t *
152bhv_lookup_range(bhv_head_t *bhp, int low, int high)
153{
154 bhv_desc_t *curdesc;
155
156 for (curdesc = bhp->bh_first;
157 curdesc != NULL;
158 curdesc = curdesc->bd_next) {
159
160 int position = BHV_POSITION(curdesc);
161
162 if (position <= high) {
163 if (position >= low)
164 return curdesc;
165 return NULL;
166 }
167 }
168
169 return NULL;
170}
171
172/*
173 * Return the base behavior in the chain, or NULL if the chain
174 * is empty.
175 *
176 * No behavior-chain locking is done here; on Linux there is no bhv
177 * locking (see the bhv_base_unlocked alias in xfs_behavior.h).
178 */
179bhv_desc_t *
180bhv_base(bhv_head_t *bhp)
181{
182 bhv_desc_t *curdesc;
183
184 for (curdesc = bhp->bh_first;
185 curdesc != NULL;
186 curdesc = curdesc->bd_next) {
187
188 if (curdesc->bd_next == NULL) {
189 return curdesc;
190 }
191 }
192
193 return NULL;
194}
195
196void
197bhv_head_init(
198 bhv_head_t *bhp,
199 char *name)
200{
201 bhp->bh_first = NULL;
202}
203
204void
205bhv_insert_initial(
206 bhv_head_t *bhp,
207 bhv_desc_t *bdp)
208{
209 ASSERT(bhp->bh_first == NULL);
210 (bhp)->bh_first = bdp;
211}
212
213void
214bhv_head_destroy(
215 bhv_head_t *bhp)
216{
217 ASSERT(bhp->bh_first == NULL);
218}
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
new file mode 100644
index 000000000000..d5ed5a843921
--- /dev/null
+++ b/fs/xfs/xfs_behavior.h
@@ -0,0 +1,204 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BEHAVIOR_H__
33#define __XFS_BEHAVIOR_H__
34
35/*
36 * Header file used to associate behaviors with virtualized objects.
37 *
38 * A virtualized object is an internal, virtualized representation of
39 * OS entities such as persistent files, processes, or sockets. Examples
40 * of virtualized objects include vnodes, vprocs, and vsockets. Often
41 * a virtualized object is referred to simply as an "object."
42 *
43 * A behavior is essentially an implementation layer associated with
44 * an object. Multiple behaviors for an object are chained together,
45 * the order of chaining determining the order of invocation. Each
46 * behavior of a given object implements the same set of interfaces
47 * (e.g., the VOP interfaces).
48 *
49 * Behaviors may be dynamically inserted into an object's behavior chain,
50 * such that the addition is transparent to consumers that already have
51 * references to the object. Typically, a given behavior will be inserted
52 * at a particular location in the behavior chain. Insertion of new
53 * behaviors is synchronized with operations-in-progress (oip's) so that
54 * the oip's always see a consistent view of the chain.
55 *
56 * The term "interposition" is used to refer to the act of inserting
57 * a behavior such that it interposes on (i.e., is inserted in front
58 * of) a particular other behavior. A key example of this is when a
59 * system implementing distributed single system image wishes to
60 * interpose a distribution layer (providing distributed coherency)
61 * in front of an object that is otherwise only accessed locally.
62 *
63 * Note that the traditional vnode/inode combination is simply a virtualized
64 * object that has exactly one associated behavior.
65 *
66 * Behavior synchronization is the logic needed, under certain
67 * circumstances, to ensure there is no conflict between ongoing
68 * operations traversing the behavior chain and those dynamically
69 * modifying the chain. Because behavior synchronization adds extra overhead
70 * to virtual operation invocation, we want to restrict, as much as
71 * we can, the requirement for this extra code, to those situations
72 * in which it is truly necessary.
73 *
74 * Behavior synchronization is needed whenever there's at least one class
75 * of object in the system for which:
76 * 1) multiple behaviors for a given object are supported,
77 * -- AND --
78 * 2a) insertion of a new behavior can happen dynamically at any time during
79 * the life of an active object,
80 * -- AND --
81 * 3a) insertion of a new behavior needs to synchronize with existing
82 * ops-in-progress.
83 * -- OR --
84 * 3b) multiple different behaviors can be dynamically inserted at
85 * any time during the life of an active object
86 * -- OR --
87 * 3c) removal of a behavior can occur at any time during the life of
88 * an active object.
89 * -- OR --
90 * 2b) removal of a behavior can occur at any time during the life of an
91 * active object
92 *
93 */
94
95struct bhv_head_lock;
96
97/*
98 * Behavior head. Head of the chain of behaviors.
99 * Contained within each virtualized object data structure.
100 */
101typedef struct bhv_head {
102 struct bhv_desc *bh_first; /* first behavior in chain */
103 struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */
104} bhv_head_t;
105
106/*
107 * Behavior descriptor. Descriptor associated with each behavior.
108 * Contained within the behavior's private data structure.
109 */
110typedef struct bhv_desc {
111 void *bd_pdata; /* private data for this behavior */
112 void *bd_vobj; /* virtual object associated with */
113 void *bd_ops; /* ops for this behavior */
114 struct bhv_desc *bd_next; /* next behavior in chain */
115} bhv_desc_t;
116
117/*
118 * Behavior identity field. A behavior's identity determines the position
119 * where it lives within a behavior chain, and it's always the first field
120 * of the behavior's ops vector. The optional id field further identifies the
121 * subsystem responsible for the behavior.
122 */
123typedef struct bhv_identity {
124 __u16 bi_id; /* owning subsystem id */
125 __u16 bi_position; /* position in chain */
126} bhv_identity_t;
127
128typedef bhv_identity_t bhv_position_t;
129
130#define BHV_IDENTITY_INIT(id,pos) {id, pos}
131#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
132
133/*
134 * Define boundaries of position values.
135 */
136#define BHV_POSITION_INVALID 0 /* invalid position number */
137#define BHV_POSITION_BASE 1 /* base (last) implementation layer */
138#define BHV_POSITION_TOP 63 /* top (first) implementation layer */
139
140/*
141 * Plumbing macros.
142 */
143#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first)
144#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next)
145#define BHV_NEXTNULL(bdp) ((bdp)->bd_next)
146#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
147#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj)
148#define BHV_PDATA(bdp) (bdp)->bd_pdata
149#define BHV_OPS(bdp) (bdp)->bd_ops
150#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops)
151#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position)
152
153extern void bhv_head_init(bhv_head_t *, char *);
154extern void bhv_head_destroy(bhv_head_t *);
155extern int bhv_insert(bhv_head_t *, bhv_desc_t *);
156extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
157
158/*
159 * Initialize a new behavior descriptor.
160 * Arguments:
161 * bdp - pointer to behavior descriptor
162 * pdata - pointer to behavior's private data
163 * vobj - pointer to associated virtual object
164 * ops - pointer to ops for this behavior
165 */
166#define bhv_desc_init(bdp, pdata, vobj, ops) \
167 { \
168 (bdp)->bd_pdata = pdata; \
169 (bdp)->bd_vobj = vobj; \
170 (bdp)->bd_ops = ops; \
171 (bdp)->bd_next = NULL; \
172 }
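
/*
 * Editor's sketch of typical use (the my_* names are hypothetical):
 * the ops vector must begin with a bhv_position_t, since BHV_POSITION()
 * reads the position out of the first field of bd_ops.
 *
 *	static struct my_ops {
 *		bhv_position_t	position;	(must be first)
 *		...operation function pointers...
 *	} my_ops = {
 *		BHV_IDENTITY_INIT_POSITION(BHV_POSITION_BASE),
 *		...
 *	};
 *
 *	bhv_desc_init(&bdp, my_private_data, vobj, &my_ops);
 *	error = bhv_insert(bhvhead, &bdp);
 */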
173
174/*
175 * Remove a behavior descriptor from a behavior chain.
176 */
177#define bhv_remove(bhp, bdp) \
178 { \
179 if ((bhp)->bh_first == (bdp)) { \
180 /* \
181 * Remove from front of chain. \
182 * Atomic wrt oip's. \
183 */ \
184 (bhp)->bh_first = (bdp)->bd_next; \
185 } else { \
186 /* remove from non-front of chain */ \
187 bhv_remove_not_first(bhp, bdp); \
188 } \
189 (bdp)->bd_vobj = NULL; \
190 }
191
192/*
193 * Behavior module prototypes.
194 */
195extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
196extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops);
197extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high);
198extern bhv_desc_t * bhv_base(bhv_head_t *bhp);
199
200/* No bhv locking on Linux */
201#define bhv_lookup_unlocked bhv_lookup
202#define bhv_base_unlocked bhv_base
203
204#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
new file mode 100644
index 000000000000..a20a6c3dc13e
--- /dev/null
+++ b/fs/xfs/xfs_bit.c
@@ -0,0 +1,312 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * XFS bit manipulation routines, used in non-realtime code.
35 */
36
37#include "xfs.h"
38#include "xfs_bit.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_buf_item.h"
42
43
44#ifndef HAVE_ARCH_HIGHBIT
45/*
46 * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
47 */
48const char xfs_highbit[256] = {
49 -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
50 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
51 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
52 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
53 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
54 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
55 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
56 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
57 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
58 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
59 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
60 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
61 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
62 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
63 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
64 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
65 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
66 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
67 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
68 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
69 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
70 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
71 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
72 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
73 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
74 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
75 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
76 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
77 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
78 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
79 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
80 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
81};
82#endif
83
84/*
85 * Count of bits set in byte, 0..8.
86 */
87static const char xfs_countbit[256] = {
88 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */
89 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */
90 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */
91 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */
92 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */
93 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */
94 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */
95 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */
96 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */
97 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */
98 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */
99 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */
100 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */
101 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */
102 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */
103 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */
104 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */
105 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */
106 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */
107 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */
108 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */
109 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */
110 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */
111 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */
112 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */
113 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */
114 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */
115 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */
116 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */
117 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */
118 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */
119 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */
120};
121
122/*
123 * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
124 */
125inline int
126xfs_highbit32(
127 __uint32_t v)
128{
129#ifdef HAVE_ARCH_HIGHBIT
130 return highbit32(v);
131#else
132 int i;
133
134 if (v & 0xffff0000)
135 if (v & 0xff000000)
136 i = 24;
137 else
138 i = 16;
139 else if (v & 0x0000ffff)
140 if (v & 0x0000ff00)
141 i = 8;
142 else
143 i = 0;
144 else
145 return -1;
146 return i + xfs_highbit[(v >> i) & 0xff];
147#endif
148}
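/*
 * Editor's examples (not from the original source): how the two-level
 * decomposition above resolves a few inputs.
 *
 *	xfs_highbit32(0)          == -1	(no bits set)
 *	xfs_highbit32(1)          ==  0	(i = 0,  xfs_highbit[0x01] == 0)
 *	xfs_highbit32(0x00008000) == 15	(i = 8,  xfs_highbit[0x80] == 7)
 *	xfs_highbit32(0x80000000) == 31	(i = 24, xfs_highbit[0x80] == 7)
 */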
149
150/*
151 * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
152 */
153int
154xfs_lowbit64(
155 __uint64_t v)
156{
157 __uint32_t w = (__uint32_t)v;
158 int n = 0;
159
160 if (w) { /* lower bits */
161 n = ffs(w);
162 } else { /* upper bits */
163 w = (__uint32_t)(v >> 32);
164 if (w && (n = ffs(w)))
165 n += 32;
166 }
167 return n - 1;
168}
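/*
 * Editor's note (not in the original source): ffs() returns a 1-based
 * bit index, with ffs(0) == 0, hence the "n - 1" above.  For example:
 *
 *	xfs_lowbit64(0)                     == -1
 *	xfs_lowbit64(1ULL)                  ==  0	(ffs(1) == 1)
 *	xfs_lowbit64(0x0000000100000000ULL) == 32	(upper-word path)
 */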
169
170/*
171 * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
172 */
173int
174xfs_highbit64(
175 __uint64_t v)
176{
177 __uint32_t h = (__uint32_t)(v >> 32);
178
179 if (h)
180 return xfs_highbit32(h) + 32;
181 return xfs_highbit32((__uint32_t)v);
182}
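/*
 * Editor's example (not in the original source):
 *
 *	xfs_highbit64(0x0000000100000000ULL) == 32	(high word is 1,
 *	so the result is xfs_highbit32(1) + 32)
 */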
183
184
185/*
186 * Count the number of bits set in the bitmap starting with bit
187 * start_bit. Size is the size of the bitmap in words.
188 *
189 * Do the counting by mapping a byte value to the number of set
190 * bits for that value using the xfs_countbit array, i.e.
191 * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
192 * xfs_countbit[3] == 2, etc.
193 */
194int
195xfs_count_bits(uint *map, uint size, uint start_bit)
196{
197 register int bits;
198 register unsigned char *bytep;
199 register unsigned char *end_map;
200 int byte_bit;
201
202 bits = 0;
203 end_map = (char*)(map + size);
204 bytep = (char*)(map + (start_bit & ~0x7));
205 byte_bit = start_bit & 0x7;
206
207 /*
208 * If the caller fell off the end of the map, return 0.
209 */
210 if (bytep >= end_map) {
211 return (0);
212 }
213
214 /*
215 * If start_bit is not byte aligned, then process the
216 * first byte separately.
217 */
218 if (byte_bit != 0) {
219 /*
220 * Shift off the bits we don't want to look at,
221 * before indexing into xfs_countbit.
222 */
223 bits += xfs_countbit[(*bytep >> byte_bit)];
224 bytep++;
225 }
226
227 /*
228 * Count the bits in each byte until the end of the bitmap.
229 */
230 while (bytep < end_map) {
231 bits += xfs_countbit[*bytep];
232 bytep++;
233 }
234
235 return (bits);
236}
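/*
 * Editor's example (not from the original source), assuming a
 * little-endian word layout so that byte 0 of the map holds its low
 * bits: with a single-word map of 0xf0 (bits 4..7 set),
 *
 *	xfs_count_bits(&map, 1, 0) == 4	(all four set bits counted)
 *	xfs_count_bits(&map, 1, 5) == 3	(0xf0 >> 5 == 0x7, three bits)
 */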
237
238/*
239 * Count the number of contiguous bits set in the bitmap starting with bit
240 * start_bit. Size is the size of the bitmap in words.
241 */
242int
243xfs_contig_bits(uint *map, uint size, uint start_bit)
244{
245 uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
246 uint result = 0;
247 uint tmp;
248
249 size <<= BIT_TO_WORD_SHIFT;
250
251 ASSERT(start_bit < size);
252 size -= start_bit & ~(NBWORD - 1);
253 start_bit &= (NBWORD - 1);
254 if (start_bit) {
255 tmp = *p++;
256 /* force the bits below start_bit to one */
257 tmp |= (~0U >> (NBWORD-start_bit));
258 if (tmp != ~0U)
259 goto found;
260 result += NBWORD;
261 size -= NBWORD;
262 }
263 while (size) {
264 if ((tmp = *p++) != ~0U)
265 goto found;
266 result += NBWORD;
267 size -= NBWORD;
268 }
269 return result - start_bit;
270found:
271 return result + ffz(tmp) - start_bit;
272}
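/*
 * Editor's example (not from the original source), assuming 32-bit
 * words (NBWORD == 32, BIT_TO_WORD_SHIFT == 5): with a single-word
 * map of 0x0000001e (bits 1..4 set),
 *
 *	xfs_contig_bits(&map, 1, 1) == 4	(run spans bits 1..4;
 *	ffz(0x1e | 0x1) == 5, minus start_bit 1)
 *	xfs_contig_bits(&map, 1, 3) == 2	(only bits 3..4 remain)
 */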
273
274/*
275 * This takes the bit number to start looking from and
276 * returns the next set bit from there. It returns -1
277 * if there are no more bits set or the start bit is
278 * beyond the end of the bitmap.
279 *
280 * Size is the number of words, not bytes, in the bitmap.
281 */
282int xfs_next_bit(uint *map, uint size, uint start_bit)
283{
284 uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
285 uint result = start_bit & ~(NBWORD - 1);
286 uint tmp;
287
288 size <<= BIT_TO_WORD_SHIFT;
289
290 if (start_bit >= size)
291 return -1;
292 size -= result;
293 start_bit &= (NBWORD - 1);
294 if (start_bit) {
295 tmp = *p++;
296 /* clear the bits below start_bit */
297 tmp &= (~0U << start_bit);
298 if (tmp != 0U)
299 goto found;
300 result += NBWORD;
301 size -= NBWORD;
302 }
303 while (size) {
304 if ((tmp = *p++) != 0U)
305 goto found;
306 result += NBWORD;
307 size -= NBWORD;
308 }
309 return -1;
310found:
311 return result + ffs(tmp) - 1;
312}
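/*
 * Editor's sketch -- a hypothetical caller walking every set bit in a
 * map with the routine above; process() is a made-up per-bit handler:
 *
 *	for (bit = xfs_next_bit(map, size, 0);
 *	     bit != -1;
 *	     bit = xfs_next_bit(map, size, bit + 1))
 *		process(bit);
 *
 * The loop terminates because a start_bit past the end of the map
 * returns -1.
 */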
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
new file mode 100644
index 000000000000..1e7f57ddf7a8
--- /dev/null
+++ b/fs/xfs/xfs_bit.h
@@ -0,0 +1,85 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BIT_H__
33#define __XFS_BIT_H__
34
35/*
36 * XFS bit manipulation routines.
37 */
38
39/*
40 * masks with n high/low bits set, 32-bit values & 64-bit values
41 */
42#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI)
43__uint32_t xfs_mask32hi(int n);
44#define XFS_MASK32HI(n) xfs_mask32hi(n)
45#else
46#define XFS_MASK32HI(n) ((__uint32_t)-1 << (32 - (n)))
47#endif
48#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI)
49__uint64_t xfs_mask64hi(int n);
50#define XFS_MASK64HI(n) xfs_mask64hi(n)
51#else
52#define XFS_MASK64HI(n) ((__uint64_t)-1 << (64 - (n)))
53#endif
54#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO)
55__uint32_t xfs_mask32lo(int n);
56#define XFS_MASK32LO(n) xfs_mask32lo(n)
57#else
58#define XFS_MASK32LO(n) (((__uint32_t)1 << (n)) - 1)
59#endif
60#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO)
61__uint64_t xfs_mask64lo(int n);
62#define XFS_MASK64LO(n) xfs_mask64lo(n)
63#else
64#define XFS_MASK64LO(n) (((__uint64_t)1 << (n)) - 1)
65#endif
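/*
 * Editor's examples (not from the original header):
 *
 *	XFS_MASK32HI(8)  == 0xff000000
 *	XFS_MASK32LO(4)  == 0x0000000f
 *	XFS_MASK64HI(16) == 0xffff000000000000ULL
 *	XFS_MASK64LO(1)  == 0x0000000000000001ULL
 *
 * Note that XFS_MASK32HI(0) and XFS_MASK32LO(32) would shift by the
 * full word width, which C leaves undefined.
 */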
66
67/* Get high bit set out of 32-bit argument, -1 if none set */
68extern int xfs_highbit32(__uint32_t v);
69
70/* Get low bit set out of 64-bit argument, -1 if none set */
71extern int xfs_lowbit64(__uint64_t v);
72
73/* Get high bit set out of 64-bit argument, -1 if none set */
74extern int xfs_highbit64(__uint64_t);
75
76/* Count set bits in map starting with start_bit */
77extern int xfs_count_bits(uint *map, uint size, uint start_bit);
78
79/* Count continuous one bits in map starting with start_bit */
80extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
81
82/* Find next set bit in map */
83extern int xfs_next_bit(uint *map, uint size, uint start_bit);
84
85#endif /* __XFS_BIT_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
new file mode 100644
index 000000000000..de3162418663
--- /dev/null
+++ b/fs/xfs/xfs_bmap.c
@@ -0,0 +1,6246 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_itable.h"
58#include "xfs_extfree_item.h"
59#include "xfs_alloc.h"
60#include "xfs_bmap.h"
61#include "xfs_rtalloc.h"
62#include "xfs_error.h"
63#include "xfs_da_btree.h"
64#include "xfs_dir_leaf.h"
65#include "xfs_bit.h"
66#include "xfs_rw.h"
67#include "xfs_quota.h"
68#include "xfs_trans_space.h"
69#include "xfs_buf_item.h"
70
71
72#ifdef DEBUG
73STATIC void
74xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
75#endif
76
77kmem_zone_t *xfs_bmap_free_item_zone;
78
79/*
80 * Prototypes for internal bmap routines.
81 */
82
83
84/*
85 * Called from xfs_bmap_add_attrfork to handle extents format files.
86 */
87STATIC int /* error */
88xfs_bmap_add_attrfork_extents(
89 xfs_trans_t *tp, /* transaction pointer */
90 xfs_inode_t *ip, /* incore inode pointer */
91 xfs_fsblock_t *firstblock, /* first block allocated */
92 xfs_bmap_free_t *flist, /* blocks to free at commit */
93 int *flags); /* inode logging flags */
94
95/*
96 * Called from xfs_bmap_add_attrfork to handle local format files.
97 */
98STATIC int /* error */
99xfs_bmap_add_attrfork_local(
100 xfs_trans_t *tp, /* transaction pointer */
101 xfs_inode_t *ip, /* incore inode pointer */
102 xfs_fsblock_t *firstblock, /* first block allocated */
103 xfs_bmap_free_t *flist, /* blocks to free at commit */
104 int *flags); /* inode logging flags */
105
106/*
107 * Called by xfs_bmapi to update extent list structure and the btree
108 * after allocating space (or doing a delayed allocation).
109 */
110STATIC int /* error */
111xfs_bmap_add_extent(
112 xfs_inode_t *ip, /* incore inode pointer */
113 xfs_extnum_t idx, /* extent number to update/insert */
114 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
115 xfs_bmbt_irec_t *new, /* new data to put in extent list */
116 xfs_fsblock_t *first, /* pointer to firstblock variable */
117 xfs_bmap_free_t *flist, /* list of extents to be freed */
118 int *logflagsp, /* inode logging flags */
119 int whichfork, /* data or attr fork */
120 int rsvd); /* OK to allocate reserved blocks */
121
122/*
123 * Called by xfs_bmap_add_extent to handle cases converting a delayed
124 * allocation to a real allocation.
125 */
126STATIC int /* error */
127xfs_bmap_add_extent_delay_real(
128 xfs_inode_t *ip, /* incore inode pointer */
129 xfs_extnum_t idx, /* extent number to update/insert */
130 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
131 xfs_bmbt_irec_t *new, /* new data to put in extent list */
132 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
133 xfs_fsblock_t *first, /* pointer to firstblock variable */
134 xfs_bmap_free_t *flist, /* list of extents to be freed */
135 int *logflagsp, /* inode logging flags */
136 int rsvd); /* OK to allocate reserved blocks */
137
138/*
139 * Called by xfs_bmap_add_extent to handle cases converting a hole
140 * to a delayed allocation.
141 */
142STATIC int /* error */
143xfs_bmap_add_extent_hole_delay(
144 xfs_inode_t *ip, /* incore inode pointer */
145 xfs_extnum_t idx, /* extent number to update/insert */
146 xfs_btree_cur_t *cur, /* if null, not a btree */
147 xfs_bmbt_irec_t *new, /* new data to put in extent list */
148 int *logflagsp,/* inode logging flags */
149 int rsvd); /* OK to allocate reserved blocks */
150
151/*
152 * Called by xfs_bmap_add_extent to handle cases converting a hole
153 * to a real allocation.
154 */
155STATIC int /* error */
156xfs_bmap_add_extent_hole_real(
157 xfs_inode_t *ip, /* incore inode pointer */
158 xfs_extnum_t idx, /* extent number to update/insert */
159 xfs_btree_cur_t *cur, /* if null, not a btree */
160 xfs_bmbt_irec_t *new, /* new data to put in extent list */
161 int *logflagsp, /* inode logging flags */
162 int whichfork); /* data or attr fork */
163
164/*
165 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
166 * allocation to a real allocation or vice versa.
167 */
168STATIC int /* error */
169xfs_bmap_add_extent_unwritten_real(
170 xfs_inode_t *ip, /* incore inode pointer */
171 xfs_extnum_t idx, /* extent number to update/insert */
172 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
173 xfs_bmbt_irec_t *new, /* new data to put in extent list */
174 int *logflagsp); /* inode logging flags */
175
176/*
177 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
178 * It figures out where to ask the underlying allocator to put the new extent.
179 */
180STATIC int /* error */
181xfs_bmap_alloc(
182 xfs_bmalloca_t *ap); /* bmap alloc argument struct */
183
184/*
185 * Transform a btree format file with only one leaf node, where the
186 * extents list will fit in the inode, into an extents format file.
187 * Since the extent list is already in-core, all we have to do is
188 * give up the space for the btree root and pitch the leaf block.
189 */
190STATIC int /* error */
191xfs_bmap_btree_to_extents(
192 xfs_trans_t *tp, /* transaction pointer */
193 xfs_inode_t *ip, /* incore inode pointer */
194 xfs_btree_cur_t *cur, /* btree cursor */
195 int *logflagsp, /* inode logging flags */
196 int whichfork); /* data or attr fork */
197
198#ifdef DEBUG
199/*
200 * Check that the extents list for the inode ip is in the right order.
201 */
202STATIC void
203xfs_bmap_check_extents(
204 xfs_inode_t *ip, /* incore inode pointer */
205 int whichfork); /* data or attr fork */
206#endif
207
208/*
209 * Called by xfs_bmapi to update extent list structure and the btree
210 * after removing space (or undoing a delayed allocation).
211 */
212STATIC int /* error */
213xfs_bmap_del_extent(
214 xfs_inode_t *ip, /* incore inode pointer */
215 xfs_trans_t *tp, /* current trans pointer */
216 xfs_extnum_t idx, /* extent number to update/insert */
217 xfs_bmap_free_t *flist, /* list of extents to be freed */
218 xfs_btree_cur_t *cur, /* if null, not a btree */
219 xfs_bmbt_irec_t *new, /* new data to put in extent list */
220 int *logflagsp,/* inode logging flags */
221 int whichfork, /* data or attr fork */
222 int rsvd); /* OK to allocate reserved blocks */
223
224/*
225 * Remove the entry "free" from the free item list. Prev points to the
226 * previous entry, unless "free" is the head of the list.
227 */
228STATIC void
229xfs_bmap_del_free(
230 xfs_bmap_free_t *flist, /* free item list header */
231 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
232 xfs_bmap_free_item_t *free); /* list item to be freed */
233
234/*
235 * Remove count entries from the extents array for inode "ip", starting
236 * at index "idx". Copies the remaining items down over the deleted ones,
237 * and gives back the excess memory.
238 */
239STATIC void
240xfs_bmap_delete_exlist(
241 xfs_inode_t *ip, /* incore inode pointer */
242 xfs_extnum_t idx, /* starting delete index */
243 xfs_extnum_t count, /* count of items to delete */
244 int whichfork); /* data or attr fork */
245
246/*
247 * Convert an extents-format file into a btree-format file.
248 * The new file will have a root block (in the inode) and a single child block.
249 */
250STATIC int /* error */
251xfs_bmap_extents_to_btree(
252 xfs_trans_t *tp, /* transaction pointer */
253 xfs_inode_t *ip, /* incore inode pointer */
254 xfs_fsblock_t *firstblock, /* first-block-allocated */
255 xfs_bmap_free_t *flist, /* blocks freed in xaction */
256 xfs_btree_cur_t **curp, /* cursor returned to caller */
257 int wasdel, /* converting a delayed alloc */
258 int *logflagsp, /* inode logging flags */
259 int whichfork); /* data or attr fork */
260
261/*
262 * Insert new item(s) in the extent list for inode "ip".
263 * Count new items are inserted at offset idx.
264 */
265STATIC void
266xfs_bmap_insert_exlist(
267 xfs_inode_t *ip, /* incore inode pointer */
268 xfs_extnum_t idx, /* starting index of new items */
269 xfs_extnum_t count, /* number of inserted items */
270 xfs_bmbt_irec_t *new, /* items to insert */
271 int whichfork); /* data or attr fork */
272
273/*
274 * Convert a local file to an extents file.
275 * This code is sort of bogus, since the file data needs to get
276 * logged so it won't be lost. The bmap-level manipulations are ok, though.
277 */
278STATIC int /* error */
279xfs_bmap_local_to_extents(
280 xfs_trans_t *tp, /* transaction pointer */
281 xfs_inode_t *ip, /* incore inode pointer */
282 xfs_fsblock_t *firstblock, /* first block allocated in xaction */
283 xfs_extlen_t total, /* total blocks needed by transaction */
284 int *logflagsp, /* inode logging flags */
285 int whichfork); /* data or attr fork */
286
287/*
288 * Search the extents list for the inode, for the extent containing bno.
289 * If bno lies in a hole, point to the next entry. If bno lies past eof,
290 * *eofp will be set, and *prevp will contain the last entry (null if none).
291 * Else, *lastxp will be set to the index of the found
292 * entry; *gotp will contain the entry.
293 */
294STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
295xfs_bmap_search_extents(
296 xfs_inode_t *ip, /* incore inode pointer */
297 xfs_fileoff_t bno, /* block number searched for */
298 int whichfork, /* data or attr fork */
299 int *eofp, /* out: end of file found */
300 xfs_extnum_t *lastxp, /* out: last extent index */
301 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
302 xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */
303
304#ifdef XFS_BMAP_TRACE
305/*
306 * Add a bmap trace buffer entry. Base routine for the others.
307 */
308STATIC void
309xfs_bmap_trace_addentry(
310 int opcode, /* operation */
311 char *fname, /* function name */
312 char *desc, /* operation description */
313 xfs_inode_t *ip, /* incore inode pointer */
314 xfs_extnum_t idx, /* index of entry(ies) */
315 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
316 xfs_bmbt_rec_t *r1, /* first record */
317 xfs_bmbt_rec_t *r2, /* second record or null */
318 int whichfork); /* data or attr fork */
319
320/*
321 * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
322 */
323STATIC void
324xfs_bmap_trace_delete(
325 char *fname, /* function name */
326 char *desc, /* operation description */
327 xfs_inode_t *ip, /* incore inode pointer */
328 xfs_extnum_t idx, /* index of entry(entries) deleted */
329 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
330 int whichfork); /* data or attr fork */
331
332/*
333 * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
334 * reading in the extents list from the disk (in the btree).
335 */
336STATIC void
337xfs_bmap_trace_insert(
338 char *fname, /* function name */
339 char *desc, /* operation description */
340 xfs_inode_t *ip, /* incore inode pointer */
341 xfs_extnum_t idx, /* index of entry(entries) inserted */
342 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
343 xfs_bmbt_irec_t *r1, /* inserted record 1 */
344 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
345 int whichfork); /* data or attr fork */
346
347/*
348 * Add bmap trace entry after updating an extent list entry in place.
349 */
350STATIC void
351xfs_bmap_trace_post_update(
352 char *fname, /* function name */
353 char *desc, /* operation description */
354 xfs_inode_t *ip, /* incore inode pointer */
355 xfs_extnum_t idx, /* index of entry updated */
356 int whichfork); /* data or attr fork */
357
358/*
359 * Add bmap trace entry prior to updating an extent list entry in place.
360 */
361STATIC void
362xfs_bmap_trace_pre_update(
363 char *fname, /* function name */
364 char *desc, /* operation description */
365 xfs_inode_t *ip, /* incore inode pointer */
366 xfs_extnum_t idx, /* index of entry to be updated */
367 int whichfork); /* data or attr fork */
368
369#else
370#define xfs_bmap_trace_delete(f,d,ip,i,c,w)
371#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w)
372#define xfs_bmap_trace_post_update(f,d,ip,i,w)
373#define xfs_bmap_trace_pre_update(f,d,ip,i,w)
374#endif /* XFS_BMAP_TRACE */
375
376/*
377 * Compute the worst-case number of indirect blocks that will be used
378 * for ip's delayed extent of length "len".
379 */
380STATIC xfs_filblks_t
381xfs_bmap_worst_indlen(
382 xfs_inode_t *ip, /* incore inode pointer */
383 xfs_filblks_t len); /* delayed extent length */
384
385#ifdef DEBUG
386/*
387 * Perform various validation checks on the values being returned
388 * from xfs_bmapi().
389 */
390STATIC void
391xfs_bmap_validate_ret(
392 xfs_fileoff_t bno,
393 xfs_filblks_t len,
394 int flags,
395 xfs_bmbt_irec_t *mval,
396 int nmap,
397 int ret_nmap);
398#else
399#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
400#endif /* DEBUG */
401
402#if defined(XFS_RW_TRACE)
403STATIC void
404xfs_bunmap_trace(
405 xfs_inode_t *ip,
406 xfs_fileoff_t bno,
407 xfs_filblks_t len,
408 int flags,
409 inst_t *ra);
410#else
411#define xfs_bunmap_trace(ip, bno, len, flags, ra)
412#endif /* XFS_RW_TRACE */
413
414STATIC int
415xfs_bmap_count_tree(
416 xfs_mount_t *mp,
417 xfs_trans_t *tp,
418 xfs_fsblock_t blockno,
419 int levelin,
420 int *count);
421
422STATIC int
423xfs_bmap_count_leaves(
424 xfs_bmbt_rec_t *frp,
425 int numrecs,
426 int *count);
427
428/*
429 * Bmap internal routines.
430 */
431
432/*
433 * Called from xfs_bmap_add_attrfork to handle btree format files.
434 */
435STATIC int /* error */
436xfs_bmap_add_attrfork_btree(
437 xfs_trans_t *tp, /* transaction pointer */
438 xfs_inode_t *ip, /* incore inode pointer */
439 xfs_fsblock_t *firstblock, /* first block allocated */
440 xfs_bmap_free_t *flist, /* blocks to free at commit */
441 int *flags) /* inode logging flags */
442{
443 xfs_btree_cur_t *cur; /* btree cursor */
444 int error; /* error return value */
445 xfs_mount_t *mp; /* file system mount struct */
446 int stat; /* newroot status */
447
448 mp = ip->i_mount;
449 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
450 *flags |= XFS_ILOG_DBROOT;
451 else {
452 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
453 XFS_DATA_FORK);
454 cur->bc_private.b.flist = flist;
455 cur->bc_private.b.firstblock = *firstblock;
456 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
457 goto error0;
458 ASSERT(stat == 1); /* must be at least one entry */
459 if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
460 goto error0;
461 if (stat == 0) {
462 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
463 return XFS_ERROR(ENOSPC);
464 }
465 *firstblock = cur->bc_private.b.firstblock;
466 cur->bc_private.b.allocated = 0;
467 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
468 }
469 return 0;
470error0:
471 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
472 return error;
473}
474
475/*
476 * Called from xfs_bmap_add_attrfork to handle extents format files.
477 */
478STATIC int /* error */
479xfs_bmap_add_attrfork_extents(
480 xfs_trans_t *tp, /* transaction pointer */
481 xfs_inode_t *ip, /* incore inode pointer */
482 xfs_fsblock_t *firstblock, /* first block allocated */
483 xfs_bmap_free_t *flist, /* blocks to free at commit */
484 int *flags) /* inode logging flags */
485{
486 xfs_btree_cur_t *cur; /* bmap btree cursor */
487 int error; /* error return value */
488
489 if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
490 return 0;
491 cur = NULL;
492 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
493 flags, XFS_DATA_FORK);
494 if (cur) {
495 cur->bc_private.b.allocated = 0;
496 xfs_btree_del_cursor(cur,
497 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
498 }
499 return error;
500}
501
502/*
503 * Called from xfs_bmap_add_attrfork to handle local format files.
504 */
505STATIC int /* error */
506xfs_bmap_add_attrfork_local(
507 xfs_trans_t *tp, /* transaction pointer */
508 xfs_inode_t *ip, /* incore inode pointer */
509 xfs_fsblock_t *firstblock, /* first block allocated */
510 xfs_bmap_free_t *flist, /* blocks to free at commit */
511 int *flags) /* inode logging flags */
512{
513 xfs_da_args_t dargs; /* args for dir/attr code */
514 int error; /* error return value */
515 xfs_mount_t *mp; /* mount structure pointer */
516
517 if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
518 return 0;
519 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
520 mp = ip->i_mount;
521 memset(&dargs, 0, sizeof(dargs));
522 dargs.dp = ip;
523 dargs.firstblock = firstblock;
524 dargs.flist = flist;
525 dargs.total = mp->m_dirblkfsbs;
526 dargs.whichfork = XFS_DATA_FORK;
527 dargs.trans = tp;
528 error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
529 } else
530 error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
531 XFS_DATA_FORK);
532 return error;
533}
534
535/*
536 * Called by xfs_bmapi to update extent list structure and the btree
537 * after allocating space (or doing a delayed allocation).
538 */
539STATIC int /* error */
540xfs_bmap_add_extent(
541 xfs_inode_t *ip, /* incore inode pointer */
542 xfs_extnum_t idx, /* extent number to update/insert */
543 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
544 xfs_bmbt_irec_t *new, /* new data to put in extent list */
545 xfs_fsblock_t *first, /* pointer to firstblock variable */
546 xfs_bmap_free_t *flist, /* list of extents to be freed */
547 int *logflagsp, /* inode logging flags */
548 int whichfork, /* data or attr fork */
549 int rsvd) /* OK to use reserved data blocks */
550{
551 xfs_btree_cur_t *cur; /* btree cursor or null */
552 xfs_filblks_t da_new; /* new count del alloc blocks used */
553 xfs_filblks_t da_old; /* old count del alloc blocks used */
554 int error; /* error return value */
555#ifdef XFS_BMAP_TRACE
556 static char fname[] = "xfs_bmap_add_extent";
557#endif
558 xfs_ifork_t *ifp; /* inode fork ptr */
559 int logflags; /* returned value */
560 xfs_extnum_t nextents; /* number of extents in file now */
561
562 XFS_STATS_INC(xs_add_exlist);
563 cur = *curp;
564 ifp = XFS_IFORK_PTR(ip, whichfork);
565 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
566 ASSERT(idx <= nextents);
567 da_old = da_new = 0;
568 error = 0;
569 /*
570 * This is the first extent added to a new/empty file.
571 * Special case this one, so other routines get to assume there are
572 * already extents in the list.
573 */
574 if (nextents == 0) {
575 xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new,
576 NULL, whichfork);
577 xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork);
578 ASSERT(cur == NULL);
579 ifp->if_lastex = 0;
580 if (!ISNULLSTARTBLOCK(new->br_startblock)) {
581 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
582 logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
583 } else
584 logflags = 0;
585 }
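	/*
	 * Editor's note (not in the original source): a delayed
	 * allocation is recognized by its startblock field, which holds
	 * a NULLSTARTBLOCK() encoding of the indirect blocks reserved
	 * for it (recovered via STARTBLOCKVAL()) rather than a real
	 * disk block number.
	 */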
586 /*
587 * Any kind of new delayed allocation goes here.
588 */
589 else if (ISNULLSTARTBLOCK(new->br_startblock)) {
590 if (cur)
591 ASSERT((cur->bc_private.b.flags &
592 XFS_BTCUR_BPRV_WASDEL) == 0);
593 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
594 &logflags, rsvd)))
595 goto done;
596 }
597 /*
598 * Real allocation off the end of the file.
599 */
600 else if (idx == nextents) {
601 if (cur)
602 ASSERT((cur->bc_private.b.flags &
603 XFS_BTCUR_BPRV_WASDEL) == 0);
604 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
605 &logflags, whichfork)))
606 goto done;
607 } else {
608 xfs_bmbt_irec_t prev; /* old extent at offset idx */
609
610 /*
611 * Get the record referred to by idx.
612 */
613 xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev);
614 /*
615 * If it's a real allocation record, and the new allocation ends
616 * after the start of the referred to record, then we're filling
617 * in a delayed or unwritten allocation with a real one, or
618 * converting real back to unwritten.
619 */
620 if (!ISNULLSTARTBLOCK(new->br_startblock) &&
621 new->br_startoff + new->br_blockcount > prev.br_startoff) {
622 if (prev.br_state != XFS_EXT_UNWRITTEN &&
623 ISNULLSTARTBLOCK(prev.br_startblock)) {
624 da_old = STARTBLOCKVAL(prev.br_startblock);
625 if (cur)
626 ASSERT(cur->bc_private.b.flags &
627 XFS_BTCUR_BPRV_WASDEL);
628 if ((error = xfs_bmap_add_extent_delay_real(ip,
629 idx, &cur, new, &da_new, first, flist,
630 &logflags, rsvd)))
631 goto done;
632 } else if (new->br_state == XFS_EXT_NORM) {
633 ASSERT(new->br_state == XFS_EXT_NORM);
634 if ((error = xfs_bmap_add_extent_unwritten_real(
635 ip, idx, &cur, new, &logflags)))
636 goto done;
637 } else {
638 ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
639 if ((error = xfs_bmap_add_extent_unwritten_real(
640 ip, idx, &cur, new, &logflags)))
641 goto done;
642 }
643 ASSERT(*curp == cur || *curp == NULL);
644 }
645 /*
646 * Otherwise we're filling in a hole with an allocation.
647 */
648 else {
649 if (cur)
650 ASSERT((cur->bc_private.b.flags &
651 XFS_BTCUR_BPRV_WASDEL) == 0);
652 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
653 new, &logflags, whichfork)))
654 goto done;
655 }
656 }
657
658 ASSERT(*curp == cur || *curp == NULL);
659 /*
660 * Convert to a btree if necessary.
661 */
662 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
663 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
664 int tmp_logflags; /* partial log flag return val */
665
666 ASSERT(cur == NULL);
667 error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
668 flist, &cur, da_old > 0, &tmp_logflags, whichfork);
669 logflags |= tmp_logflags;
670 if (error)
671 goto done;
672 }
673 /*
674 * Adjust for changes in reserved delayed indirect blocks.
675 * Nothing to do for disk quotas here.
676 */
677 if (da_old || da_new) {
678 xfs_filblks_t nblks;
679
680 nblks = da_new;
681 if (cur)
682 nblks += cur->bc_private.b.allocated;
683 ASSERT(nblks <= da_old);
684 if (nblks < da_old)
685 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
686 (int)(da_old - nblks), rsvd);
687 }
688 /*
689 * Clear out the allocated field, done with it now in any case.
690 */
691 if (cur) {
692 cur->bc_private.b.allocated = 0;
693 *curp = cur;
694 }
695done:
696#ifdef DEBUG
697 if (!error)
698 xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
699#endif
700 *logflagsp = logflags;
701 return error;
702}
703
704/*
705 * Called by xfs_bmap_add_extent to handle cases converting a delayed
706 * allocation to a real allocation.
707 */
708STATIC int /* error */
709xfs_bmap_add_extent_delay_real(
710 xfs_inode_t *ip, /* incore inode pointer */
711 xfs_extnum_t idx, /* extent number to update/insert */
712 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
713 xfs_bmbt_irec_t *new, /* new data to put in extent list */
714 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
715 xfs_fsblock_t *first, /* pointer to firstblock variable */
716 xfs_bmap_free_t *flist, /* list of extents to be freed */
717 int *logflagsp, /* inode logging flags */
718 int rsvd) /* OK to use reserved data block allocation */
719{
720 xfs_bmbt_rec_t *base; /* base of extent entry list */
721 xfs_btree_cur_t *cur; /* btree cursor */
722 int diff; /* temp value */
723 xfs_bmbt_rec_t *ep; /* extent entry for idx */
724 int error; /* error return value */
725#ifdef XFS_BMAP_TRACE
726 static char fname[] = "xfs_bmap_add_extent_delay_real";
727#endif
728 int i; /* temp state */
729 xfs_fileoff_t new_endoff; /* end offset of new entry */
730 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
731 /* left is 0, right is 1, prev is 2 */
732 int rval=0; /* return value (logging flags) */
733 int state = 0;/* state bits, accessed thru macros */
734 xfs_filblks_t temp; /* value for dnew calculations */
735 xfs_filblks_t temp2; /* value for dnew calculations */
736 int tmp_rval; /* partial logging flags */
737 enum { /* bit number definitions for state */
738 LEFT_CONTIG, RIGHT_CONTIG,
739 LEFT_FILLING, RIGHT_FILLING,
740 LEFT_DELAY, RIGHT_DELAY,
741 LEFT_VALID, RIGHT_VALID
742 };
743
744#define LEFT r[0]
745#define RIGHT r[1]
746#define PREV r[2]
747#define MASK(b) (1 << (b))
748#define MASK2(a,b) (MASK(a) | MASK(b))
749#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
750#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
751#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
752#define STATE_TEST(b) (state & MASK(b))
753#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
754 ((state &= ~MASK(b)), 0))
755#define SWITCH_STATE \
756 (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
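	/*
	 * Editor's note (not in the original source): SWITCH_STATE packs
	 * the four FILLING/CONTIG bits into the value the switch below
	 * dispatches on.  For example, a real extent that exactly covers
	 * the old delayed extent and merges with both neighbors selects
	 * the MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG,
	 * RIGHT_CONTIG) case, while one strictly inside the delayed
	 * extent sets none of the four bits and lands in case 0.
	 */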
757
758 /*
759 * Set up a bunch of variables to make the tests simpler.
760 */
761 cur = *curp;
762 base = ip->i_df.if_u1.if_extents;
763 ep = &base[idx];
764 xfs_bmbt_get_all(ep, &PREV);
765 new_endoff = new->br_startoff + new->br_blockcount;
766 ASSERT(PREV.br_startoff <= new->br_startoff);
767 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
768 /*
769 * Set flags determining what part of the previous delayed allocation
770 * extent is being replaced by a real allocation.
771 */
772 STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
773 STATE_SET(RIGHT_FILLING,
774 PREV.br_startoff + PREV.br_blockcount == new_endoff);
775 /*
776 * Check and set flags if this segment has a left neighbor.
777 * Don't set contiguous if the combined extent would be too large.
778 */
779 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
780 xfs_bmbt_get_all(ep - 1, &LEFT);
781 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
782 }
783 STATE_SET(LEFT_CONTIG,
784 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
785 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
786 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
787 LEFT.br_state == new->br_state &&
788 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
789 /*
790 * Check and set flags if this segment has a right neighbor.
791 * Don't set contiguous if the combined extent would be too large.
792 * Also check for all-three-contiguous being too large.
793 */
794 if (STATE_SET_TEST(RIGHT_VALID,
795 idx <
796 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
797 xfs_bmbt_get_all(ep + 1, &RIGHT);
798 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
799 }
800 STATE_SET(RIGHT_CONTIG,
801 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
802 new_endoff == RIGHT.br_startoff &&
803 new->br_startblock + new->br_blockcount ==
804 RIGHT.br_startblock &&
805 new->br_state == RIGHT.br_state &&
806 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
807 ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
808 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
809 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
810 <= MAXEXTLEN));
811 error = 0;
812 /*
813 * Switch out based on the FILLING and CONTIG state bits.
814 */
815 switch (SWITCH_STATE) {
816
817 case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
818 /*
819 * Filling in all of a previously delayed allocation extent.
820 * The left and right neighbors are both contiguous with new.
821 */
822 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
823 XFS_DATA_FORK);
824 xfs_bmbt_set_blockcount(ep - 1,
825 LEFT.br_blockcount + PREV.br_blockcount +
826 RIGHT.br_blockcount);
827 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
828 XFS_DATA_FORK);
829 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
830 XFS_DATA_FORK);
831 xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
832 ip->i_df.if_lastex = idx - 1;
833 ip->i_d.di_nextents--;
834 if (cur == NULL)
835 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
836 else {
837 rval = XFS_ILOG_CORE;
838 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
839 RIGHT.br_startblock,
840 RIGHT.br_blockcount, &i)))
841 goto done;
842 ASSERT(i == 1);
843 if ((error = xfs_bmbt_delete(cur, &i)))
844 goto done;
845 ASSERT(i == 1);
846 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
847 goto done;
848 ASSERT(i == 1);
849 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
850 LEFT.br_startblock,
851 LEFT.br_blockcount +
852 PREV.br_blockcount +
853 RIGHT.br_blockcount, LEFT.br_state)))
854 goto done;
855 }
856 *dnew = 0;
857 break;
858
859 case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
860 /*
861 * Filling in all of a previously delayed allocation extent.
862 * The left neighbor is contiguous, the right is not.
863 */
864 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
865 XFS_DATA_FORK);
866 xfs_bmbt_set_blockcount(ep - 1,
867 LEFT.br_blockcount + PREV.br_blockcount);
868 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
869 XFS_DATA_FORK);
870 ip->i_df.if_lastex = idx - 1;
871 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
872 XFS_DATA_FORK);
873 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
874 if (cur == NULL)
875 rval = XFS_ILOG_DEXT;
876 else {
877 rval = 0;
878 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
879 LEFT.br_startblock, LEFT.br_blockcount,
880 &i)))
881 goto done;
882 ASSERT(i == 1);
883 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
884 LEFT.br_startblock,
885 LEFT.br_blockcount +
886 PREV.br_blockcount, LEFT.br_state)))
887 goto done;
888 }
889 *dnew = 0;
890 break;
891
892 case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
893 /*
894 * Filling in all of a previously delayed allocation extent.
895 * The right neighbor is contiguous, the left is not.
896 */
897 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
898 XFS_DATA_FORK);
899 xfs_bmbt_set_startblock(ep, new->br_startblock);
900 xfs_bmbt_set_blockcount(ep,
901 PREV.br_blockcount + RIGHT.br_blockcount);
902 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
903 XFS_DATA_FORK);
904 ip->i_df.if_lastex = idx;
905 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
906 XFS_DATA_FORK);
907 xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
908 if (cur == NULL)
909 rval = XFS_ILOG_DEXT;
910 else {
911 rval = 0;
912 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
913 RIGHT.br_startblock,
914 RIGHT.br_blockcount, &i)))
915 goto done;
916 ASSERT(i == 1);
917 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
918 new->br_startblock,
919 PREV.br_blockcount +
920 RIGHT.br_blockcount, PREV.br_state)))
921 goto done;
922 }
923 *dnew = 0;
924 break;
925
926 case MASK2(LEFT_FILLING, RIGHT_FILLING):
927 /*
928 * Filling in all of a previously delayed allocation extent.
929 * Neither the left nor right neighbors are contiguous with
930 * the new one.
931 */
932 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
933 XFS_DATA_FORK);
934 xfs_bmbt_set_startblock(ep, new->br_startblock);
935 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
936 XFS_DATA_FORK);
937 ip->i_df.if_lastex = idx;
938 ip->i_d.di_nextents++;
939 if (cur == NULL)
940 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
941 else {
942 rval = XFS_ILOG_CORE;
943 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
944 new->br_startblock, new->br_blockcount,
945 &i)))
946 goto done;
947 ASSERT(i == 0);
948 cur->bc_rec.b.br_state = XFS_EXT_NORM;
949 if ((error = xfs_bmbt_insert(cur, &i)))
950 goto done;
951 ASSERT(i == 1);
952 }
953 *dnew = 0;
954 break;
955
956 case MASK2(LEFT_FILLING, LEFT_CONTIG):
957 /*
958 * Filling in the first part of a previous delayed allocation.
959 * The left neighbor is contiguous.
960 */
961 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
962 XFS_DATA_FORK);
963 xfs_bmbt_set_blockcount(ep - 1,
964 LEFT.br_blockcount + new->br_blockcount);
965 xfs_bmbt_set_startoff(ep,
966 PREV.br_startoff + new->br_blockcount);
967 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
968 XFS_DATA_FORK);
969 temp = PREV.br_blockcount - new->br_blockcount;
970 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
971 XFS_DATA_FORK);
972 xfs_bmbt_set_blockcount(ep, temp);
973 ip->i_df.if_lastex = idx - 1;
974 if (cur == NULL)
975 rval = XFS_ILOG_DEXT;
976 else {
977 rval = 0;
978 if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
979 LEFT.br_startblock, LEFT.br_blockcount,
980 &i)))
981 goto done;
982 ASSERT(i == 1);
983 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
984 LEFT.br_startblock,
985 LEFT.br_blockcount +
986 new->br_blockcount,
987 LEFT.br_state)))
988 goto done;
989 }
990 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
991 STARTBLOCKVAL(PREV.br_startblock));
992 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
993 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
994 XFS_DATA_FORK);
995 *dnew = temp;
996 break;
997
998 case MASK(LEFT_FILLING):
999 /*
1000 * Filling in the first part of a previous delayed allocation.
1001 * The left neighbor is not contiguous.
1002 */
1003 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1004 xfs_bmbt_set_startoff(ep, new_endoff);
1005 temp = PREV.br_blockcount - new->br_blockcount;
1006 xfs_bmbt_set_blockcount(ep, temp);
1007 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
1008 XFS_DATA_FORK);
1009 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1010 ip->i_df.if_lastex = idx;
1011 ip->i_d.di_nextents++;
1012 if (cur == NULL)
1013 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1014 else {
1015 rval = XFS_ILOG_CORE;
1016 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1017 new->br_startblock, new->br_blockcount,
1018 &i)))
1019 goto done;
1020 ASSERT(i == 0);
1021 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1022 if ((error = xfs_bmbt_insert(cur, &i)))
1023 goto done;
1024 ASSERT(i == 1);
1025 }
1026 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1027 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1028 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1029 first, flist, &cur, 1, &tmp_rval,
1030 XFS_DATA_FORK);
1031 rval |= tmp_rval;
1032 if (error)
1033 goto done;
1034 }
1035 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1036 STARTBLOCKVAL(PREV.br_startblock) -
1037 (cur ? cur->bc_private.b.allocated : 0));
1038 base = ip->i_df.if_u1.if_extents;
1039 ep = &base[idx + 1];
1040 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1041 xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
1042 XFS_DATA_FORK);
1043 *dnew = temp;
1044 break;
1045
1046 case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
1047 /*
1048 * Filling in the last part of a previous delayed allocation.
1049 * The right neighbor is contiguous with the new allocation.
1050 */
1051 temp = PREV.br_blockcount - new->br_blockcount;
1052 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
1053 XFS_DATA_FORK);
1054 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
1055 XFS_DATA_FORK);
1056 xfs_bmbt_set_blockcount(ep, temp);
1057 xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
1058 new->br_blockcount + RIGHT.br_blockcount,
1059 RIGHT.br_state);
1060 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
1061 XFS_DATA_FORK);
1062 ip->i_df.if_lastex = idx + 1;
1063 if (cur == NULL)
1064 rval = XFS_ILOG_DEXT;
1065 else {
1066 rval = 0;
1067 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1068 RIGHT.br_startblock,
1069 RIGHT.br_blockcount, &i)))
1070 goto done;
1071 ASSERT(i == 1);
1072 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1073 new->br_startblock,
1074 new->br_blockcount +
1075 RIGHT.br_blockcount,
1076 RIGHT.br_state)))
1077 goto done;
1078 }
1079 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1080 STARTBLOCKVAL(PREV.br_startblock));
1081 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1082 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
1083 XFS_DATA_FORK);
1084 *dnew = temp;
1085 break;
1086
1087 case MASK(RIGHT_FILLING):
1088 /*
1089 * Filling in the last part of a previous delayed allocation.
1090 * The right neighbor is not contiguous.
1091 */
1092 temp = PREV.br_blockcount - new->br_blockcount;
1093 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1094 xfs_bmbt_set_blockcount(ep, temp);
1095 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
1096 new, NULL, XFS_DATA_FORK);
1097 xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
1098 ip->i_df.if_lastex = idx + 1;
1099 ip->i_d.di_nextents++;
1100 if (cur == NULL)
1101 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1102 else {
1103 rval = XFS_ILOG_CORE;
1104 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1105 new->br_startblock, new->br_blockcount,
1106 &i)))
1107 goto done;
1108 ASSERT(i == 0);
1109 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1110 if ((error = xfs_bmbt_insert(cur, &i)))
1111 goto done;
1112 ASSERT(i == 1);
1113 }
1114 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1115 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1116 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1117 first, flist, &cur, 1, &tmp_rval,
1118 XFS_DATA_FORK);
1119 rval |= tmp_rval;
1120 if (error)
1121 goto done;
1122 }
1123 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1124 STARTBLOCKVAL(PREV.br_startblock) -
1125 (cur ? cur->bc_private.b.allocated : 0));
1126 base = ip->i_df.if_u1.if_extents;
1127 ep = &base[idx];
1128 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1129 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1130 *dnew = temp;
1131 break;
1132
1133 case 0:
1134 /*
1135 * Filling in the middle part of a previous delayed allocation.
1136 * Contiguity is impossible here.
1137 * This case is avoided almost all the time.
1138 */
1139 temp = new->br_startoff - PREV.br_startoff;
1140 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
1141 xfs_bmbt_set_blockcount(ep, temp);
1142 r[0] = *new;
1143 r[1].br_startoff = new_endoff;
1144 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1145 r[1].br_blockcount = temp2;
1146 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
1147 XFS_DATA_FORK);
1148 xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
1149 ip->i_df.if_lastex = idx + 1;
1150 ip->i_d.di_nextents++;
1151 if (cur == NULL)
1152 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1153 else {
1154 rval = XFS_ILOG_CORE;
1155 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1156 new->br_startblock, new->br_blockcount,
1157 &i)))
1158 goto done;
1159 ASSERT(i == 0);
1160 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1161 if ((error = xfs_bmbt_insert(cur, &i)))
1162 goto done;
1163 ASSERT(i == 1);
1164 }
1165 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1166 ip->i_d.di_nextents > ip->i_df.if_ext_max) {
1167 error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
1168 first, flist, &cur, 1, &tmp_rval,
1169 XFS_DATA_FORK);
1170 rval |= tmp_rval;
1171 if (error)
1172 goto done;
1173 }
1174 temp = xfs_bmap_worst_indlen(ip, temp);
1175 temp2 = xfs_bmap_worst_indlen(ip, temp2);
1176 diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
1177 (cur ? cur->bc_private.b.allocated : 0));
1178 if (diff > 0 &&
1179 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) {
1180 /*
1181 * Ick gross gag me with a spoon.
1182 */
1183 ASSERT(0); /* want to see if this ever happens! */
1184 while (diff > 0) {
1185 if (temp) {
1186 temp--;
1187 diff--;
1188 if (!diff ||
1189 !xfs_mod_incore_sb(ip->i_mount,
1190 XFS_SBS_FDBLOCKS, -diff, rsvd))
1191 break;
1192 }
1193 if (temp2) {
1194 temp2--;
1195 diff--;
1196 if (!diff ||
1197 !xfs_mod_incore_sb(ip->i_mount,
1198 XFS_SBS_FDBLOCKS, -diff, rsvd))
1199 break;
1200 }
1201 }
1202 }
1203 base = ip->i_df.if_u1.if_extents;
1204 ep = &base[idx];
1205 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
1206 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
1207 xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2,
1208 XFS_DATA_FORK);
1209 xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2));
1210 xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
1211 XFS_DATA_FORK);
1212 *dnew = temp + temp2;
1213 break;
1214
1215 case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1216 case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1217 case MASK2(LEFT_FILLING, RIGHT_CONTIG):
1218 case MASK2(RIGHT_FILLING, LEFT_CONTIG):
1219 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1220 case MASK(LEFT_CONTIG):
1221 case MASK(RIGHT_CONTIG):
1222 /*
1223 * These cases are all impossible.
1224 */
1225 ASSERT(0);
1226 }
1227 *curp = cur;
1228done:
1229 *logflagsp = rval;
1230 return error;
1231#undef LEFT
1232#undef RIGHT
1233#undef PREV
1234#undef MASK
1235#undef MASK2
1236#undef MASK3
1237#undef MASK4
1238#undef STATE_SET
1239#undef STATE_TEST
1240#undef STATE_SET_TEST
1241#undef SWITCH_STATE
1242}
1243
1244/*
1245 * Called by xfs_bmap_add_extent to handle cases converting an unwritten
1246 * allocation to a real allocation or vice versa.
1247 */
1248STATIC int /* error */
1249xfs_bmap_add_extent_unwritten_real(
1250 xfs_inode_t *ip, /* incore inode pointer */
1251 xfs_extnum_t idx, /* extent number to update/insert */
1252 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1253 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1254 int *logflagsp) /* inode logging flags */
1255{
1256 xfs_bmbt_rec_t *base; /* base of extent entry list */
1257 xfs_btree_cur_t *cur; /* btree cursor */
1258 xfs_bmbt_rec_t *ep; /* extent entry for idx */
1259 int error; /* error return value */
1260#ifdef XFS_BMAP_TRACE
1261 static char fname[] = "xfs_bmap_add_extent_unwritten_real";
1262#endif
1263 int i; /* temp state */
1264 xfs_fileoff_t new_endoff; /* end offset of new entry */
1265 xfs_exntst_t newext; /* new extent state */
1266 xfs_exntst_t oldext; /* old extent state */
1267 xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
1268 /* left is 0, right is 1, prev is 2 */
1269 int rval=0; /* return value (logging flags) */
1270 int state = 0;/* state bits, accessed thru macros */
1271 enum { /* bit number definitions for state */
1272 LEFT_CONTIG, RIGHT_CONTIG,
1273 LEFT_FILLING, RIGHT_FILLING,
1274 LEFT_DELAY, RIGHT_DELAY,
1275 LEFT_VALID, RIGHT_VALID
1276 };
1277
1278#define LEFT r[0]
1279#define RIGHT r[1]
1280#define PREV r[2]
1281#define MASK(b) (1 << (b))
1282#define MASK2(a,b) (MASK(a) | MASK(b))
1283#define MASK3(a,b,c) (MASK2(a,b) | MASK(c))
1284#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d))
1285#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1286#define STATE_TEST(b) (state & MASK(b))
1287#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1288 ((state &= ~MASK(b)), 0))
1289#define SWITCH_STATE \
1290 (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
1291
1292 /*
1293 * Set up a bunch of variables to make the tests simpler.
1294 */
1295 error = 0;
1296 cur = *curp;
1297 base = ip->i_df.if_u1.if_extents;
1298 ep = &base[idx];
1299 xfs_bmbt_get_all(ep, &PREV);
1300 newext = new->br_state;
1301 oldext = (newext == XFS_EXT_UNWRITTEN) ?
1302 XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
1303 ASSERT(PREV.br_state == oldext);
1304 new_endoff = new->br_startoff + new->br_blockcount;
1305 ASSERT(PREV.br_startoff <= new->br_startoff);
1306 ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
1307 /*
1308 * Set flags determining what part of the previous oldext allocation
1309 * extent is being replaced by a newext allocation.
1310 */
1311 STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
1312 STATE_SET(RIGHT_FILLING,
1313 PREV.br_startoff + PREV.br_blockcount == new_endoff);
1314 /*
1315 * Check and set flags if this segment has a left neighbor.
1316 * Don't set contiguous if the combined extent would be too large.
1317 */
1318 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1319 xfs_bmbt_get_all(ep - 1, &LEFT);
1320 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
1321 }
1322 STATE_SET(LEFT_CONTIG,
1323 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
1324 LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
1325 LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
1326 LEFT.br_state == newext &&
1327 LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1328 /*
1329 * Check and set flags if this segment has a right neighbor.
1330 * Don't set contiguous if the combined extent would be too large.
1331 * Also check for all-three-contiguous being too large.
1332 */
1333 if (STATE_SET_TEST(RIGHT_VALID,
1334 idx <
1335 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
1336 xfs_bmbt_get_all(ep + 1, &RIGHT);
1337 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
1338 }
1339 STATE_SET(RIGHT_CONTIG,
1340 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
1341 new_endoff == RIGHT.br_startoff &&
1342 new->br_startblock + new->br_blockcount ==
1343 RIGHT.br_startblock &&
1344 newext == RIGHT.br_state &&
1345 new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
1346 ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
1347 MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
1348 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
1349 <= MAXEXTLEN));
1350 /*
1351 * Switch out based on the FILLING and CONTIG state bits.
1352 */
1353 switch (SWITCH_STATE) {
1354
1355 case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1356 /*
1357 * Setting all of a previous oldext extent to newext.
1358 * The left and right neighbors are both contiguous with new.
1359 */
1360 xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
1361 XFS_DATA_FORK);
1362 xfs_bmbt_set_blockcount(ep - 1,
1363 LEFT.br_blockcount + PREV.br_blockcount +
1364 RIGHT.br_blockcount);
1365 xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
1366 XFS_DATA_FORK);
1367 xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
1368 XFS_DATA_FORK);
1369 xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
1370 ip->i_df.if_lastex = idx - 1;
1371 ip->i_d.di_nextents -= 2;
1372 if (cur == NULL)
1373 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1374 else {
1375 rval = XFS_ILOG_CORE;
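			/*
			 * Three btree records (LEFT, PREV and RIGHT)
			 * collapse into one: position the cursor at
			 * RIGHT and delete it, back up to the old PREV
			 * and delete that too, then back up once more
			 * and rewrite LEFT to cover all three ranges.
			 */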
1376 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1377 RIGHT.br_startblock,
1378 RIGHT.br_blockcount, &i)))
1379 goto done;
1380 ASSERT(i == 1);
1381 if ((error = xfs_bmbt_delete(cur, &i)))
1382 goto done;
1383 ASSERT(i == 1);
1384 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1385 goto done;
1386 ASSERT(i == 1);
1387 if ((error = xfs_bmbt_delete(cur, &i)))
1388 goto done;
1389 ASSERT(i == 1);
1390 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1391 goto done;
1392 ASSERT(i == 1);
1393 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1394 LEFT.br_startblock,
1395 LEFT.br_blockcount + PREV.br_blockcount +
1396 RIGHT.br_blockcount, LEFT.br_state)))
1397 goto done;
1398 }
1399 break;
1400
1401 case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
1402 /*
1403 * Setting all of a previous oldext extent to newext.
1404 * The left neighbor is contiguous, the right is not.
1405 */
1406 xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
1407 XFS_DATA_FORK);
1408 xfs_bmbt_set_blockcount(ep - 1,
1409 LEFT.br_blockcount + PREV.br_blockcount);
1410 xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
1411 XFS_DATA_FORK);
1412 ip->i_df.if_lastex = idx - 1;
1413 xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
1414 XFS_DATA_FORK);
1415 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
1416 ip->i_d.di_nextents--;
1417 if (cur == NULL)
1418 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1419 else {
1420 rval = XFS_ILOG_CORE;
1421 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1422 PREV.br_startblock, PREV.br_blockcount,
1423 &i)))
1424 goto done;
1425 ASSERT(i == 1);
1426 if ((error = xfs_bmbt_delete(cur, &i)))
1427 goto done;
1428 ASSERT(i == 1);
1429 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1430 goto done;
1431 ASSERT(i == 1);
1432 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1433 LEFT.br_startblock,
1434 LEFT.br_blockcount + PREV.br_blockcount,
1435 LEFT.br_state)))
1436 goto done;
1437 }
1438 break;
1439
1440 case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
1441 /*
1442 * Setting all of a previous oldext extent to newext.
1443 * The right neighbor is contiguous, the left is not.
1444 */
1445 xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
1446 XFS_DATA_FORK);
1447 xfs_bmbt_set_blockcount(ep,
1448 PREV.br_blockcount + RIGHT.br_blockcount);
1449 xfs_bmbt_set_state(ep, newext);
1450 xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
1451 XFS_DATA_FORK);
1452 ip->i_df.if_lastex = idx;
1453 xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
1454 XFS_DATA_FORK);
1455 xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
1456 ip->i_d.di_nextents--;
1457 if (cur == NULL)
1458 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1459 else {
1460 rval = XFS_ILOG_CORE;
1461 if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
1462 RIGHT.br_startblock,
1463 RIGHT.br_blockcount, &i)))
1464 goto done;
1465 ASSERT(i == 1);
1466 if ((error = xfs_bmbt_delete(cur, &i)))
1467 goto done;
1468 ASSERT(i == 1);
1469 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1470 goto done;
1471 ASSERT(i == 1);
1472 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1473 new->br_startblock,
1474 new->br_blockcount + RIGHT.br_blockcount,
1475 newext)))
1476 goto done;
1477 }
1478 break;
1479
1480 case MASK2(LEFT_FILLING, RIGHT_FILLING):
1481 /*
1482 * Setting all of a previous oldext extent to newext.
1483 * Neither the left nor right neighbors are contiguous with
1484 * the new one.
1485 */
1486 xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
1487 XFS_DATA_FORK);
1488 xfs_bmbt_set_state(ep, newext);
1489 xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
1490 XFS_DATA_FORK);
1491 ip->i_df.if_lastex = idx;
1492 if (cur == NULL)
1493 rval = XFS_ILOG_DEXT;
1494 else {
1495 rval = 0;
1496 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1497 new->br_startblock, new->br_blockcount,
1498 &i)))
1499 goto done;
1500 ASSERT(i == 1);
1501 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1502 new->br_startblock, new->br_blockcount,
1503 newext)))
1504 goto done;
1505 }
1506 break;
1507
1508 case MASK2(LEFT_FILLING, LEFT_CONTIG):
1509 /*
1510 * Setting the first part of a previous oldext extent to newext.
1511 * The left neighbor is contiguous.
1512 */
1513 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
1514 XFS_DATA_FORK);
1515 xfs_bmbt_set_blockcount(ep - 1,
1516 LEFT.br_blockcount + new->br_blockcount);
1517 xfs_bmbt_set_startoff(ep,
1518 PREV.br_startoff + new->br_blockcount);
1519 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
1520 XFS_DATA_FORK);
1521 xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
1522 XFS_DATA_FORK);
1523 xfs_bmbt_set_startblock(ep,
1524 new->br_startblock + new->br_blockcount);
1525 xfs_bmbt_set_blockcount(ep,
1526 PREV.br_blockcount - new->br_blockcount);
1527 xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
1528 XFS_DATA_FORK);
1529 ip->i_df.if_lastex = idx - 1;
1530 if (cur == NULL)
1531 rval = XFS_ILOG_DEXT;
1532 else {
1533 rval = 0;
1534 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1535 PREV.br_startblock, PREV.br_blockcount,
1536 &i)))
1537 goto done;
1538 ASSERT(i == 1);
1539 if ((error = xfs_bmbt_update(cur,
1540 PREV.br_startoff + new->br_blockcount,
1541 PREV.br_startblock + new->br_blockcount,
1542 PREV.br_blockcount - new->br_blockcount,
1543 oldext)))
1544 goto done;
1545 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
1546 goto done;
1547			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
1548				LEFT.br_startblock,
1549				LEFT.br_blockcount + new->br_blockcount,
1550				LEFT.br_state)))
1551				goto done;
1552 }
1553 break;
1554
1555 case MASK(LEFT_FILLING):
1556 /*
1557 * Setting the first part of a previous oldext extent to newext.
1558 * The left neighbor is not contiguous.
1559 */
1560 xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1561 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1562 xfs_bmbt_set_startoff(ep, new_endoff);
1563 xfs_bmbt_set_blockcount(ep,
1564 PREV.br_blockcount - new->br_blockcount);
1565 xfs_bmbt_set_startblock(ep,
1566 new->br_startblock + new->br_blockcount);
1567 xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK);
1568 xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
1569 XFS_DATA_FORK);
1570 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1571 ip->i_df.if_lastex = idx;
1572 ip->i_d.di_nextents++;
1573 if (cur == NULL)
1574 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1575 else {
1576 rval = XFS_ILOG_CORE;
1577 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1578 PREV.br_startblock, PREV.br_blockcount,
1579 &i)))
1580 goto done;
1581 ASSERT(i == 1);
1582 if ((error = xfs_bmbt_update(cur,
1583 PREV.br_startoff + new->br_blockcount,
1584 PREV.br_startblock + new->br_blockcount,
1585 PREV.br_blockcount - new->br_blockcount,
1586 oldext)))
1587 goto done;
1588 cur->bc_rec.b = *new;
1589 if ((error = xfs_bmbt_insert(cur, &i)))
1590 goto done;
1591 ASSERT(i == 1);
1592 }
1593 break;
1594
1595 case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
1596 /*
1597 * Setting the last part of a previous oldext extent to newext.
1598 * The right neighbor is contiguous with the new allocation.
1599 */
1600 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
1601 XFS_DATA_FORK);
1602 xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
1603 XFS_DATA_FORK);
1604 xfs_bmbt_set_blockcount(ep,
1605 PREV.br_blockcount - new->br_blockcount);
1606 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
1607 XFS_DATA_FORK);
1608 xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
1609 new->br_blockcount + RIGHT.br_blockcount, newext);
1610 xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
1611 XFS_DATA_FORK);
1612 ip->i_df.if_lastex = idx + 1;
1613 if (cur == NULL)
1614 rval = XFS_ILOG_DEXT;
1615 else {
1616 rval = 0;
1617 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1618 PREV.br_startblock,
1619 PREV.br_blockcount, &i)))
1620 goto done;
1621 ASSERT(i == 1);
1622 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
1623 PREV.br_startblock,
1624 PREV.br_blockcount - new->br_blockcount,
1625 oldext)))
1626 goto done;
1627 if ((error = xfs_bmbt_increment(cur, 0, &i)))
1628 goto done;
1629 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1630 new->br_startblock,
1631 new->br_blockcount + RIGHT.br_blockcount,
1632 newext)))
1633 goto done;
1634 }
1635 break;
1636
1637 case MASK(RIGHT_FILLING):
1638 /*
1639 * Setting the last part of a previous oldext extent to newext.
1640 * The right neighbor is not contiguous.
1641 */
1642 xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1643 xfs_bmbt_set_blockcount(ep,
1644 PREV.br_blockcount - new->br_blockcount);
1645 xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
1646 xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
1647 new, NULL, XFS_DATA_FORK);
1648 xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
1649 ip->i_df.if_lastex = idx + 1;
1650 ip->i_d.di_nextents++;
1651 if (cur == NULL)
1652 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1653 else {
1654 rval = XFS_ILOG_CORE;
1655 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1656 PREV.br_startblock, PREV.br_blockcount,
1657 &i)))
1658 goto done;
1659 ASSERT(i == 1);
1660 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
1661 PREV.br_startblock,
1662 PREV.br_blockcount - new->br_blockcount,
1663 oldext)))
1664 goto done;
1665 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
1666 new->br_startblock, new->br_blockcount,
1667 &i)))
1668 goto done;
1669 ASSERT(i == 0);
1670 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1671 if ((error = xfs_bmbt_insert(cur, &i)))
1672 goto done;
1673 ASSERT(i == 1);
1674 }
1675 break;
1676
1677 case 0:
1678 /*
1679 * Setting the middle part of a previous oldext extent to
1680 * newext. Contiguity is impossible here.
1681 * One extent becomes three extents.
1682 */
1683 xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
1684 xfs_bmbt_set_blockcount(ep,
1685 new->br_startoff - PREV.br_startoff);
1686 xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
1687 r[0] = *new;
1688 r[1].br_startoff = new_endoff;
1689 r[1].br_blockcount =
1690 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1691 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1692 r[1].br_state = oldext;
1693 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
1694 XFS_DATA_FORK);
1695 xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
1696 ip->i_df.if_lastex = idx + 1;
1697 ip->i_d.di_nextents += 2;
1698 if (cur == NULL)
1699 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
1700 else {
1701 rval = XFS_ILOG_CORE;
1702 if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
1703 PREV.br_startblock, PREV.br_blockcount,
1704 &i)))
1705 goto done;
1706 ASSERT(i == 1);
1707 /* new right extent - oldext */
1708 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
1709 r[1].br_startblock, r[1].br_blockcount,
1710 r[1].br_state)))
1711 goto done;
1712 /* new left extent - oldext */
1713 PREV.br_blockcount =
1714 new->br_startoff - PREV.br_startoff;
1715 cur->bc_rec.b = PREV;
1716 if ((error = xfs_bmbt_insert(cur, &i)))
1717 goto done;
1718 ASSERT(i == 1);
1719 if ((error = xfs_bmbt_increment(cur, 0, &i)))
1720 goto done;
1721 ASSERT(i == 1);
1722 /* new middle extent - newext */
1723 cur->bc_rec.b = *new;
1724 if ((error = xfs_bmbt_insert(cur, &i)))
1725 goto done;
1726 ASSERT(i == 1);
1727 }
1728 break;
1729
1730 case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1731 case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
1732 case MASK2(LEFT_FILLING, RIGHT_CONTIG):
1733 case MASK2(RIGHT_FILLING, LEFT_CONTIG):
1734 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1735 case MASK(LEFT_CONTIG):
1736 case MASK(RIGHT_CONTIG):
1737 /*
1738 * These cases are all impossible.
1739 */
1740 ASSERT(0);
1741 }
1742 *curp = cur;
1743done:
1744 *logflagsp = rval;
1745 return error;
1746#undef LEFT
1747#undef RIGHT
1748#undef PREV
1749#undef MASK
1750#undef MASK2
1751#undef MASK3
1752#undef MASK4
1753#undef STATE_SET
1754#undef STATE_TEST
1755#undef STATE_SET_TEST
1756#undef SWITCH_STATE
1757}
1758
1759/*
1760 * Called by xfs_bmap_add_extent to handle cases converting a hole
1761 * to a delayed allocation.
1762 */
1763/*ARGSUSED*/
1764STATIC int /* error */
1765xfs_bmap_add_extent_hole_delay(
1766 xfs_inode_t *ip, /* incore inode pointer */
1767 xfs_extnum_t idx, /* extent number to update/insert */
1768 xfs_btree_cur_t *cur, /* if null, not a btree */
1769 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1770 int *logflagsp, /* inode logging flags */
1771 int rsvd) /* OK to allocate reserved blocks */
1772{
1773 xfs_bmbt_rec_t *base; /* base of extent entry list */
1774 xfs_bmbt_rec_t *ep; /* extent list entry for idx */
1775#ifdef XFS_BMAP_TRACE
1776 static char fname[] = "xfs_bmap_add_extent_hole_delay";
1777#endif
1778 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1779 xfs_filblks_t newlen=0; /* new indirect size */
1780 xfs_filblks_t oldlen=0; /* old indirect size */
1781 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1782 int state; /* state bits, accessed thru macros */
1783 xfs_filblks_t temp; /* temp for indirect calculations */
1784 enum { /* bit number definitions for state */
1785 LEFT_CONTIG, RIGHT_CONTIG,
1786 LEFT_DELAY, RIGHT_DELAY,
1787 LEFT_VALID, RIGHT_VALID
1788 };
1789
1790#define MASK(b) (1 << (b))
1791#define MASK2(a,b) (MASK(a) | MASK(b))
1792#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1793#define STATE_TEST(b) (state & MASK(b))
1794#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1795 ((state &= ~MASK(b)), 0))
1796#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
1797
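	/*
	 * With delayed allocations there are no real block numbers to
	 * match up, so contiguity is decided on file offsets alone and
	 * only four outcomes (merge left, merge right, merge both,
	 * insert) are possible.  Each merge case recomputes the worst-
	 * case indirect-block reservation for the combined extent, and
	 * any excess is given back to the free-block count at the end.
	 */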
1798 base = ip->i_df.if_u1.if_extents;
1799 ep = &base[idx];
1800 state = 0;
1801 ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
1802 /*
1803 * Check and set flags if this segment has a left neighbor
1804 */
1805 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1806 xfs_bmbt_get_all(ep - 1, &left);
1807 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
1808 }
1809 /*
1810 * Check and set flags if the current (right) segment exists.
1811 * If it doesn't exist, we're converting the hole at end-of-file.
1812 */
1813 if (STATE_SET_TEST(RIGHT_VALID,
1814 idx <
1815 ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1816 xfs_bmbt_get_all(ep, &right);
1817 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
1818 }
1819 /*
1820 * Set contiguity flags on the left and right neighbors.
1821 * Don't let extents get too large, even if the pieces are contiguous.
1822 */
1823 STATE_SET(LEFT_CONTIG,
1824 STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
1825 left.br_startoff + left.br_blockcount == new->br_startoff &&
1826 left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1827 STATE_SET(RIGHT_CONTIG,
1828 STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
1829 new->br_startoff + new->br_blockcount == right.br_startoff &&
1830 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
1831 (!STATE_TEST(LEFT_CONTIG) ||
1832 (left.br_blockcount + new->br_blockcount +
1833 right.br_blockcount <= MAXEXTLEN)));
1834 /*
1835 * Switch out based on the contiguity flags.
1836 */
1837 switch (SWITCH_STATE) {
1838
1839 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
1840 /*
1841 * New allocation is contiguous with delayed allocations
1842 * on the left and on the right.
1843 * Merge all three into a single extent list entry.
1844 */
1845 temp = left.br_blockcount + new->br_blockcount +
1846 right.br_blockcount;
1847 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
1848 XFS_DATA_FORK);
1849 xfs_bmbt_set_blockcount(ep - 1, temp);
1850 oldlen = STARTBLOCKVAL(left.br_startblock) +
1851 STARTBLOCKVAL(new->br_startblock) +
1852 STARTBLOCKVAL(right.br_startblock);
1853 newlen = xfs_bmap_worst_indlen(ip, temp);
1854 xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
1855 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
1856 XFS_DATA_FORK);
1857 xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
1858 XFS_DATA_FORK);
1859 xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
1860 ip->i_df.if_lastex = idx - 1;
1861 break;
1862
1863 case MASK(LEFT_CONTIG):
1864 /*
1865 * New allocation is contiguous with a delayed allocation
1866 * on the left.
1867 * Merge the new allocation with the left neighbor.
1868 */
1869 temp = left.br_blockcount + new->br_blockcount;
1870 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1,
1871 XFS_DATA_FORK);
1872 xfs_bmbt_set_blockcount(ep - 1, temp);
1873 oldlen = STARTBLOCKVAL(left.br_startblock) +
1874 STARTBLOCKVAL(new->br_startblock);
1875 newlen = xfs_bmap_worst_indlen(ip, temp);
1876 xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
1877 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
1878 XFS_DATA_FORK);
1879 ip->i_df.if_lastex = idx - 1;
1880 break;
1881
1882 case MASK(RIGHT_CONTIG):
1883 /*
1884 * New allocation is contiguous with a delayed allocation
1885 * on the right.
1886 * Merge the new allocation with the right neighbor.
1887 */
1888 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK);
1889 temp = new->br_blockcount + right.br_blockcount;
1890 oldlen = STARTBLOCKVAL(new->br_startblock) +
1891 STARTBLOCKVAL(right.br_startblock);
1892 newlen = xfs_bmap_worst_indlen(ip, temp);
1893 xfs_bmbt_set_allf(ep, new->br_startoff,
1894 NULLSTARTBLOCK((int)newlen), temp, right.br_state);
1895 xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
1896 ip->i_df.if_lastex = idx;
1897 break;
1898
1899 case 0:
1900 /*
1901 * New allocation is not contiguous with another
1902 * delayed allocation.
1903 * Insert a new entry.
1904 */
1905 oldlen = newlen = 0;
1906 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
1907 XFS_DATA_FORK);
1908 xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
1909 ip->i_df.if_lastex = idx;
1910 break;
1911 }
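	/*
	 * oldlen sums the indirect-block reservations that backed the
	 * merged pieces; newlen is the worst case for the combined
	 * extent.  A single extent never needs a larger reservation
	 * than its pieces did separately, hence the ASSERT below, and
	 * the difference is returned to the free-block count.
	 */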
1912 if (oldlen != newlen) {
1913 ASSERT(oldlen > newlen);
1914 xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
1915 (int)(oldlen - newlen), rsvd);
1916 /*
1917 * Nothing to do for disk quota accounting here.
1918 */
1919 }
1920 *logflagsp = 0;
1921 return 0;
1922#undef MASK
1923#undef MASK2
1924#undef STATE_SET
1925#undef STATE_TEST
1926#undef STATE_SET_TEST
1927#undef SWITCH_STATE
1928}
1929
1930/*
1931 * Called by xfs_bmap_add_extent to handle cases converting a hole
1932 * to a real allocation.
1933 */
1934STATIC int /* error */
1935xfs_bmap_add_extent_hole_real(
1936 xfs_inode_t *ip, /* incore inode pointer */
1937 xfs_extnum_t idx, /* extent number to update/insert */
1938 xfs_btree_cur_t *cur, /* if null, not a btree */
1939 xfs_bmbt_irec_t *new, /* new data to put in extent list */
1940 int *logflagsp, /* inode logging flags */
1941 int whichfork) /* data or attr fork */
1942{
1943 xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */
1944 int error; /* error return value */
1945#ifdef XFS_BMAP_TRACE
1946 static char fname[] = "xfs_bmap_add_extent_hole_real";
1947#endif
1948 int i; /* temp state */
1949 xfs_ifork_t *ifp; /* inode fork pointer */
1950 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1951 xfs_bmbt_irec_t right; /* right neighbor extent entry */
1952 int state; /* state bits, accessed thru macros */
1953 enum { /* bit number definitions for state */
1954 LEFT_CONTIG, RIGHT_CONTIG,
1955 LEFT_DELAY, RIGHT_DELAY,
1956 LEFT_VALID, RIGHT_VALID
1957 };
1958
1959#define MASK(b) (1 << (b))
1960#define MASK2(a,b) (MASK(a) | MASK(b))
1961#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
1962#define STATE_TEST(b) (state & MASK(b))
1963#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \
1964 ((state &= ~MASK(b)), 0))
1965#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
1966
1967 ifp = XFS_IFORK_PTR(ip, whichfork);
1968 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1969 ep = &ifp->if_u1.if_extents[idx];
1970 state = 0;
1971 /*
1972 * Check and set flags if this segment has a left neighbor.
1973 */
1974 if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
1975 xfs_bmbt_get_all(ep - 1, &left);
1976 STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
1977 }
1978 /*
1979 * Check and set flags if this segment has a current value.
1980 * Not true if we're inserting into the "hole" at eof.
1981 */
1982 if (STATE_SET_TEST(RIGHT_VALID,
1983 idx <
1984 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
1985 xfs_bmbt_get_all(ep, &right);
1986 STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
1987 }
1988 /*
1989 * We're inserting a real allocation between "left" and "right".
1990 * Set the contiguity flags. Don't let extents get too large.
1991 */
1992 STATE_SET(LEFT_CONTIG,
1993 STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
1994 left.br_startoff + left.br_blockcount == new->br_startoff &&
1995 left.br_startblock + left.br_blockcount == new->br_startblock &&
1996 left.br_state == new->br_state &&
1997 left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
1998 STATE_SET(RIGHT_CONTIG,
1999 STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
2000 new->br_startoff + new->br_blockcount == right.br_startoff &&
2001 new->br_startblock + new->br_blockcount ==
2002 right.br_startblock &&
2003 new->br_state == right.br_state &&
2004 new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2005 (!STATE_TEST(LEFT_CONTIG) ||
2006 left.br_blockcount + new->br_blockcount +
2007 right.br_blockcount <= MAXEXTLEN));
2008
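	/*
	 * Unlike the delayed-allocation case above, a real extent is
	 * only contiguous with a neighbor if the block numbers line up
	 * and the extent states match, not just the file offsets.
	 */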
2009 /*
2010 * Select which case we're in here, and implement it.
2011 */
2012 switch (SWITCH_STATE) {
2013
2014 case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
2015 /*
2016 * New allocation is contiguous with real allocations on the
2017 * left and on the right.
2018 * Merge all three into a single extent list entry.
2019 */
2020 xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
2021 whichfork);
2022 xfs_bmbt_set_blockcount(ep - 1,
2023 left.br_blockcount + new->br_blockcount +
2024 right.br_blockcount);
2025 xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
2026 whichfork);
2027 xfs_bmap_trace_delete(fname, "LC|RC", ip,
2028 idx, 1, whichfork);
2029 xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
2030 ifp->if_lastex = idx - 1;
2031 XFS_IFORK_NEXT_SET(ip, whichfork,
2032 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2033 if (cur == NULL) {
2034 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2035 return 0;
2036 }
2037 *logflagsp = XFS_ILOG_CORE;
2038 if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
2039 right.br_startblock, right.br_blockcount, &i)))
2040 return error;
2041 ASSERT(i == 1);
2042 if ((error = xfs_bmbt_delete(cur, &i)))
2043 return error;
2044 ASSERT(i == 1);
2045 if ((error = xfs_bmbt_decrement(cur, 0, &i)))
2046 return error;
2047 ASSERT(i == 1);
2048 error = xfs_bmbt_update(cur, left.br_startoff,
2049 left.br_startblock,
2050 left.br_blockcount + new->br_blockcount +
2051 right.br_blockcount, left.br_state);
2052 return error;
2053
2054 case MASK(LEFT_CONTIG):
2055 /*
2056 * New allocation is contiguous with a real allocation
2057 * on the left.
2058 * Merge the new allocation with the left neighbor.
2059 */
2060 xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork);
2061 xfs_bmbt_set_blockcount(ep - 1,
2062 left.br_blockcount + new->br_blockcount);
2063 xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
2064 ifp->if_lastex = idx - 1;
2065 if (cur == NULL) {
2066 *logflagsp = XFS_ILOG_FEXT(whichfork);
2067 return 0;
2068 }
2069 *logflagsp = 0;
2070 if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
2071 left.br_startblock, left.br_blockcount, &i)))
2072 return error;
2073 ASSERT(i == 1);
2074 error = xfs_bmbt_update(cur, left.br_startoff,
2075 left.br_startblock,
2076 left.br_blockcount + new->br_blockcount,
2077 left.br_state);
2078 return error;
2079
2080 case MASK(RIGHT_CONTIG):
2081 /*
2082 * New allocation is contiguous with a real allocation
2083 * on the right.
2084 * Merge the new allocation with the right neighbor.
2085 */
2086 xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork);
2087 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
2088 new->br_blockcount + right.br_blockcount,
2089 right.br_state);
2090 xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
2091 ifp->if_lastex = idx;
2092 if (cur == NULL) {
2093 *logflagsp = XFS_ILOG_FEXT(whichfork);
2094 return 0;
2095 }
2096 *logflagsp = 0;
2097 if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
2098 right.br_startblock, right.br_blockcount, &i)))
2099 return error;
2100 ASSERT(i == 1);
2101 error = xfs_bmbt_update(cur, new->br_startoff,
2102 new->br_startblock,
2103 new->br_blockcount + right.br_blockcount,
2104 right.br_state);
2105 return error;
2106
2107 case 0:
2108 /*
2109 * New allocation is not contiguous with another
2110 * real allocation.
2111 * Insert a new entry.
2112 */
2113 xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
2114 whichfork);
2115 xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork);
2116 ifp->if_lastex = idx;
2117 XFS_IFORK_NEXT_SET(ip, whichfork,
2118 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
2119 if (cur == NULL) {
2120 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2121 return 0;
2122 }
2123 *logflagsp = XFS_ILOG_CORE;
2124 if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
2125 new->br_startblock, new->br_blockcount, &i)))
2126 return error;
2127 ASSERT(i == 0);
2128 cur->bc_rec.b.br_state = new->br_state;
2129 if ((error = xfs_bmbt_insert(cur, &i)))
2130 return error;
2131 ASSERT(i == 1);
2132 return 0;
2133 }
2134#undef MASK
2135#undef MASK2
2136#undef STATE_SET
2137#undef STATE_TEST
2138#undef STATE_SET_TEST
2139#undef SWITCH_STATE
2140 /* NOTREACHED */
2141 ASSERT(0);
2142	return 0; /* keep gcc quiet */
2143}
2144
2145#define XFS_ALLOC_GAP_UNITS 4
2146
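/*
 * XFS_ALLOC_GAP_UNITS tunes the placement heuristic in xfs_bmap_alloc:
 * a neighboring extent's edge is only projected across the intervening
 * gap when the gap is at most four times the length being allocated;
 * for a larger gap the hint stays at the neighbor's edge, and the
 * unprojected gap counts double when the two candidates are compared.
 */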
2147/*
2148 * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
2149 * It figures out where to ask the underlying allocator to put the new extent.
2150 */
2151STATIC int /* error */
2152xfs_bmap_alloc(
2153 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2154{
2155 xfs_fsblock_t adjust; /* adjustment to block numbers */
2156 xfs_alloctype_t atype=0; /* type for allocation routines */
2157 int error; /* error return value */
2158 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2159 xfs_mount_t *mp; /* mount point structure */
2160 int nullfb; /* true if ap->firstblock isn't set */
2161 int rt; /* true if inode is realtime */
2162#ifdef __KERNEL__
2163 xfs_extlen_t prod=0; /* product factor for allocators */
2164 xfs_extlen_t ralen=0; /* realtime allocation length */
2165#endif
2166
2167#define ISVALID(x,y) \
2168 (rt ? \
2169 (x) < mp->m_sb.sb_rblocks : \
2170 XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
2171 XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
2172 XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
2173
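	/*
	 * ISVALID(x, y) says whether block x is usable as an allocation
	 * hint relative to block y: realtime blocks only need to be
	 * inside the realtime area, while regular blocks must also lie
	 * in the same allocation group as y and within its bounds.
	 */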
2174 /*
2175 * Set up variables.
2176 */
2177 mp = ap->ip->i_mount;
2178 nullfb = ap->firstblock == NULLFSBLOCK;
2179 rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata;
2180 fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
2181#ifdef __KERNEL__
2182 if (rt) {
2183 xfs_extlen_t extsz; /* file extent size for rt */
2184 xfs_fileoff_t nexto; /* next file offset */
2185 xfs_extlen_t orig_alen; /* original ap->alen */
2186 xfs_fileoff_t orig_end; /* original off+len */
2187 xfs_fileoff_t orig_off; /* original ap->off */
2188 xfs_extlen_t mod_off; /* modulus calculations */
2189 xfs_fileoff_t prevo; /* previous file offset */
2190 xfs_rtblock_t rtx; /* realtime extent number */
2191 xfs_extlen_t temp; /* temp for rt calculations */
2192
2193 /*
2194 * Set prod to match the realtime extent size.
2195 */
2196 if (!(extsz = ap->ip->i_d.di_extsize))
2197 extsz = mp->m_sb.sb_rextsize;
2198 prod = extsz / mp->m_sb.sb_rextsize;
2199 orig_off = ap->off;
2200 orig_alen = ap->alen;
2201 orig_end = orig_off + orig_alen;
2202 /*
2203 * If the file offset is unaligned vs. the extent size
2204 * we need to align it. This will be possible unless
2205 * the file was previously written with a kernel that didn't
2206 * perform this alignment.
2207 */
2208 mod_off = do_mod(orig_off, extsz);
2209 if (mod_off) {
2210 ap->alen += mod_off;
2211 ap->off -= mod_off;
2212 }
2213 /*
2214 * Same adjustment for the end of the requested area.
2215 */
2216 if ((temp = (ap->alen % extsz)))
2217 ap->alen += extsz - temp;
2218 /*
2219 * If the previous block overlaps with this proposed allocation
2220 * then move the start forward without adjusting the length.
2221 */
2222 prevo =
2223 ap->prevp->br_startoff == NULLFILEOFF ?
2224 0 :
2225 (ap->prevp->br_startoff +
2226 ap->prevp->br_blockcount);
2227 if (ap->off != orig_off && ap->off < prevo)
2228 ap->off = prevo;
2229 /*
2230 * If the next block overlaps with this proposed allocation
2231 * then move the start back without adjusting the length,
2232 * but not before offset 0.
2233 * This may of course make the start overlap previous block,
2234 * and if we hit the offset 0 limit then the next block
2235 * can still overlap too.
2236 */
2237 nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ?
2238 NULLFILEOFF : ap->gotp->br_startoff;
2239 if (!ap->eof &&
2240 ap->off + ap->alen != orig_end &&
2241 ap->off + ap->alen > nexto)
2242 ap->off = nexto > ap->alen ? nexto - ap->alen : 0;
2243 /*
2244 * If we're now overlapping the next or previous extent that
2245 * means we can't fit an extsz piece in this hole. Just move
2246 * the start forward to the first valid spot and set
2247 * the length so we hit the end.
2248 */
2249 if ((ap->off != orig_off && ap->off < prevo) ||
2250 (ap->off + ap->alen != orig_end &&
2251 ap->off + ap->alen > nexto)) {
2252 ap->off = prevo;
2253 ap->alen = nexto - prevo;
2254 }
2255 /*
2256 * If the result isn't a multiple of rtextents we need to
2257 * remove blocks until it is.
2258 */
2259 if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) {
2260 /*
2261 * We're not covering the original request, or
2262 * we won't be able to once we fix the length.
2263 */
2264 if (orig_off < ap->off ||
2265 orig_end > ap->off + ap->alen ||
2266 ap->alen - temp < orig_alen)
2267 return XFS_ERROR(EINVAL);
2268 /*
2269 * Try to fix it by moving the start up.
2270 */
2271 if (ap->off + temp <= orig_off) {
2272 ap->alen -= temp;
2273 ap->off += temp;
2274 }
2275 /*
2276 * Try to fix it by moving the end in.
2277 */
2278 else if (ap->off + ap->alen - temp >= orig_end)
2279 ap->alen -= temp;
2280 /*
2281 * Set the start to the minimum then trim the length.
2282 */
2283 else {
2284 ap->alen -= orig_off - ap->off;
2285 ap->off = orig_off;
2286 ap->alen -= ap->alen % mp->m_sb.sb_rextsize;
2287 }
2288 /*
2289 * Result doesn't cover the request, fail it.
2290 */
2291 if (orig_off < ap->off || orig_end > ap->off + ap->alen)
2292 return XFS_ERROR(EINVAL);
2293 }
2294 ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
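		/*
		 * A worked example of the alignment above, assuming
		 * extsz = 4 and no neighbor overlap: a request at
		 * offset 10 for 3 blocks has mod_off = 2, so it grows
		 * to offset 8, length 5, and the tail is then rounded
		 * up to length 8, so blocks [8,16) fully cover the
		 * original [10,13).
		 */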
2295 /*
2296 * If the offset & length are not perfectly aligned
2297 * then kill prod, it will just get us in trouble.
2298 */
2299 if (do_mod(ap->off, extsz) || ap->alen % extsz)
2300 prod = 1;
2301 /*
2302 * Set ralen to be the actual requested length in rtextents.
2303 */
2304 ralen = ap->alen / mp->m_sb.sb_rextsize;
2305		/*
2306		 * If the old value was close enough to MAXEXTLEN that
2307		 * we rounded up to it, cut it back so it's valid again.
2308		 * A really large request (bigger than MAXEXTLEN) has
2309		 * already been clamped before we get here, so we can't
2310		 * adjust the starting point to match the original length.
2311		 */
2312 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2313 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2314 /*
2315 * If it's an allocation to an empty file at offset 0,
2316 * pick an extent that will space things out in the rt area.
2317 */
2318 if (ap->eof && ap->off == 0) {
2319 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
2320 if (error)
2321 return error;
2322 ap->rval = rtx * mp->m_sb.sb_rextsize;
2323 } else
2324 ap->rval = 0;
2325 }
2326#else
2327 if (rt)
2328 ap->rval = 0;
2329#endif /* __KERNEL__ */
2330 else if (nullfb)
2331 ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
2332 else
2333 ap->rval = ap->firstblock;
2334 /*
2335 * If allocating at eof, and there's a previous real block,
2336	 * try to use its last block as our starting point.
2337 */
2338 if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
2339 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
2340 ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
2341 ap->prevp->br_startblock)) {
2342 ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
2343 /*
2344 * Adjust for the gap between prevp and us.
2345 */
2346 adjust = ap->off -
2347 (ap->prevp->br_startoff + ap->prevp->br_blockcount);
2348 if (adjust &&
2349 ISVALID(ap->rval + adjust, ap->prevp->br_startblock))
2350 ap->rval += adjust;
2351 }
2352 /*
2353 * If not at eof, then compare the two neighbor blocks.
2354 * Figure out whether either one gives us a good starting point,
2355 * and pick the better one.
2356 */
2357 else if (!ap->eof) {
2358 xfs_fsblock_t gotbno; /* right side block number */
2359 xfs_fsblock_t gotdiff=0; /* right side difference */
2360 xfs_fsblock_t prevbno; /* left side block number */
2361 xfs_fsblock_t prevdiff=0; /* left side difference */
2362
2363 /*
2364 * If there's a previous (left) block, select a requested
2365 * start block based on it.
2366 */
2367 if (ap->prevp->br_startoff != NULLFILEOFF &&
2368 !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
2369 (prevbno = ap->prevp->br_startblock +
2370 ap->prevp->br_blockcount) &&
2371 ISVALID(prevbno, ap->prevp->br_startblock)) {
2372 /*
2373 * Calculate gap to end of previous block.
2374 */
2375 adjust = prevdiff = ap->off -
2376 (ap->prevp->br_startoff +
2377 ap->prevp->br_blockcount);
2378 /*
2379 * Figure the startblock based on the previous block's
2380 * end and the gap size.
2381 * Heuristic!
2382 * If the gap is large relative to the piece we're
2383 * allocating, or using it gives us an invalid block
2384 * number, then just use the end of the previous block.
2385 */
2386 if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
2387 ISVALID(prevbno + prevdiff,
2388 ap->prevp->br_startblock))
2389 prevbno += adjust;
2390 else
2391 prevdiff += adjust;
2392 /*
2393 * If the firstblock forbids it, can't use it,
2394 * must use default.
2395 */
2396 if (!rt && !nullfb &&
2397 XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
2398 prevbno = NULLFSBLOCK;
2399 }
2400 /*
2401 * No previous block or can't follow it, just default.
2402 */
2403 else
2404 prevbno = NULLFSBLOCK;
2405 /*
2406 * If there's a following (right) block, select a requested
2407 * start block based on it.
2408 */
2409 if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
2410 /*
2411 * Calculate gap to start of next block.
2412 */
2413 adjust = gotdiff = ap->gotp->br_startoff - ap->off;
2414 /*
2415 * Figure the startblock based on the next block's
2416 * start and the gap size.
2417 */
2418 gotbno = ap->gotp->br_startblock;
2419 /*
2420 * Heuristic!
2421 * If the gap is large relative to the piece we're
2422 * allocating, or using it gives us an invalid block
2423 * number, then just use the start of the next block
2424 * offset by our length.
2425 */
2426 if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
2427 ISVALID(gotbno - gotdiff, gotbno))
2428 gotbno -= adjust;
2429 else if (ISVALID(gotbno - ap->alen, gotbno)) {
2430 gotbno -= ap->alen;
2431 gotdiff += adjust - ap->alen;
2432 } else
2433 gotdiff += adjust;
2434 /*
2435 * If the firstblock forbids it, can't use it,
2436 * must use default.
2437 */
2438 if (!rt && !nullfb &&
2439 XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
2440 gotbno = NULLFSBLOCK;
2441 }
2442 /*
2443 * No next block, just default.
2444 */
2445 else
2446 gotbno = NULLFSBLOCK;
2447 /*
2448 * If both valid, pick the better one, else the only good
2449 * one, else ap->rval is already set (to 0 or the inode block).
2450 */
2451 if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
2452 ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
2453 else if (prevbno != NULLFSBLOCK)
2454 ap->rval = prevbno;
2455 else if (gotbno != NULLFSBLOCK)
2456 ap->rval = gotbno;
2457 }
2458 /*
2459 * If allowed, use ap->rval; otherwise must use firstblock since
2460 * it's in the right allocation group.
2461 */
2462 if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
2463 ;
2464 else
2465 ap->rval = ap->firstblock;
2466 /*
2467 * Realtime allocation, done through xfs_rtallocate_extent.
2468 */
2469 if (rt) {
2470#ifndef __KERNEL__
2471 ASSERT(0);
2472#else
2473 xfs_rtblock_t rtb;
2474
2475 atype = ap->rval == 0 ?
2476 XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
2477 do_div(ap->rval, mp->m_sb.sb_rextsize);
2478 rtb = ap->rval;
2479 ap->alen = ralen;
2480 if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
2481 &ralen, atype, ap->wasdel, prod, &rtb)))
2482 return error;
2483 if (rtb == NULLFSBLOCK && prod > 1 &&
2484 (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
2485 ap->alen, &ralen, atype,
2486 ap->wasdel, 1, &rtb)))
2487 return error;
2488 ap->rval = rtb;
2489 if (ap->rval != NULLFSBLOCK) {
2490 ap->rval *= mp->m_sb.sb_rextsize;
2491 ralen *= mp->m_sb.sb_rextsize;
2492 ap->alen = ralen;
2493 ap->ip->i_d.di_nblocks += ralen;
2494 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2495 if (ap->wasdel)
2496 ap->ip->i_delayed_blks -= ralen;
2497 /*
2498 * Adjust the disk quota also. This was reserved
2499 * earlier.
2500 */
2501 XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
2502 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
2503 XFS_TRANS_DQ_RTBCOUNT,
2504 (long) ralen);
2505 } else
2506 ap->alen = 0;
2507#endif /* __KERNEL__ */
2508 }
2509 /*
2510 * Normal allocation, done through xfs_alloc_vextent.
2511 */
2512 else {
2513 xfs_agnumber_t ag;
2514 xfs_alloc_arg_t args;
2515 xfs_extlen_t blen;
2516 xfs_extlen_t delta;
2517 int isaligned;
2518 xfs_extlen_t longest;
2519 xfs_extlen_t need;
2520 xfs_extlen_t nextminlen=0;
2521 int notinit;
2522 xfs_perag_t *pag;
2523 xfs_agnumber_t startag;
2524 int tryagain;
2525
2526 tryagain = isaligned = 0;
2527 args.tp = ap->tp;
2528 args.mp = mp;
2529 args.fsbno = ap->rval;
2530 args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
2531 blen = 0;
2532 if (nullfb) {
2533 args.type = XFS_ALLOCTYPE_START_BNO;
2534 args.total = ap->total;
2535 /*
2536 * Find the longest available space.
2537 * We're going to try for the whole allocation at once.
2538 */
2539 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2540 notinit = 0;
2541 down_read(&mp->m_peraglock);
2542 while (blen < ap->alen) {
2543 pag = &mp->m_perag[ag];
2544 if (!pag->pagf_init &&
2545 (error = xfs_alloc_pagf_init(mp, args.tp,
2546 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2547 up_read(&mp->m_peraglock);
2548 return error;
2549 }
2550 /*
2551 * See xfs_alloc_fix_freelist...
2552 */
2553 if (pag->pagf_init) {
2554 need = XFS_MIN_FREELIST_PAG(pag, mp);
2555 delta = need > pag->pagf_flcount ?
2556 need - pag->pagf_flcount : 0;
2557 longest = (pag->pagf_longest > delta) ?
2558 (pag->pagf_longest - delta) :
2559 (pag->pagf_flcount > 0 ||
2560 pag->pagf_longest > 0);
2561 if (blen < longest)
2562 blen = longest;
2563 } else
2564 notinit = 1;
2565 if (++ag == mp->m_sb.sb_agcount)
2566 ag = 0;
2567 if (ag == startag)
2568 break;
2569 }
2570 up_read(&mp->m_peraglock);
2571			/*
2572			 * The AGF reads above used trylock and may have skipped
2573			 * AGs, so there may still be space for this request.
2574			 */
2575 if (notinit || blen < ap->minlen)
2576 args.minlen = ap->minlen;
2577 /*
2578 * If the best seen length is less than the request
2579 * length, use the best as the minimum.
2580 */
2581 else if (blen < ap->alen)
2582 args.minlen = blen;
2583 /*
2584 * Otherwise we've seen an extent as big as alen,
2585 * use that as the minimum.
2586 */
2587 else
2588 args.minlen = ap->alen;
2589 } else if (ap->low) {
2590 args.type = XFS_ALLOCTYPE_FIRST_AG;
2591 args.total = args.minlen = ap->minlen;
2592 } else {
2593 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2594 args.total = ap->total;
2595 args.minlen = ap->minlen;
2596 }
2597 if (ap->ip->i_d.di_extsize) {
2598 args.prod = ap->ip->i_d.di_extsize;
2599 if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
2600 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2601 } else if (mp->m_sb.sb_blocksize >= NBPP) {
2602 args.prod = 1;
2603 args.mod = 0;
2604 } else {
2605 args.prod = NBPP >> mp->m_sb.sb_blocklog;
2606 if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
2607 args.mod = (xfs_extlen_t)(args.prod - args.mod);
2608 }
2609		/*
2610		 * If we are not low on available data blocks, the
2611		 * underlying logical volume manager is a stripe, and
2612		 * the file offset is zero, then try to allocate data
2613		 * blocks on a stripe unit boundary.
2614		 * NOTE: ap->aeof is only set if the allocation length
2615		 * is >= the stripe unit and the allocation offset is
2616		 * at the end of file.
2617		 */
2618 if (!ap->low && ap->aeof) {
2619 if (!ap->off) {
2620 args.alignment = mp->m_dalign;
2621 atype = args.type;
2622 isaligned = 1;
2623 /*
2624 * Adjust for alignment
2625 */
2626 if (blen > args.alignment && blen <= ap->alen)
2627 args.minlen = blen - args.alignment;
2628 args.minalignslop = 0;
2629 } else {
2630 /*
2631 * First try an exact bno allocation.
2632 * If it fails then do a near or start bno
2633 * allocation with alignment turned on.
2634 */
2635 atype = args.type;
2636 tryagain = 1;
2637 args.type = XFS_ALLOCTYPE_THIS_BNO;
2638 args.alignment = 1;
2639 /*
2640 * Compute the minlen+alignment for the
2641 * next case. Set slop so that the value
2642 * of minlen+alignment+slop doesn't go up
2643 * between the calls.
2644 */
2645 if (blen > mp->m_dalign && blen <= ap->alen)
2646 nextminlen = blen - mp->m_dalign;
2647 else
2648 nextminlen = args.minlen;
2649 if (nextminlen + mp->m_dalign > args.minlen + 1)
2650 args.minalignslop =
2651 nextminlen + mp->m_dalign -
2652 args.minlen - 1;
2653 else
2654 args.minalignslop = 0;
2655 }
2656 } else {
2657 args.alignment = 1;
2658 args.minalignslop = 0;
2659 }
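		/*
		 * For instance, with blen == 0, args.minlen = 100 and a
		 * stripe unit of 16, the exact-bno attempt above runs
		 * with alignment 1 and minalignslop 15, so that
		 * minlen + alignment + slop = 116 matches the aligned
		 * retry's worst case of 100 + 16 + 0.
		 */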
2660 args.minleft = ap->minleft;
2661 args.wasdel = ap->wasdel;
2662 args.isfl = 0;
2663 args.userdata = ap->userdata;
2664 if ((error = xfs_alloc_vextent(&args)))
2665 return error;
2666 if (tryagain && args.fsbno == NULLFSBLOCK) {
2667 /*
2668 * Exact allocation failed. Now try with alignment
2669 * turned on.
2670 */
2671 args.type = atype;
2672 args.fsbno = ap->rval;
2673 args.alignment = mp->m_dalign;
2674 args.minlen = nextminlen;
2675 args.minalignslop = 0;
2676 isaligned = 1;
2677 if ((error = xfs_alloc_vextent(&args)))
2678 return error;
2679 }
2680 if (isaligned && args.fsbno == NULLFSBLOCK) {
2681			/*
2682			 * Allocation failed, so turn off alignment and
2683			 * try again.
2684			 */
2685 args.type = atype;
2686 args.fsbno = ap->rval;
2687 args.alignment = 0;
2688 if ((error = xfs_alloc_vextent(&args)))
2689 return error;
2690 }
2691 if (args.fsbno == NULLFSBLOCK && nullfb &&
2692 args.minlen > ap->minlen) {
2693 args.minlen = ap->minlen;
2694 args.type = XFS_ALLOCTYPE_START_BNO;
2695 args.fsbno = ap->rval;
2696 if ((error = xfs_alloc_vextent(&args)))
2697 return error;
2698 }
2699 if (args.fsbno == NULLFSBLOCK && nullfb) {
2700 args.fsbno = 0;
2701 args.type = XFS_ALLOCTYPE_FIRST_AG;
2702 args.total = ap->minlen;
2703 args.minleft = 0;
2704 if ((error = xfs_alloc_vextent(&args)))
2705 return error;
2706 ap->low = 1;
2707 }
2708 if (args.fsbno != NULLFSBLOCK) {
2709 ap->firstblock = ap->rval = args.fsbno;
2710 ASSERT(nullfb || fb_agno == args.agno ||
2711 (ap->low && fb_agno < args.agno));
2712 ap->alen = args.len;
2713 ap->ip->i_d.di_nblocks += args.len;
2714 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
2715 if (ap->wasdel)
2716 ap->ip->i_delayed_blks -= args.len;
2717 /*
2718 * Adjust the disk quota also. This was reserved
2719 * earlier.
2720 */
2721 XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
2722 ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
2723 XFS_TRANS_DQ_BCOUNT,
2724 (long) args.len);
2725 } else {
2726 ap->rval = NULLFSBLOCK;
2727 ap->alen = 0;
2728 }
2729 }
2730 return 0;
2731#undef ISVALID
2732}
2733
2734/*
2735 * Transform a btree format file with only one leaf node, where the
2736 * extents list will fit in the inode, into an extents format file.
2737 * Since the extent list is already in-core, all we have to do is
2738 * give up the space for the btree root and pitch the leaf block.
2739 */
2740STATIC int /* error */
2741xfs_bmap_btree_to_extents(
2742 xfs_trans_t *tp, /* transaction pointer */
2743 xfs_inode_t *ip, /* incore inode pointer */
2744 xfs_btree_cur_t *cur, /* btree cursor */
2745 int *logflagsp, /* inode logging flags */
2746 int whichfork) /* data or attr fork */
2747{
2748 /* REFERENCED */
2749 xfs_bmbt_block_t *cblock;/* child btree block */
2750 xfs_fsblock_t cbno; /* child block number */
2751 xfs_buf_t *cbp; /* child block's buffer */
2752 int error; /* error return value */
2753 xfs_ifork_t *ifp; /* inode fork data */
2754 xfs_mount_t *mp; /* mount point structure */
2755 xfs_bmbt_ptr_t *pp; /* ptr to block address */
2756 xfs_bmbt_block_t *rblock;/* root btree block */
2757
2758 ifp = XFS_IFORK_PTR(ip, whichfork);
2759 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2760 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
2761 rblock = ifp->if_broot;
2762 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1);
2763 ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1);
2764 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
2765 mp = ip->i_mount;
2766 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
2767 *logflagsp = 0;
2768#ifdef DEBUG
2769 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
2770 return error;
2771#endif
2772 cbno = INT_GET(*pp, ARCH_CONVERT);
2773 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
2774 XFS_BMAP_BTREE_REF)))
2775 return error;
2776 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
2777 if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
2778 return error;
2779 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
2780 ip->i_d.di_nblocks--;
2781 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
2782 xfs_trans_binval(tp, cbp);
2783 if (cur->bc_bufs[0] == cbp)
2784 cur->bc_bufs[0] = NULL;
2785 xfs_iroot_realloc(ip, -1, whichfork);
2786 ASSERT(ifp->if_broot == NULL);
2787 ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
2788 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
2789 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
2790 return 0;
2791}
2792
2793/*
2794 * Called by xfs_bmapi to update extent list structure and the btree
2795 * after removing space (or undoing a delayed allocation).
2796 */
2797STATIC int /* error */
2798xfs_bmap_del_extent(
2799 xfs_inode_t *ip, /* incore inode pointer */
2800 xfs_trans_t *tp, /* current transaction pointer */
2801 xfs_extnum_t idx, /* extent number to update/delete */
2802 xfs_bmap_free_t *flist, /* list of extents to be freed */
2803 xfs_btree_cur_t *cur, /* if null, not a btree */
2804 xfs_bmbt_irec_t *del, /* data to remove from extent list */
2805 int *logflagsp, /* inode logging flags */
2806 int whichfork, /* data or attr fork */
2807 int rsvd) /* OK to allocate reserved blocks */
2808{
2809 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2810 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
2811 xfs_fsblock_t del_endblock=0; /* first block past del */
2812 xfs_fileoff_t del_endoff; /* first offset past del */
2813 int delay; /* current block is delayed allocated */
2814 int do_fx; /* free extent at end of routine */
2815 xfs_bmbt_rec_t *ep; /* current extent entry pointer */
2816 int error; /* error return value */
2817 int flags; /* inode logging flags */
2818#ifdef XFS_BMAP_TRACE
2819 static char fname[] = "xfs_bmap_del_extent";
2820#endif
2821 xfs_bmbt_irec_t got; /* current extent entry */
2822 xfs_fileoff_t got_endoff; /* first offset past got */
2823 int i; /* temp state */
2824 xfs_ifork_t *ifp; /* inode fork pointer */
2825 xfs_mount_t *mp; /* mount structure */
2826 xfs_filblks_t nblks; /* quota/sb block count */
2827 xfs_bmbt_irec_t new; /* new record to be inserted */
2828 /* REFERENCED */
2829 xfs_extnum_t nextents; /* number of extents in list */
2830 uint qfield; /* quota field to update */
2831 xfs_filblks_t temp; /* for indirect length calculations */
2832 xfs_filblks_t temp2; /* for indirect length calculations */
2833
2834 XFS_STATS_INC(xs_del_exlist);
2835 mp = ip->i_mount;
2836 ifp = XFS_IFORK_PTR(ip, whichfork);
2837 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2838 ASSERT(idx >= 0 && idx < nextents);
2839 ASSERT(del->br_blockcount > 0);
2840 ep = &ifp->if_u1.if_extents[idx];
2841 xfs_bmbt_get_all(ep, &got);
2842 ASSERT(got.br_startoff <= del->br_startoff);
2843 del_endoff = del->br_startoff + del->br_blockcount;
2844 got_endoff = got.br_startoff + got.br_blockcount;
2845 ASSERT(got_endoff >= del_endoff);
2846 delay = ISNULLSTARTBLOCK(got.br_startblock);
2847 ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
2848 flags = 0;
2849 qfield = 0;
2850 error = 0;
2851 /*
2852 * If deleting a real allocation, must free up the disk space.
2853 */
2854 if (!delay) {
2855 flags = XFS_ILOG_CORE;
2856 /*
2857 * Realtime allocation. Free it and record di_nblocks update.
2858 */
2859 if (whichfork == XFS_DATA_FORK &&
2860 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
2861 xfs_fsblock_t bno;
2862 xfs_filblks_t len;
2863
2864 ASSERT(do_mod(del->br_blockcount,
2865 mp->m_sb.sb_rextsize) == 0);
2866 ASSERT(do_mod(del->br_startblock,
2867 mp->m_sb.sb_rextsize) == 0);
2868 bno = del->br_startblock;
2869 len = del->br_blockcount;
2870 do_div(bno, mp->m_sb.sb_rextsize);
2871 do_div(len, mp->m_sb.sb_rextsize);
2872 if ((error = xfs_rtfree_extent(ip->i_transp, bno,
2873 (xfs_extlen_t)len)))
2874 goto done;
2875 do_fx = 0;
2876 nblks = len * mp->m_sb.sb_rextsize;
2877 qfield = XFS_TRANS_DQ_RTBCOUNT;
2878 }
2879 /*
2880 * Ordinary allocation.
2881 */
2882 else {
2883 do_fx = 1;
2884 nblks = del->br_blockcount;
2885 qfield = XFS_TRANS_DQ_BCOUNT;
2886 }
2887 /*
2888 * Set up del_endblock and cur for later.
2889 */
2890 del_endblock = del->br_startblock + del->br_blockcount;
2891 if (cur) {
2892 if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
2893 got.br_startblock, got.br_blockcount,
2894 &i)))
2895 goto done;
2896 ASSERT(i == 1);
2897 }
2898 da_old = da_new = 0;
2899 } else {
2900 da_old = STARTBLOCKVAL(got.br_startblock);
2901 da_new = 0;
2902 nblks = 0;
2903 do_fx = 0;
2904 }
2905	/*
2906	 * Set flag value to use in switch statement: bit 1 (value 2) is set
2907	 * if del starts at got's start, bit 0 (value 1) if it ends at got's end.
2908	 */
2909 switch (((got.br_startoff == del->br_startoff) << 1) |
2910 (got_endoff == del_endoff)) {
2911 case 3:
2912 /*
2913 * Matches the whole extent. Delete the entry.
2914 */
2915 xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork);
2916 xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
2917 ifp->if_lastex = idx;
2918 if (delay)
2919 break;
2920 XFS_IFORK_NEXT_SET(ip, whichfork,
2921 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2922 flags |= XFS_ILOG_CORE;
2923 if (!cur) {
2924 flags |= XFS_ILOG_FEXT(whichfork);
2925 break;
2926 }
2927 if ((error = xfs_bmbt_delete(cur, &i)))
2928 goto done;
2929 ASSERT(i == 1);
2930 break;
2931
2932 case 2:
2933 /*
2934 * Deleting the first part of the extent.
2935 */
2936 xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork);
2937 xfs_bmbt_set_startoff(ep, del_endoff);
2938 temp = got.br_blockcount - del->br_blockcount;
2939 xfs_bmbt_set_blockcount(ep, temp);
2940 ifp->if_lastex = idx;
2941 if (delay) {
2942 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2943 da_old);
2944 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
2945 xfs_bmap_trace_post_update(fname, "2", ip, idx,
2946 whichfork);
2947 da_new = temp;
2948 break;
2949 }
2950 xfs_bmbt_set_startblock(ep, del_endblock);
2951 xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork);
2952 if (!cur) {
2953 flags |= XFS_ILOG_FEXT(whichfork);
2954 break;
2955 }
2956 if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
2957 got.br_blockcount - del->br_blockcount,
2958 got.br_state)))
2959 goto done;
2960 break;
2961
2962 case 1:
2963 /*
2964 * Deleting the last part of the extent.
2965 */
2966 temp = got.br_blockcount - del->br_blockcount;
2967 xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork);
2968 xfs_bmbt_set_blockcount(ep, temp);
2969 ifp->if_lastex = idx;
2970 if (delay) {
2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2972 da_old);
2973 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
2974 xfs_bmap_trace_post_update(fname, "1", ip, idx,
2975 whichfork);
2976 da_new = temp;
2977 break;
2978 }
2979 xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork);
2980 if (!cur) {
2981 flags |= XFS_ILOG_FEXT(whichfork);
2982 break;
2983 }
2984 if ((error = xfs_bmbt_update(cur, got.br_startoff,
2985 got.br_startblock,
2986 got.br_blockcount - del->br_blockcount,
2987 got.br_state)))
2988 goto done;
2989 break;
2990
2991 case 0:
2992 /*
2993 * Deleting the middle of the extent.
2994 */
2995 temp = del->br_startoff - got.br_startoff;
2996 xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork);
2997 xfs_bmbt_set_blockcount(ep, temp);
2998 new.br_startoff = del_endoff;
2999 temp2 = got_endoff - del_endoff;
3000 new.br_blockcount = temp2;
3001 new.br_state = got.br_state;
3002 if (!delay) {
3003 new.br_startblock = del_endblock;
3004 flags |= XFS_ILOG_CORE;
3005 if (cur) {
3006 if ((error = xfs_bmbt_update(cur,
3007 got.br_startoff,
3008 got.br_startblock, temp,
3009 got.br_state)))
3010 goto done;
3011 if ((error = xfs_bmbt_increment(cur, 0, &i)))
3012 goto done;
3013 cur->bc_rec.b = new;
3014 error = xfs_bmbt_insert(cur, &i);
3015 if (error && error != ENOSPC)
3016 goto done;
3017				/*
3018				 * If we get no-space back from the btree
3019				 * insert, it tried a split and we have a
3020				 * zero block reservation.
3021				 * Fix up our state and return the error.
3022				 */
3023 if (error == ENOSPC) {
3024 /*
3025 * Reset the cursor, don't trust
3026 * it after any insert operation.
3027 */
3028 if ((error = xfs_bmbt_lookup_eq(cur,
3029 got.br_startoff,
3030 got.br_startblock,
3031 temp, &i)))
3032 goto done;
3033 ASSERT(i == 1);
3034 /*
3035 * Update the btree record back
3036 * to the original value.
3037 */
3038 if ((error = xfs_bmbt_update(cur,
3039 got.br_startoff,
3040 got.br_startblock,
3041 got.br_blockcount,
3042 got.br_state)))
3043 goto done;
3044 /*
3045 * Reset the extent record back
3046 * to the original value.
3047 */
3048 xfs_bmbt_set_blockcount(ep,
3049 got.br_blockcount);
3050 flags = 0;
3051 error = XFS_ERROR(ENOSPC);
3052 goto done;
3053 }
3054 ASSERT(i == 1);
3055 } else
3056 flags |= XFS_ILOG_FEXT(whichfork);
3057 XFS_IFORK_NEXT_SET(ip, whichfork,
3058 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
3059 } else {
3060 ASSERT(whichfork == XFS_DATA_FORK);
3061 temp = xfs_bmap_worst_indlen(ip, temp);
3062 xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
3063 temp2 = xfs_bmap_worst_indlen(ip, temp2);
3064 new.br_startblock = NULLSTARTBLOCK((int)temp2);
3065 da_new = temp + temp2;
3066 while (da_new > da_old) {
3067 if (temp) {
3068 temp--;
3069 da_new--;
3070 xfs_bmbt_set_startblock(ep,
3071 NULLSTARTBLOCK((int)temp));
3072 }
3073 if (da_new == da_old)
3074 break;
3075 if (temp2) {
3076 temp2--;
3077 da_new--;
3078 new.br_startblock =
3079 NULLSTARTBLOCK((int)temp2);
3080 }
3081 }
3082 }
3083 xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork);
3084 xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL,
3085 whichfork);
3086 xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork);
3087 ifp->if_lastex = idx + 1;
3088 break;
3089 }
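	/*
	 * Splitting one delayed extent into two (case 0 above) can make
	 * the sum of the two worst-case indirect reservations exceed
	 * the old single one; the loop there steals blocks back from
	 * the halves one at a time until da_new fits within da_old.
	 */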
3090 /*
3091 * If we need to, add to list of extents to delete.
3092 */
3093 if (do_fx)
3094 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
3095 mp);
3096 /*
3097 * Adjust inode # blocks in the file.
3098 */
3099 if (nblks)
3100 ip->i_d.di_nblocks -= nblks;
3101 /*
3102 * Adjust quota data.
3103 */
3104 if (qfield)
3105 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
3106
3107 /*
3108 * Account for change in delayed indirect blocks.
3109 * Nothing to do for disk quota accounting here.
3110 */
3111 ASSERT(da_old >= da_new);
3112 if (da_old > da_new)
3113 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
3114 rsvd);
3115done:
3116 *logflagsp = flags;
3117 return error;
3118}
3119
3120/*
3121 * Remove the entry "free" from the free item list. Prev points to the
3122 * previous entry, unless "free" is the head of the list.
3123 */
3124STATIC void
3125xfs_bmap_del_free(
3126 xfs_bmap_free_t *flist, /* free item list header */
3127 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
3128 xfs_bmap_free_item_t *free) /* list item to be freed */
3129{
3130 if (prev)
3131 prev->xbfi_next = free->xbfi_next;
3132 else
3133 flist->xbf_first = free->xbfi_next;
3134 flist->xbf_count--;
3135 kmem_zone_free(xfs_bmap_free_item_zone, free);
3136}
3137
3138/*
3139 * Remove count entries from the extents array for inode "ip", starting
3140 * at index "idx". Copies the remaining items down over the deleted ones,
3141 * and gives back the excess memory.
3142 */
3143STATIC void
3144xfs_bmap_delete_exlist(
3145 xfs_inode_t *ip, /* incore inode pointer */
3146 xfs_extnum_t idx, /* starting delete index */
3147 xfs_extnum_t count, /* count of items to delete */
3148 int whichfork) /* data or attr fork */
3149{
3150 xfs_bmbt_rec_t *base; /* base of extent list */
3151 xfs_ifork_t *ifp; /* inode fork pointer */
3152 xfs_extnum_t nextents; /* number of extents in list after */
3153
3154 ifp = XFS_IFORK_PTR(ip, whichfork);
3155 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3156 base = ifp->if_u1.if_extents;
3157 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
3158 memmove(&base[idx], &base[idx + count],
3159 (nextents - idx) * sizeof(*base));
3160 xfs_iext_realloc(ip, -count, whichfork);
3161}
3162
3163/*
3164 * Convert an extents-format file into a btree-format file.
3165 * The new file will have a root block (in the inode) and a single child block.
3166 */
3167STATIC int /* error */
3168xfs_bmap_extents_to_btree(
3169 xfs_trans_t *tp, /* transaction pointer */
3170 xfs_inode_t *ip, /* incore inode pointer */
3171 xfs_fsblock_t *firstblock, /* first-block-allocated */
3172 xfs_bmap_free_t *flist, /* blocks freed in xaction */
3173 xfs_btree_cur_t **curp, /* cursor returned to caller */
3174 int wasdel, /* converting a delayed alloc */
3175 int *logflagsp, /* inode logging flags */
3176 int whichfork) /* data or attr fork */
3177{
3178 xfs_bmbt_block_t *ablock; /* allocated (child) bt block */
3179 xfs_buf_t *abp; /* buffer for ablock */
3180 xfs_alloc_arg_t args; /* allocation arguments */
3181 xfs_bmbt_rec_t *arp; /* child record pointer */
3182 xfs_bmbt_block_t *block; /* btree root block */
3183 xfs_btree_cur_t *cur; /* bmap btree cursor */
3184 xfs_bmbt_rec_t *ep; /* extent list pointer */
3185 int error; /* error return value */
3186 xfs_extnum_t i, cnt; /* extent list index */
3187 xfs_ifork_t *ifp; /* inode fork pointer */
3188 xfs_bmbt_key_t *kp; /* root block key pointer */
3189 xfs_mount_t *mp; /* mount structure */
3190 xfs_extnum_t nextents; /* extent list size */
3191 xfs_bmbt_ptr_t *pp; /* root block address pointer */
3192
3193 ifp = XFS_IFORK_PTR(ip, whichfork);
3194 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3195 ASSERT(ifp->if_ext_max ==
3196 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3197 /*
3198 * Make space in the inode incore.
3199 */
3200 xfs_iroot_realloc(ip, 1, whichfork);
3201 ifp->if_flags |= XFS_IFBROOT;
3202 /*
3203 * Fill in the root.
3204 */
3205 block = ifp->if_broot;
3206 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
3207 INT_SET(block->bb_level, ARCH_CONVERT, 1);
3208 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
3209 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
3210 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
3211 /*
3212 * Need a cursor. Can't allocate until bb_level is filled in.
3213 */
3214 mp = ip->i_mount;
3215 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
3216 whichfork);
3217 cur->bc_private.b.firstblock = *firstblock;
3218 cur->bc_private.b.flist = flist;
3219 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
3220 /*
3221 * Convert to a btree with two levels, one record in root.
3222 */
3223 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
3224 args.tp = tp;
3225 args.mp = mp;
3226 if (*firstblock == NULLFSBLOCK) {
3227 args.type = XFS_ALLOCTYPE_START_BNO;
3228 args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
3229 } else if (flist->xbf_low) {
3230 args.type = XFS_ALLOCTYPE_START_BNO;
3231 args.fsbno = *firstblock;
3232 } else {
3233 args.type = XFS_ALLOCTYPE_NEAR_BNO;
3234 args.fsbno = *firstblock;
3235 }
3236 args.minlen = args.maxlen = args.prod = 1;
3237 args.total = args.minleft = args.alignment = args.mod = args.isfl =
3238 args.minalignslop = 0;
3239 args.wasdel = wasdel;
3240 *logflagsp = 0;
3241 if ((error = xfs_alloc_vextent(&args))) {
3242 xfs_iroot_realloc(ip, -1, whichfork);
3243 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
3244 return error;
3245 }
3246 /*
3247	 * Allocation can't fail; the space was reserved.
3248 */
3249 ASSERT(args.fsbno != NULLFSBLOCK);
3250 ASSERT(*firstblock == NULLFSBLOCK ||
3251 args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
3252 (flist->xbf_low &&
3253 args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
3254 *firstblock = cur->bc_private.b.firstblock = args.fsbno;
3255 cur->bc_private.b.allocated++;
3256 ip->i_d.di_nblocks++;
3257 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
3258 abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
3259 /*
3260 * Fill in the child block.
3261 */
3262 ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
3263 INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
3264 ablock->bb_level = 0;
3265 INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
3266 INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
3267 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
3268 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
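	/*
	 * Copy only the real extents into the child block; delayed
	 * allocations (NULLSTARTBLOCK) have no on-disk form yet and
	 * are skipped.
	 */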
3269 for (ep = ifp->if_u1.if_extents, cnt = i = 0; i < nextents; i++, ep++) {
3270 if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
3271 arp->l0 = INT_GET(ep->l0, ARCH_CONVERT);
3272 arp->l1 = INT_GET(ep->l1, ARCH_CONVERT);
3273 arp++; cnt++;
3274 }
3275 }
3276 INT_SET(ablock->bb_numrecs, ARCH_CONVERT, cnt);
3277 ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork));
3278 /*
3279 * Fill in the root key and pointer.
3280 */
3281 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
3282 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
3283 INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp));
3284 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
3285 INT_SET(*pp, ARCH_CONVERT, args.fsbno);
3286 /*
3287 * Do all this logging at the end so that
3288 * the root is at the right level.
3289 */
3290 xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
3291 xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT));
3292 ASSERT(*curp == NULL);
3293 *curp = cur;
3294 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
3295 return 0;
3296}
3297
3298/*
3299 * Insert new item(s) in the extent list for inode "ip".
3300 * "count" new items are inserted at offset "idx".
3301 */
3302STATIC void
3303xfs_bmap_insert_exlist(
3304 xfs_inode_t *ip, /* incore inode pointer */
3305 xfs_extnum_t idx, /* starting index of new items */
3306 xfs_extnum_t count, /* number of inserted items */
3307 xfs_bmbt_irec_t *new, /* items to insert */
3308 int whichfork) /* data or attr fork */
3309{
3310 xfs_bmbt_rec_t *base; /* extent list base */
3311 xfs_ifork_t *ifp; /* inode fork pointer */
3312 xfs_extnum_t nextents; /* extent list size */
3313 xfs_extnum_t to; /* extent list index */
3314
3315 ifp = XFS_IFORK_PTR(ip, whichfork);
3316 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3317 xfs_iext_realloc(ip, count, whichfork);
3318 base = ifp->if_u1.if_extents;
3319 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3320 memmove(&base[idx + count], &base[idx],
3321 (nextents - (idx + count)) * sizeof(*base));
3322 for (to = idx; to < idx + count; to++, new++)
3323 xfs_bmbt_set_all(&base[to], new);
3324}
3325
3326/*
3327 * Convert a local file to an extents file.
3328 * This code cannot be used for data forks of regular files,
3329 * since the file data needs to get logged so things will stay consistent.
3330 * (The bmap-level manipulations are OK, though.)
3331 */
3332STATIC int /* error */
3333xfs_bmap_local_to_extents(
3334 xfs_trans_t *tp, /* transaction pointer */
3335 xfs_inode_t *ip, /* incore inode pointer */
3336 xfs_fsblock_t *firstblock, /* first block allocated in xaction */
3337 xfs_extlen_t total, /* total blocks needed by transaction */
3338 int *logflagsp, /* inode logging flags */
3339 int whichfork) /* data or attr fork */
3340{
3341 int error; /* error return value */
3342 int flags; /* logging flags returned */
3343#ifdef XFS_BMAP_TRACE
3344 static char fname[] = "xfs_bmap_local_to_extents";
3345#endif
3346 xfs_ifork_t *ifp; /* inode fork pointer */
3347
3348 /*
3349 * We don't want to deal with the case of keeping inode data inline yet.
3350	 * So passing in the data fork of a regular inode is invalid.
3351 */
3352 ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG &&
3353 whichfork == XFS_DATA_FORK));
3354 ifp = XFS_IFORK_PTR(ip, whichfork);
3355 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
3356 flags = 0;
3357 error = 0;
3358 if (ifp->if_bytes) {
3359 xfs_alloc_arg_t args; /* allocation arguments */
3360 xfs_buf_t *bp; /* buffer for extent list block */
3361 xfs_bmbt_rec_t *ep; /* extent list pointer */
3362
3363 args.tp = tp;
3364 args.mp = ip->i_mount;
3365 ASSERT(ifp->if_flags & XFS_IFINLINE);
3366 /*
3367 * Allocate a block. We know we need only one, since the
3368 * file currently fits in an inode.
3369 */
3370 if (*firstblock == NULLFSBLOCK) {
3371 args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
3372 args.type = XFS_ALLOCTYPE_START_BNO;
3373 } else {
3374 args.fsbno = *firstblock;
3375 args.type = XFS_ALLOCTYPE_NEAR_BNO;
3376 }
3377 args.total = total;
3378 args.mod = args.minleft = args.alignment = args.wasdel =
3379 args.isfl = args.minalignslop = 0;
3380 args.minlen = args.maxlen = args.prod = 1;
3381 if ((error = xfs_alloc_vextent(&args)))
3382 goto done;
3383 /*
3384		 * Can't fail; the space was reserved.
3385 */
3386 ASSERT(args.fsbno != NULLFSBLOCK);
3387 ASSERT(args.len == 1);
3388 *firstblock = args.fsbno;
3389 bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
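		/*
		 * Copy the inline data into the new block, log it, and
		 * switch the fork from inline data to a one-entry
		 * extent list pointing at that block.
		 */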
3390 memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data,
3391 ifp->if_bytes);
3392 xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
3393 xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
3394 xfs_iext_realloc(ip, 1, whichfork);
3395 ep = ifp->if_u1.if_extents;
3396 xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
3397 xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork);
3398 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
3399 ip->i_d.di_nblocks = 1;
3400 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
3401 XFS_TRANS_DQ_BCOUNT, 1L);
3402 flags |= XFS_ILOG_FEXT(whichfork);
3403 } else
3404 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
3405 ifp->if_flags &= ~XFS_IFINLINE;
3406 ifp->if_flags |= XFS_IFEXTENTS;
3407 XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
3408 flags |= XFS_ILOG_CORE;
3409done:
3410 *logflagsp = flags;
3411 return error;
3412}
3413
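/*
 * Search the extent list "base" for the extent containing "bno",
 * trying the hinted index "lastx" (and its successor) before falling
 * back to a binary search.  Output semantics are described at
 * xfs_bmap_search_extents() below, which calls this routine.
 */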
3414xfs_bmbt_rec_t * /* pointer to found extent entry */
3415xfs_bmap_do_search_extents(
3416 xfs_bmbt_rec_t *base, /* base of extent list */
3417 xfs_extnum_t lastx, /* last extent index used */
3418 xfs_extnum_t nextents, /* extent list size */
3419 xfs_fileoff_t bno, /* block number searched for */
3420 int *eofp, /* out: end of file found */
3421 xfs_extnum_t *lastxp, /* out: last extent index */
3422 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
3423 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
3424{
3425 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
3426 xfs_bmbt_irec_t got; /* extent list entry, decoded */
3427 int high; /* high index of binary search */
3428 int low; /* low index of binary search */
3429
3430 /*
3431 * Initialize the extent entry structure to catch access to
3432 * uninitialized br_startblock field.
3433 */
3434 got.br_startoff = 0xffa5a5a5a5a5a5a5LL;
3435 got.br_blockcount = 0xa55a5a5a5a5a5a5aLL;
3436 got.br_state = XFS_EXT_INVALID;
3437
3438#if XFS_BIG_BLKNOS
3439 got.br_startblock = 0xffffa5a5a5a5a5a5LL;
3440#else
3441 got.br_startblock = 0xffffa5a5;
3442#endif
3443
3444 if (lastx != NULLEXTNUM && lastx < nextents)
3445 ep = base + lastx;
3446 else
3447 ep = NULL;
3448 prevp->br_startoff = NULLFILEOFF;
3449 if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) &&
3450 bno < got.br_startoff +
3451 (got.br_blockcount = xfs_bmbt_get_blockcount(ep)))
3452 *eofp = 0;
3453 else if (ep && lastx < nextents - 1 &&
3454 bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) &&
3455 bno < got.br_startoff +
3456 (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) {
3457 lastx++;
3458 ep++;
3459 *eofp = 0;
3460 } else if (nextents == 0)
3461 *eofp = 1;
3462 else if (bno == 0 &&
3463 (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) {
3464 ep = base;
3465 lastx = 0;
3466 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3467 *eofp = 0;
3468 } else {
3469 /* binary search the extents array */
3470 low = 0;
3471 high = nextents - 1;
3472 while (low <= high) {
3473 XFS_STATS_INC(xs_cmp_exlist);
3474 lastx = (low + high) >> 1;
3475 ep = base + lastx;
3476 got.br_startoff = xfs_bmbt_get_startoff(ep);
3477 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3478 if (bno < got.br_startoff)
3479 high = lastx - 1;
3480 else if (bno >= got.br_startoff + got.br_blockcount)
3481 low = lastx + 1;
3482 else {
3483 got.br_startblock = xfs_bmbt_get_startblock(ep);
3484 got.br_state = xfs_bmbt_get_state(ep);
3485 *eofp = 0;
3486 *lastxp = lastx;
3487 *gotp = got;
3488 return ep;
3489 }
3490 }
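		/*
		 * No extent contains bno: the search ended in a hole.
		 * Decide whether the hole is before or after the extent
		 * found at "lastx", and record the previous extent.
		 */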
3491 if (bno >= got.br_startoff + got.br_blockcount) {
3492 lastx++;
3493 if (lastx == nextents) {
3494 *eofp = 1;
3495 got.br_startblock = xfs_bmbt_get_startblock(ep);
3496 got.br_state = xfs_bmbt_get_state(ep);
3497 *prevp = got;
3498 ep = NULL;
3499 } else {
3500 *eofp = 0;
3501 xfs_bmbt_get_all(ep, prevp);
3502 ep++;
3503 got.br_startoff = xfs_bmbt_get_startoff(ep);
3504 got.br_blockcount = xfs_bmbt_get_blockcount(ep);
3505 }
3506 } else {
3507 *eofp = 0;
3508 if (ep > base)
3509 xfs_bmbt_get_all(ep - 1, prevp);
3510 }
3511 }
3512 if (ep) {
3513 got.br_startblock = xfs_bmbt_get_startblock(ep);
3514 got.br_state = xfs_bmbt_get_state(ep);
3515 }
3516 *lastxp = lastx;
3517 *gotp = got;
3518 return ep;
3519}
3520
3521/*
3522 * Search the extents list for the inode, for the extent containing bno.
3523 * If bno lies in a hole, point to the next entry. If bno lies past eof,
3524 * *eofp will be set, and *prevp will contain the last entry (null if none).
3525 * Else, *lastxp will be set to the index of the found
3526 * entry; *gotp will contain the entry.
3527 */
3528STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */
3529xfs_bmap_search_extents(
3530 xfs_inode_t *ip, /* incore inode pointer */
3531 xfs_fileoff_t bno, /* block number searched for */
3532 int whichfork, /* data or attr fork */
3533 int *eofp, /* out: end of file found */
3534 xfs_extnum_t *lastxp, /* out: last extent index */
3535 xfs_bmbt_irec_t *gotp, /* out: extent entry found */
3536 xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */
3537{
3538 xfs_ifork_t *ifp; /* inode fork pointer */
3539 xfs_bmbt_rec_t *base; /* base of extent list */
3540 xfs_extnum_t lastx; /* last extent index used */
3541 xfs_extnum_t nextents; /* extent list size */
3542 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
3543 int rt; /* realtime flag */
3544
3545 XFS_STATS_INC(xs_look_exlist);
3546 ifp = XFS_IFORK_PTR(ip, whichfork);
3547 lastx = ifp->if_lastex;
3548 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3549 base = &ifp->if_u1.if_extents[0];
3550
3551 ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
3552 lastxp, gotp, prevp);
3553 rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME;
3554	if (!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) {
3555		cmn_err(CE_PANIC, "Access to block zero: fs: <%s> inode: %lld "
3556			"start_block : %llx start_off : %llx blkcnt : %llx "
3557			"extent-state : %x \n",
3558			(ip->i_mount)->m_fsname, (long long)ip->i_ino,
3559			gotp->br_startblock, gotp->br_startoff,
3560			gotp->br_blockcount, gotp->br_state);
3561 }
3562 return ep;
3563}
3564
3565
3566#ifdef XFS_BMAP_TRACE
3567ktrace_t *xfs_bmap_trace_buf;
3568
3569/*
3570 * Add a bmap trace buffer entry. Base routine for the others.
3571 */
3572STATIC void
3573xfs_bmap_trace_addentry(
3574 int opcode, /* operation */
3575 char *fname, /* function name */
3576 char *desc, /* operation description */
3577 xfs_inode_t *ip, /* incore inode pointer */
3578 xfs_extnum_t idx, /* index of entry(ies) */
3579 xfs_extnum_t cnt, /* count of entries, 1 or 2 */
3580 xfs_bmbt_rec_t *r1, /* first record */
3581 xfs_bmbt_rec_t *r2, /* second record or null */
3582 int whichfork) /* data or attr fork */
3583{
3584 xfs_bmbt_rec_t tr2;
3585
3586 ASSERT(cnt == 1 || cnt == 2);
3587 ASSERT(r1 != NULL);
3588 if (cnt == 1) {
3589 ASSERT(r2 == NULL);
3590 r2 = &tr2;
3591 memset(&tr2, 0, sizeof(tr2));
3592 } else
3593 ASSERT(r2 != NULL);
3594 ktrace_enter(xfs_bmap_trace_buf,
3595 (void *)(__psint_t)(opcode | (whichfork << 16)),
3596 (void *)fname, (void *)desc, (void *)ip,
3597 (void *)(__psint_t)idx,
3598 (void *)(__psint_t)cnt,
3599 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3600 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3601 (void *)(__psunsigned_t)(r1->l0 >> 32),
3602 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3603 (void *)(__psunsigned_t)(r1->l1 >> 32),
3604 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3605 (void *)(__psunsigned_t)(r2->l0 >> 32),
3606 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3607 (void *)(__psunsigned_t)(r2->l1 >> 32),
3608 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3609 );
3610 ASSERT(ip->i_xtrace);
3611 ktrace_enter(ip->i_xtrace,
3612 (void *)(__psint_t)(opcode | (whichfork << 16)),
3613 (void *)fname, (void *)desc, (void *)ip,
3614 (void *)(__psint_t)idx,
3615 (void *)(__psint_t)cnt,
3616 (void *)(__psunsigned_t)(ip->i_ino >> 32),
3617 (void *)(__psunsigned_t)(unsigned)ip->i_ino,
3618 (void *)(__psunsigned_t)(r1->l0 >> 32),
3619 (void *)(__psunsigned_t)(unsigned)(r1->l0),
3620 (void *)(__psunsigned_t)(r1->l1 >> 32),
3621 (void *)(__psunsigned_t)(unsigned)(r1->l1),
3622 (void *)(__psunsigned_t)(r2->l0 >> 32),
3623 (void *)(__psunsigned_t)(unsigned)(r2->l0),
3624 (void *)(__psunsigned_t)(r2->l1 >> 32),
3625 (void *)(__psunsigned_t)(unsigned)(r2->l1)
3626 );
3627}
3628
3629/*
3630 * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
3631 */
3632STATIC void
3633xfs_bmap_trace_delete(
3634 char *fname, /* function name */
3635 char *desc, /* operation description */
3636 xfs_inode_t *ip, /* incore inode pointer */
3637 xfs_extnum_t idx, /* index of entry(entries) deleted */
3638 xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */
3639 int whichfork) /* data or attr fork */
3640{
3641 xfs_ifork_t *ifp; /* inode fork pointer */
3642
3643 ifp = XFS_IFORK_PTR(ip, whichfork);
3644 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
3645 cnt, &ifp->if_u1.if_extents[idx],
3646 cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
3647 whichfork);
3648}
3649
3650/*
3651 * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
3652 * reading in the extents list from the disk (in the btree).
3653 */
3654STATIC void
3655xfs_bmap_trace_insert(
3656 char *fname, /* function name */
3657 char *desc, /* operation description */
3658 xfs_inode_t *ip, /* incore inode pointer */
3659 xfs_extnum_t idx, /* index of entry(entries) inserted */
3660 xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */
3661 xfs_bmbt_irec_t *r1, /* inserted record 1 */
3662 xfs_bmbt_irec_t *r2, /* inserted record 2 or null */
3663 int whichfork) /* data or attr fork */
3664{
3665 xfs_bmbt_rec_t tr1; /* compressed record 1 */
3666 xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */
3667
3668 xfs_bmbt_set_all(&tr1, r1);
3669 if (cnt == 2) {
3670 ASSERT(r2 != NULL);
3671 xfs_bmbt_set_all(&tr2, r2);
3672 } else {
3673 ASSERT(cnt == 1);
3674 ASSERT(r2 == NULL);
3675 }
3676 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
3677 cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
3678}
3679
3680/*
3681 * Add bmap trace entry after updating an extent list entry in place.
3682 */
3683STATIC void
3684xfs_bmap_trace_post_update(
3685 char *fname, /* function name */
3686 char *desc, /* operation description */
3687 xfs_inode_t *ip, /* incore inode pointer */
3688 xfs_extnum_t idx, /* index of entry updated */
3689 int whichfork) /* data or attr fork */
3690{
3691 xfs_ifork_t *ifp; /* inode fork pointer */
3692
3693 ifp = XFS_IFORK_PTR(ip, whichfork);
3694 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
3695 1, &ifp->if_u1.if_extents[idx], NULL, whichfork);
3696}
3697
3698/*
3699 * Add bmap trace entry prior to updating an extent list entry in place.
3700 */
3701STATIC void
3702xfs_bmap_trace_pre_update(
3703 char *fname, /* function name */
3704 char *desc, /* operation description */
3705 xfs_inode_t *ip, /* incore inode pointer */
3706 xfs_extnum_t idx, /* index of entry to be updated */
3707 int whichfork) /* data or attr fork */
3708{
3709 xfs_ifork_t *ifp; /* inode fork pointer */
3710
3711 ifp = XFS_IFORK_PTR(ip, whichfork);
3712 xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
3713 &ifp->if_u1.if_extents[idx], NULL, whichfork);
3714}
3715#endif /* XFS_BMAP_TRACE */
3716
3717/*
3718 * Compute the worst-case number of indirect blocks that will be used
3719 * for ip's delayed extent of length "len".
3720 */
3721STATIC xfs_filblks_t
3722xfs_bmap_worst_indlen(
3723 xfs_inode_t *ip, /* incore inode pointer */
3724 xfs_filblks_t len) /* delayed extent length */
3725{
3726 int level; /* btree level number */
3727 int maxrecs; /* maximum record count at this level */
3728 xfs_mount_t *mp; /* mount structure */
3729 xfs_filblks_t rval; /* return value */
3730
3731 mp = ip->i_mount;
3732 maxrecs = mp->m_bmap_dmxr[0];
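	/*
	 * Worst case, the delayed extent fragments into "len" one-block
	 * extents, so start with one leaf record per block and divide
	 * by the fanout at each level of the tree.
	 */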
3733 for (level = 0, rval = 0;
3734 level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
3735 level++) {
3736 len += maxrecs - 1;
3737 do_div(len, maxrecs);
3738 rval += len;
3739 if (len == 1)
3740 return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
3741 level - 1;
3742 if (level == 0)
3743 maxrecs = mp->m_bmap_dmxr[1];
3744 }
3745 return rval;
3746}
3747
3748#if defined(XFS_RW_TRACE)
3749STATIC void
3750xfs_bunmap_trace(
3751 xfs_inode_t *ip,
3752 xfs_fileoff_t bno,
3753 xfs_filblks_t len,
3754 int flags,
3755 inst_t *ra)
3756{
3757 if (ip->i_rwtrace == NULL)
3758 return;
3759 ktrace_enter(ip->i_rwtrace,
3760 (void *)(__psint_t)XFS_BUNMAPI,
3761 (void *)ip,
3762 (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
3763 (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
3764 (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
3765 (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
3766 (void *)(__psint_t)len,
3767 (void *)(__psint_t)flags,
3768 (void *)(unsigned long)current_cpu(),
3769 (void *)ra,
3770 (void *)0,
3771 (void *)0,
3772 (void *)0,
3773 (void *)0,
3774 (void *)0,
3775 (void *)0);
3776}
3777#endif
3778
3779/*
3780 * Convert inode from non-attributed to attributed.
3781 * Must not be in a transaction, and ip must not be locked.
3782 */
3783int /* error code */
3784xfs_bmap_add_attrfork(
3785 xfs_inode_t *ip, /* incore inode pointer */
3786	int			rsvd)		/* OK to allocate reserved blocks in trans */
3787{
3788 int blks; /* space reservation */
3789 int committed; /* xaction was committed */
3790 int error; /* error return value */
3791 xfs_fsblock_t firstblock; /* 1st block/ag allocated */
3792 xfs_bmap_free_t flist; /* freed extent list */
3793 int logflags; /* logging flags */
3794 xfs_mount_t *mp; /* mount structure */
3795 unsigned long s; /* spinlock spl value */
3796 xfs_trans_t *tp; /* transaction pointer */
3797
3798 ASSERT(ip->i_df.if_ext_max ==
3799 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3800 if (XFS_IFORK_Q(ip))
3801 return 0;
3802 mp = ip->i_mount;
3803 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
3804 tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
3805 blks = XFS_ADDAFORK_SPACE_RES(mp);
3806 if (rsvd)
3807 tp->t_flags |= XFS_TRANS_RESERVE;
3808 if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
3809 XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
3810 goto error0;
3811 xfs_ilock(ip, XFS_ILOCK_EXCL);
3812 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
3813 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
3814 XFS_QMOPT_RES_REGBLKS);
3815 if (error) {
3816 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3817 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
3818 return error;
3819 }
3820 if (XFS_IFORK_Q(ip))
3821 goto error1;
3822 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
3823 /*
3824 * For inodes coming from pre-6.2 filesystems.
3825 */
3826 ASSERT(ip->i_d.di_aformat == 0);
3827 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
3828 }
3829 ASSERT(ip->i_d.di_anextents == 0);
3830 VN_HOLD(XFS_ITOV(ip));
3831 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3832 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
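	/*
	 * Choose where the attribute fork starts in the inode literal
	 * area; di_forkoff is stored in units of 8 bytes.
	 */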
3833 switch (ip->i_d.di_format) {
3834 case XFS_DINODE_FMT_DEV:
3835 ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
3836 break;
3837 case XFS_DINODE_FMT_UUID:
3838 ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
3839 break;
3840 case XFS_DINODE_FMT_LOCAL:
3841 case XFS_DINODE_FMT_EXTENTS:
3842 case XFS_DINODE_FMT_BTREE:
3843 ip->i_d.di_forkoff = mp->m_attroffset >> 3;
3844 break;
3845 default:
3846 ASSERT(0);
3847 error = XFS_ERROR(EINVAL);
3848 goto error1;
3849 }
3850 ip->i_df.if_ext_max =
3851 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3852 ASSERT(ip->i_afp == NULL);
3853 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3854 ip->i_afp->if_ext_max =
3855 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3856 ip->i_afp->if_flags = XFS_IFEXTENTS;
3857 logflags = 0;
3858 XFS_BMAP_INIT(&flist, &firstblock);
3859 switch (ip->i_d.di_format) {
3860 case XFS_DINODE_FMT_LOCAL:
3861 error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
3862 &logflags);
3863 break;
3864 case XFS_DINODE_FMT_EXTENTS:
3865 error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
3866 &flist, &logflags);
3867 break;
3868 case XFS_DINODE_FMT_BTREE:
3869 error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
3870 &logflags);
3871 break;
3872 default:
3873 error = 0;
3874 break;
3875 }
3876 if (logflags)
3877 xfs_trans_log_inode(tp, ip, logflags);
3878 if (error)
3879 goto error2;
3880 if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
3881 s = XFS_SB_LOCK(mp);
3882 if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
3883 XFS_SB_VERSION_ADDATTR(&mp->m_sb);
3884 XFS_SB_UNLOCK(mp, s);
3885 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
3886 } else
3887 XFS_SB_UNLOCK(mp, s);
3888 }
3889 if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
3890 goto error2;
3891 error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
3892 ASSERT(ip->i_df.if_ext_max ==
3893 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3894 return error;
3895error2:
3896 xfs_bmap_cancel(&flist);
3897error1:
3898 ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
3899 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3900error0:
3901 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3902 ASSERT(ip->i_df.if_ext_max ==
3903 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3904 return error;
3905}
3906
3907/*
3908 * Add the extent to the list of extents to be freed at transaction end.
3909 * The list is maintained sorted (by block number).
3910 */
3911/* ARGSUSED */
3912void
3913xfs_bmap_add_free(
3914 xfs_fsblock_t bno, /* fs block number of extent */
3915 xfs_filblks_t len, /* length of extent */
3916 xfs_bmap_free_t *flist, /* list of extents */
3917 xfs_mount_t *mp) /* mount point structure */
3918{
3919 xfs_bmap_free_item_t *cur; /* current (next) element */
3920 xfs_bmap_free_item_t *new; /* new element */
3921 xfs_bmap_free_item_t *prev; /* previous element */
3922#ifdef DEBUG
3923 xfs_agnumber_t agno;
3924 xfs_agblock_t agbno;
3925
3926 ASSERT(bno != NULLFSBLOCK);
3927 ASSERT(len > 0);
3928 ASSERT(len <= MAXEXTLEN);
3929 ASSERT(!ISNULLSTARTBLOCK(bno));
3930 agno = XFS_FSB_TO_AGNO(mp, bno);
3931 agbno = XFS_FSB_TO_AGBNO(mp, bno);
3932 ASSERT(agno < mp->m_sb.sb_agcount);
3933 ASSERT(agbno < mp->m_sb.sb_agblocks);
3934 ASSERT(len < mp->m_sb.sb_agblocks);
3935 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
3936#endif
3937 ASSERT(xfs_bmap_free_item_zone != NULL);
3938 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
3939 new->xbfi_startblock = bno;
3940 new->xbfi_blockcount = (xfs_extlen_t)len;
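	/*
	 * Walk the list to the first entry at or beyond "bno" and link
	 * the new item in ahead of it, keeping the list sorted.
	 */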
3941 for (prev = NULL, cur = flist->xbf_first;
3942 cur != NULL;
3943 prev = cur, cur = cur->xbfi_next) {
3944 if (cur->xbfi_startblock >= bno)
3945 break;
3946 }
3947 if (prev)
3948 prev->xbfi_next = new;
3949 else
3950 flist->xbf_first = new;
3951 new->xbfi_next = cur;
3952 flist->xbf_count++;
3953}
3954
3955/*
3956 * Compute and fill in the value of the maximum depth of a bmap btree
3957 * in this filesystem. Done once, during mount.
3958 */
3959void
3960xfs_bmap_compute_maxlevels(
3961 xfs_mount_t *mp, /* file system mount structure */
3962 int whichfork) /* data or attr fork */
3963{
3964 int level; /* btree level */
3965 uint maxblocks; /* max blocks at this level */
3966 uint maxleafents; /* max leaf entries possible */
3967 int maxrootrecs; /* max records in root block */
3968 int minleafrecs; /* min records in leaf block */
3969 int minnoderecs; /* min records in node block */
3970 int sz; /* root block size */
3971
3972 /*
3973 * The maximum number of extents in a file, hence the maximum
3974 * number of leaf entries, is controlled by the type of di_nextents
3975 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
3976 * (a signed 16-bit number, xfs_aextnum_t).
3977 */
3978 maxleafents = (whichfork == XFS_DATA_FORK) ? MAXEXTNUM : MAXAEXTNUM;
3979 minleafrecs = mp->m_bmap_dmnr[0];
3980 minnoderecs = mp->m_bmap_dmnr[1];
3981 sz = (whichfork == XFS_DATA_FORK) ?
3982 mp->m_attroffset :
3983 mp->m_sb.sb_inodesize - mp->m_attroffset;
3984 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
3985 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
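	/*
	 * Walk up the tree, reducing the block count by the minimum
	 * node fanout at each level, until everything fits in the
	 * root block held in the inode.
	 */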
3986 for (level = 1; maxblocks > 1; level++) {
3987 if (maxblocks <= maxrootrecs)
3988 maxblocks = 1;
3989 else
3990 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
3991 }
3992 mp->m_bm_maxlevels[whichfork] = level;
3993}
3994
3995/*
3996 * Routine to be called at transaction's end by the xfs_bmapi()/xfs_bunmapi()
3997 * caller.  Frees all the extents that need freeing, which must be done
3998 * last due to locking considerations. We never free any extents in
3999 * the first transaction. This is to allow the caller to make the first
4000 * transaction a synchronous one so that the pointers to the data being
4001 * broken in this transaction will be permanent before the data is actually
4002 * freed. This is necessary to prevent blocks from being reallocated
4003 * and written to before the free and reallocation are actually permanent.
4004 * We do not just make the first transaction synchronous here, because
4005 * there are more efficient ways to gain the same protection in some cases
4006 * (see the file truncation code).
4007 *
4008 * Returns 1 in the "committed" parameter if the given transaction was
4009 * committed and a new one started, and 0 otherwise.
4010 */
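/*
 * Typical caller pattern, sketched with error handling abbreviated
 * (xfs_bmap_add_attrfork() above shows a real instance of the
 * finish/cancel half):
 *
 *	XFS_BMAP_INIT(&flist, &firstblock);
 *	error = xfs_bmapi(tp, ip, bno, len, XFS_BMAPI_WRITE,
 *			  &firstblock, total, mval, &nmap, &flist);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
 *	if (error)
 *		xfs_bmap_cancel(&flist);
 */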
4011/*ARGSUSED*/
4012int /* error */
4013xfs_bmap_finish(
4014 xfs_trans_t **tp, /* transaction pointer addr */
4015 xfs_bmap_free_t *flist, /* i/o: list extents to free */
4016 xfs_fsblock_t firstblock, /* controlled ag for allocs */
4017 int *committed) /* xact committed or not */
4018{
4019 xfs_efd_log_item_t *efd; /* extent free data */
4020 xfs_efi_log_item_t *efi; /* extent free intention */
4021 int error; /* error return value */
4022 xfs_bmap_free_item_t *free; /* free extent list item */
4023 unsigned int logres; /* new log reservation */
4024 unsigned int logcount; /* new log count */
4025 xfs_mount_t *mp; /* filesystem mount structure */
4026 xfs_bmap_free_item_t *next; /* next item on free list */
4027 xfs_trans_t *ntp; /* new transaction pointer */
4028
4029 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
4030 if (flist->xbf_count == 0) {
4031 *committed = 0;
4032 return 0;
4033 }
4034 ntp = *tp;
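	/*
	 * Log an extent-free intent (EFI) covering every extent on the
	 * list, so log recovery can redo the frees if we crash after
	 * the commit below.
	 */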
4035 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
4036 for (free = flist->xbf_first; free; free = free->xbfi_next)
4037 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
4038 free->xbfi_blockcount);
4039 logres = ntp->t_log_res;
4040 logcount = ntp->t_log_count;
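	/*
	 * Duplicate the transaction before committing, so the permanent
	 * log reservation carries over to the new transaction, then
	 * commit the old one.
	 */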
4041 ntp = xfs_trans_dup(*tp);
4042 error = xfs_trans_commit(*tp, 0, NULL);
4043 *tp = ntp;
4044 *committed = 1;
4045 /*
4046 * We have a new transaction, so we should return committed=1,
4047 * even though we're returning an error.
4048 */
4049 if (error) {
4050 return error;
4051 }
4052 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
4053 logcount)))
4054 return error;
4055 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
4056 for (free = flist->xbf_first; free != NULL; free = next) {
4057 next = free->xbfi_next;
4058 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
4059 free->xbfi_blockcount))) {
4060 /*
4061 * The bmap free list will be cleaned up at a
4062 * higher level. The EFI will be canceled when
4063 * this transaction is aborted.
4064 * Need to force shutdown here to make sure it
4065 * happens, since this transaction may not be
4066 * dirty yet.
4067 */
4068 mp = ntp->t_mountp;
4069 if (!XFS_FORCED_SHUTDOWN(mp))
4070 xfs_force_shutdown(mp,
4071 (error == EFSCORRUPTED) ?
4072 XFS_CORRUPT_INCORE :
4073 XFS_METADATA_IO_ERROR);
4074 return error;
4075 }
4076 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
4077 free->xbfi_blockcount);
4078 xfs_bmap_del_free(flist, NULL, free);
4079 }
4080 return 0;
4081}
4082
4083/*
4084 * Free up any items left in the list.
4085 */
4086void
4087xfs_bmap_cancel(
4088 xfs_bmap_free_t *flist) /* list of bmap_free_items */
4089{
4090 xfs_bmap_free_item_t *free; /* free list item */
4091 xfs_bmap_free_item_t *next;
4092
4093 if (flist->xbf_count == 0)
4094 return;
4095 ASSERT(flist->xbf_first != NULL);
4096 for (free = flist->xbf_first; free; free = next) {
4097 next = free->xbfi_next;
4098 xfs_bmap_del_free(flist, NULL, free);
4099 }
4100 ASSERT(flist->xbf_count == 0);
4101}
4102
4103/*
4104 * Returns the file-relative block number of the first unused block(s)
4105 * in the file with at least "len" logically contiguous blocks free.
4106 * This is the lowest-address hole if the file has holes, else the first block
4107 * past the end of file.
4108 * Return 0 if the file is currently local (in-inode).
4109 * Returns 0 in *first_unused if the file is currently local (in-inode).
4110int /* error */
4111xfs_bmap_first_unused(
4112 xfs_trans_t *tp, /* transaction pointer */
4113 xfs_inode_t *ip, /* incore inode */
4114 xfs_extlen_t len, /* size of hole to find */
4115 xfs_fileoff_t *first_unused, /* unused block */
4116 int whichfork) /* data or attr fork */
4117{
4118 xfs_bmbt_rec_t *base; /* base of extent array */
4119 xfs_bmbt_rec_t *ep; /* pointer to an extent entry */
4120 int error; /* error return value */
4121 xfs_ifork_t *ifp; /* inode fork pointer */
4122 xfs_fileoff_t lastaddr; /* last block number seen */
4123 xfs_fileoff_t lowest; /* lowest useful block */
4124 xfs_fileoff_t max; /* starting useful block */
4125 xfs_fileoff_t off; /* offset for this block */
4126 xfs_extnum_t nextents; /* number of extent entries */
4127
4128 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
4129 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
4130 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
4131 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4132 *first_unused = 0;
4133 return 0;
4134 }
4135 ifp = XFS_IFORK_PTR(ip, whichfork);
4136 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4137 (error = xfs_iread_extents(tp, ip, whichfork)))
4138 return error;
4139 lowest = *first_unused;
4140 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4141 base = &ifp->if_u1.if_extents[0];
4142 for (lastaddr = 0, max = lowest, ep = base;
4143 ep < &base[nextents];
4144 ep++) {
4145 off = xfs_bmbt_get_startoff(ep);
4146 /*
4147 * See if the hole before this extent will work.
4148 */
4149 if (off >= lowest + len && off - max >= len) {
4150 *first_unused = max;
4151 return 0;
4152 }
4153 lastaddr = off + xfs_bmbt_get_blockcount(ep);
4154 max = XFS_FILEOFF_MAX(lastaddr, lowest);
4155 }
4156 *first_unused = max;
4157 return 0;
4158}
4159
4160/*
4161 * Returns the file-relative block number of the last block + 1 before
4162 * last_block (input value) in the file.
4163 * This is not based on i_size; it is based on the extent list.
4164 * Returns 0 for local files, as they do not have an extent list.
4165 */
4166int /* error */
4167xfs_bmap_last_before(
4168 xfs_trans_t *tp, /* transaction pointer */
4169 xfs_inode_t *ip, /* incore inode */
4170 xfs_fileoff_t *last_block, /* last block */
4171 int whichfork) /* data or attr fork */
4172{
4173 xfs_fileoff_t bno; /* input file offset */
4174 int eof; /* hit end of file */
4175 xfs_bmbt_rec_t *ep; /* pointer to last extent */
4176 int error; /* error return value */
4177 xfs_bmbt_irec_t got; /* current extent value */
4178 xfs_ifork_t *ifp; /* inode fork pointer */
4179 xfs_extnum_t lastx; /* last extent used */
4180 xfs_bmbt_irec_t prev; /* previous extent value */
4181
4182 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4183 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4184 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4185 return XFS_ERROR(EIO);
4186 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4187 *last_block = 0;
4188 return 0;
4189 }
4190 ifp = XFS_IFORK_PTR(ip, whichfork);
4191 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4192 (error = xfs_iread_extents(tp, ip, whichfork)))
4193 return error;
4194 bno = *last_block - 1;
4195 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4196 &prev);
4197 if (eof || xfs_bmbt_get_startoff(ep) > bno) {
4198 if (prev.br_startoff == NULLFILEOFF)
4199 *last_block = 0;
4200 else
4201 *last_block = prev.br_startoff + prev.br_blockcount;
4202 }
4203 /*
4204 * Otherwise *last_block is already the right answer.
4205 */
4206 return 0;
4207}
4208
4209/*
4210 * Returns the file-relative block number of the first block past eof in
4211 * the file.  This is not based on i_size; it is based on the extent list.
4212 * Returns 0 for local files, as they do not have an extent list.
4213 */
4214int /* error */
4215xfs_bmap_last_offset(
4216 xfs_trans_t *tp, /* transaction pointer */
4217 xfs_inode_t *ip, /* incore inode */
4218 xfs_fileoff_t *last_block, /* last block */
4219 int whichfork) /* data or attr fork */
4220{
4221 xfs_bmbt_rec_t *base; /* base of extent array */
4222 xfs_bmbt_rec_t *ep; /* pointer to last extent */
4223 int error; /* error return value */
4224 xfs_ifork_t *ifp; /* inode fork pointer */
4225 xfs_extnum_t nextents; /* number of extent entries */
4226
4227 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4228 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4229 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
4230 return XFS_ERROR(EIO);
4231 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4232 *last_block = 0;
4233 return 0;
4234 }
4235 ifp = XFS_IFORK_PTR(ip, whichfork);
4236 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4237 (error = xfs_iread_extents(tp, ip, whichfork)))
4238 return error;
4239 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4240 if (!nextents) {
4241 *last_block = 0;
4242 return 0;
4243 }
4244 base = &ifp->if_u1.if_extents[0];
4245 ASSERT(base != NULL);
4246 ep = &base[nextents - 1];
4247 *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
4248 return 0;
4249}
4250
4251/*
4252 * Returns whether the selected fork of the inode has exactly one
4253 * block or not. For the data fork we check this matches di_size,
4254 * implying the file's range is 0..bsize-1.
4255 */
4256int /* 1=>1 block, 0=>otherwise */
4257xfs_bmap_one_block(
4258 xfs_inode_t *ip, /* incore inode */
4259 int whichfork) /* data or attr fork */
4260{
4261 xfs_bmbt_rec_t *ep; /* ptr to fork's extent */
4262 xfs_ifork_t *ifp; /* inode fork pointer */
4263 int rval; /* return value */
4264 xfs_bmbt_irec_t s; /* internal version of extent */
4265
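	/*
	 * In non-DEBUG builds the data-fork answer comes straight from
	 * di_size; the extent check below then runs only for the attr
	 * fork.  DEBUG builds run it for both and cross-check di_size.
	 */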
4266#ifndef DEBUG
4267 if (whichfork == XFS_DATA_FORK)
4268 return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
4269#endif /* !DEBUG */
4270 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4271 return 0;
4272 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
4273 return 0;
4274 ifp = XFS_IFORK_PTR(ip, whichfork);
4275 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
4276 ep = ifp->if_u1.if_extents;
4277 xfs_bmbt_get_all(ep, &s);
4278 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4279 if (rval && whichfork == XFS_DATA_FORK)
4280 ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4281 return rval;
4282}
4283
4284/*
4285 * Read in the extents to if_extents.
4286 * All inode fields are set up by the caller; we just traverse the btree
4287 * and copy the records in. If the file system cannot contain unwritten
4288 * extents, the records are checked for no "state" flags.
4289 */
4290int /* error */
4291xfs_bmap_read_extents(
4292 xfs_trans_t *tp, /* transaction pointer */
4293 xfs_inode_t *ip, /* incore inode */
4294 int whichfork) /* data or attr fork */
4295{
4296 xfs_bmbt_block_t *block; /* current btree block */
4297 xfs_fsblock_t bno; /* block # of "block" */
4298 xfs_buf_t *bp; /* buffer for "block" */
4299 int error; /* error return value */
4300 xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */
4301#ifdef XFS_BMAP_TRACE
4302 static char fname[] = "xfs_bmap_read_extents";
4303#endif
4304 xfs_extnum_t i, j; /* index into the extents list */
4305 xfs_ifork_t *ifp; /* fork structure */
4306 int level; /* btree level, for checking */
4307 xfs_mount_t *mp; /* file system mount structure */
4308 xfs_bmbt_ptr_t *pp; /* pointer to block address */
4309 /* REFERENCED */
4310 xfs_extnum_t room; /* number of entries there's room for */
4311 xfs_bmbt_rec_t *trp; /* target record pointer */
4312
4313 bno = NULLFSBLOCK;
4314 mp = ip->i_mount;
4315 ifp = XFS_IFORK_PTR(ip, whichfork);
4316 exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
4317 XFS_EXTFMT_INODE(ip);
4318 block = ifp->if_broot;
4319 /*
4320	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
4321 */
4322 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
4323 level = INT_GET(block->bb_level, ARCH_CONVERT);
4324 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
4325 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
4326 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
4327 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
4328 bno = INT_GET(*pp, ARCH_CONVERT);
4329 /*
4330 * Go down the tree until leaf level is reached, following the first
4331 * pointer (leftmost) at each level.
4332 */
4333 while (level-- > 0) {
4334 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4335 XFS_BMAP_BTREE_REF)))
4336 return error;
4337 block = XFS_BUF_TO_BMBT_BLOCK(bp);
4338 XFS_WANT_CORRUPTED_GOTO(
4339 XFS_BMAP_SANITY_CHECK(mp, block, level),
4340 error0);
4341 if (level == 0)
4342 break;
4343 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
4344 1, mp->m_bmap_dmxr[1]);
4345 XFS_WANT_CORRUPTED_GOTO(
4346 XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
4347 error0);
4348 bno = INT_GET(*pp, ARCH_CONVERT);
4349 xfs_trans_brelse(tp, bp);
4350 }
4351 /*
4352 * Here with bp and block set to the leftmost leaf node in the tree.
4353 */
4354 room = ifp->if_bytes / (uint)sizeof(*trp);
4355 trp = ifp->if_u1.if_extents;
4356 i = 0;
4357 /*
4358 * Loop over all leaf nodes. Copy information to the extent list.
4359 */
4360 for (;;) {
4361 xfs_bmbt_rec_t *frp, *temp;
4362 xfs_fsblock_t nextbno;
4363 xfs_extnum_t num_recs;
4364
4365
4366 num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
4367 if (unlikely(i + num_recs > room)) {
4368 ASSERT(i + num_recs <= room);
4369 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
4370 "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.",
4371 (unsigned long long) ip->i_ino);
4372 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)",
4373 XFS_ERRLEVEL_LOW,
4374 ip->i_mount);
4375 goto error0;
4376 }
4377 XFS_WANT_CORRUPTED_GOTO(
4378 XFS_BMAP_SANITY_CHECK(mp, block, 0),
4379 error0);
4380 /*
4381 * Read-ahead the next leaf block, if any.
4382 */
4383 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
4384 if (nextbno != NULLFSBLOCK)
4385 xfs_btree_reada_bufl(mp, nextbno, 1);
4386 /*
4387 * Copy records into the extent list.
4388 */
4389 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
4390 block, 1, mp->m_bmap_dmxr[0]);
4391 temp = trp;
4392 for (j = 0; j < num_recs; j++, frp++, trp++) {
4393 trp->l0 = INT_GET(frp->l0, ARCH_CONVERT);
4394 trp->l1 = INT_GET(frp->l1, ARCH_CONVERT);
4395 }
4396 if (exntf == XFS_EXTFMT_NOSTATE) {
4397 /*
4398 * Check all attribute bmap btree records and
4399 * any "older" data bmap btree records for a
4400 * set bit in the "extent flag" position.
4401 */
4402 if (unlikely(xfs_check_nostate_extents(temp, num_recs))) {
4403 XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
4404 XFS_ERRLEVEL_LOW,
4405 ip->i_mount);
4406 goto error0;
4407 }
4408 }
4409 i += num_recs;
4410 xfs_trans_brelse(tp, bp);
4411 bno = nextbno;
4412 /*
4413 * If we've reached the end, stop.
4414 */
4415 if (bno == NULLFSBLOCK)
4416 break;
4417 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4418 XFS_BMAP_BTREE_REF)))
4419 return error;
4420 block = XFS_BUF_TO_BMBT_BLOCK(bp);
4421 }
4422 ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
4423 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
4424 xfs_bmap_trace_exlist(fname, ip, i, whichfork);
4425 return 0;
4426error0:
4427 xfs_trans_brelse(tp, bp);
4428 return XFS_ERROR(EFSCORRUPTED);
4429}
4430
4431#ifdef XFS_BMAP_TRACE
4432/*
4433 * Add bmap trace insert entries for all the contents of the extent list.
4434 */
4435void
4436xfs_bmap_trace_exlist(
4437 char *fname, /* function name */
4438 xfs_inode_t *ip, /* incore inode pointer */
4439 xfs_extnum_t cnt, /* count of entries in the list */
4440 int whichfork) /* data or attr fork */
4441{
4442 xfs_bmbt_rec_t *base; /* base of extent list */
4443 xfs_bmbt_rec_t *ep; /* current entry in extent list */
4444 xfs_extnum_t idx; /* extent list entry number */
4445 xfs_ifork_t *ifp; /* inode fork pointer */
4446 xfs_bmbt_irec_t s; /* extent list record */
4447
4448 ifp = XFS_IFORK_PTR(ip, whichfork);
4449 ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
4450 base = ifp->if_u1.if_extents;
4451 for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
4452 xfs_bmbt_get_all(ep, &s);
4453 xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
4454 whichfork);
4455 }
4456}
4457#endif
4458
4459#ifdef DEBUG
4460/*
4461 * Validate that the bmbt_irecs being returned from bmapi are valid
4462 * given the caller's original parameters.  Specifically check the
4463 * ranges of the returned irecs to ensure that they only extend beyond
4464 * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
4465 */
4466STATIC void
4467xfs_bmap_validate_ret(
4468 xfs_fileoff_t bno,
4469 xfs_filblks_t len,
4470 int flags,
4471 xfs_bmbt_irec_t *mval,
4472 int nmap,
4473 int ret_nmap)
4474{
4475 int i; /* index to map values */
4476
4477 ASSERT(ret_nmap <= nmap);
4478
4479 for (i = 0; i < ret_nmap; i++) {
4480 ASSERT(mval[i].br_blockcount > 0);
4481 if (!(flags & XFS_BMAPI_ENTIRE)) {
4482 ASSERT(mval[i].br_startoff >= bno);
4483 ASSERT(mval[i].br_blockcount <= len);
4484 ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
4485 bno + len);
4486 } else {
4487 ASSERT(mval[i].br_startoff < bno + len);
4488 ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
4489 bno);
4490 }
4491 ASSERT(i == 0 ||
4492 mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
4493 mval[i].br_startoff);
4494 if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
4495 ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
4496 mval[i].br_startblock != HOLESTARTBLOCK);
4497 ASSERT(mval[i].br_state == XFS_EXT_NORM ||
4498 mval[i].br_state == XFS_EXT_UNWRITTEN);
4499 }
4500}
4501#endif /* DEBUG */
4502
4503
4504/*
4505 * Map file blocks to filesystem blocks.
4506 * File range is given by the bno/len pair.
4507 * Adds blocks to the file if this is a write ("flags & XFS_BMAPI_WRITE"
4508 * set) into a hole or past eof.
4509 * Only allocates blocks from a single allocation group,
4510 * to avoid locking problems.
4511 * The returned value in "firstblock" from the first call in a transaction
4512 * must be remembered and presented to subsequent calls in "firstblock".
4513 * An upper bound for the number of blocks to be allocated is supplied to
4514 * the first call in "total"; if no allocation group has that many free
4515 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
4516 */
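/*
 * A minimal read-only mapping call might look like this (a sketch
 * with hypothetical locals; no transaction, no allocation):
 *
 *	xfs_bmbt_irec_t	imap;
 *	int		nimaps = 1;
 *
 *	error = xfs_bmapi(NULL, ip, offset_fsb, count_fsb, 0, NULL, 0,
 *			  &imap, &nimaps, NULL);
 */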
4517int /* error */
4518xfs_bmapi(
4519 xfs_trans_t *tp, /* transaction pointer */
4520 xfs_inode_t *ip, /* incore inode */
4521 xfs_fileoff_t bno, /* starting file offs. mapped */
4522 xfs_filblks_t len, /* length to map in file */
4523 int flags, /* XFS_BMAPI_... */
4524 xfs_fsblock_t *firstblock, /* first allocated block
4525 controls a.g. for allocs */
4526 xfs_extlen_t total, /* total blocks needed */
4527 xfs_bmbt_irec_t *mval, /* output: map values */
4528 int *nmap, /* i/o: mval size/count */
4529 xfs_bmap_free_t *flist) /* i/o: list extents to free */
4530{
4531 xfs_fsblock_t abno; /* allocated block number */
4532 xfs_extlen_t alen; /* allocated extent length */
4533 xfs_fileoff_t aoff; /* allocated file offset */
4534 xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */
4535 char contig; /* allocation must be one extent */
4536 xfs_btree_cur_t *cur; /* bmap btree cursor */
4537 char delay; /* this request is for delayed alloc */
4538 xfs_fileoff_t end; /* end of mapped file region */
4539 int eof; /* we've hit the end of extent list */
4540 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
4541 int error; /* error return */
4542 char exact; /* don't do all of wasdelayed extent */
4543 xfs_bmbt_irec_t got; /* current extent list record */
4544 xfs_ifork_t *ifp; /* inode fork pointer */
4545 xfs_extlen_t indlen; /* indirect blocks length */
4546 char inhole; /* current location is hole in file */
4547 xfs_extnum_t lastx; /* last useful extent number */
4548 int logflags; /* flags for transaction logging */
4549 xfs_extlen_t minleft; /* min blocks left after allocation */
4550 xfs_extlen_t minlen; /* min allocation size */
4551 xfs_mount_t *mp; /* xfs mount structure */
4552 int n; /* current extent index */
4553	int		nallocs;	/* number of extents alloc'd */
4554 xfs_extnum_t nextents; /* number of extents in file */
4555 xfs_fileoff_t obno; /* old block number (offset) */
4556 xfs_bmbt_irec_t prev; /* previous extent list record */
4557 char stateless; /* ignore state flag set */
4558 int tmp_logflags; /* temp flags holder */
4559 char trim; /* output trimmed to match range */
4560 char userdata; /* allocating non-metadata */
4561 char wasdelay; /* old extent was delayed */
4562 int whichfork; /* data or attr fork */
4563 char wr; /* this is a write request */
4564 char rsvd; /* OK to allocate reserved blocks */
4565#ifdef DEBUG
4566 xfs_fileoff_t orig_bno; /* original block number value */
4567 int orig_flags; /* original flags arg value */
4568 xfs_filblks_t orig_len; /* original value of len arg */
4569 xfs_bmbt_irec_t *orig_mval; /* original value of mval */
4570 int orig_nmap; /* original value of *nmap */
4571
4572 orig_bno = bno;
4573 orig_len = len;
4574 orig_flags = flags;
4575 orig_mval = mval;
4576 orig_nmap = *nmap;
4577#endif
4578 ASSERT(*nmap >= 1);
4579 ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
4580 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
4581 XFS_ATTR_FORK : XFS_DATA_FORK;
4582 mp = ip->i_mount;
4583 if (unlikely(XFS_TEST_ERROR(
4584 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
4585 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
4586 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL),
4587 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4588 XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp);
4589 return XFS_ERROR(EFSCORRUPTED);
4590 }
4591 if (XFS_FORCED_SHUTDOWN(mp))
4592 return XFS_ERROR(EIO);
4593 ifp = XFS_IFORK_PTR(ip, whichfork);
4594 ASSERT(ifp->if_ext_max ==
4595 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4596 if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
4597 XFS_STATS_INC(xs_blk_mapw);
4598 else
4599 XFS_STATS_INC(xs_blk_mapr);
4600 delay = (flags & XFS_BMAPI_DELAY) != 0;
4601 trim = (flags & XFS_BMAPI_ENTIRE) == 0;
4602 userdata = (flags & XFS_BMAPI_METADATA) == 0;
4603 exact = (flags & XFS_BMAPI_EXACT) != 0;
4604 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
4605 contig = (flags & XFS_BMAPI_CONTIG) != 0;
4606 /*
4607 * stateless is used to combine extents which
4608 * differ only due to the state of the extents.
4609 * This technique is used from xfs_getbmap()
4610 * when the caller does not wish to see the
4611 * separation (which is the default).
4612 *
4613 * This technique is also used when writing a
4614 * buffer which has been partially written,
4615 * (usually by being flushed during a chunkread),
4616 * to ensure one write takes place. This also
4617 * prevents a change in the xfs inode extents at
4618 * this time, intentionally. This change occurs
4619 * on completion of the write operation, in
4620 * xfs_strat_comp(), where the xfs_bmapi() call
4621 * is transactioned, and the extents combined.
4622 */
4623 stateless = (flags & XFS_BMAPI_IGSTATE) != 0;
4624 if (stateless && wr) /* if writing unwritten space, no */
4625 wr = 0; /* allocations are allowed */
4626 ASSERT(wr || !delay);
4627 logflags = 0;
4628 nallocs = 0;
4629 cur = NULL;
4630 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
4631 ASSERT(wr && tp);
4632 if ((error = xfs_bmap_local_to_extents(tp, ip,
4633 firstblock, total, &logflags, whichfork)))
4634 goto error0;
4635 }
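	/*
	 * On the first allocation in a write transaction, figure out
	 * how many blocks must remain free in the AG so a subsequent
	 * bmap btree split cannot run out of space (roughly one block
	 * per level of the tree).
	 */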
4636 if (wr && *firstblock == NULLFSBLOCK) {
4637 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
4638 minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
4639 else
4640 minleft = 1;
4641 } else
4642 minleft = 0;
4643 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
4644 (error = xfs_iread_extents(tp, ip, whichfork)))
4645 goto error0;
4646 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
4647 &prev);
4648 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4649 n = 0;
4650 end = bno + len;
4651 obno = bno;
4652 bma.ip = NULL;
4653 while (bno < end && n < *nmap) {
4654 /*
4655 * Reading past eof, act as though there's a hole
4656 * up to end.
4657 */
4658 if (eof && !wr)
4659 got.br_startoff = end;
4660 inhole = eof || got.br_startoff > bno;
4661 wasdelay = wr && !inhole && !delay &&
4662 ISNULLSTARTBLOCK(got.br_startblock);
4663 /*
4664 * First, deal with the hole before the allocated space
4665 * that we found, if any.
4666 */
4667 if (wr && (inhole || wasdelay)) {
4668 /*
4669 * For the wasdelay case, we could also just
4670 * allocate the stuff asked for in this bmap call
4671 * but that wouldn't be as good.
4672 */
4673 if (wasdelay && !exact) {
4674 alen = (xfs_extlen_t)got.br_blockcount;
4675 aoff = got.br_startoff;
4676 if (lastx != NULLEXTNUM && lastx) {
4677 ep = &ifp->if_u1.if_extents[lastx - 1];
4678 xfs_bmbt_get_all(ep, &prev);
4679 }
4680 } else if (wasdelay) {
4681 alen = (xfs_extlen_t)
4682 XFS_FILBLKS_MIN(len,
4683 (got.br_startoff +
4684 got.br_blockcount) - bno);
4685 aoff = bno;
4686 } else {
4687 alen = (xfs_extlen_t)
4688 XFS_FILBLKS_MIN(len, MAXEXTLEN);
4689 if (!eof)
4690 alen = (xfs_extlen_t)
4691 XFS_FILBLKS_MIN(alen,
4692 got.br_startoff - bno);
4693 aoff = bno;
4694 }
4695 minlen = contig ? alen : 1;
4696 if (delay) {
4697 indlen = (xfs_extlen_t)
4698 xfs_bmap_worst_indlen(ip, alen);
4699 ASSERT(indlen > 0);
4700 /*
4701 * Make a transaction-less quota reservation for
4702 * delayed allocation blocks. This number gets
4703 * adjusted later.
4704 * We return EDQUOT if we haven't allocated
4705				 * any blocks already inside this loop.
4706 */
4707 if (XFS_TRANS_RESERVE_BLKQUOTA(
4708 mp, NULL, ip, (long)alen)) {
4709 if (n == 0) {
4710 *nmap = 0;
4711 ASSERT(cur == NULL);
4712 return XFS_ERROR(EDQUOT);
4713 }
4714 break;
4715 }
4716
4717 /*
4718				 * Update the superblock counters for alen and
4719				 * indlen separately; they may come from different pools.
4720 */
4721 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
4722 xfs_extlen_t extsz;
4723 xfs_extlen_t ralen;
4724 if (!(extsz = ip->i_d.di_extsize))
4725 extsz = mp->m_sb.sb_rextsize;
4726 ralen = roundup(alen, extsz);
4727 ralen = ralen / mp->m_sb.sb_rextsize;
4728 if (xfs_mod_incore_sb(mp,
4729 XFS_SBS_FREXTENTS,
4730 -(ralen), rsvd)) {
4731 if (XFS_IS_QUOTA_ON(ip->i_mount))
4732 XFS_TRANS_UNRESERVE_BLKQUOTA(
4733 mp, NULL, ip,
4734 (long)alen);
4735 break;
4736 }
4737 } else {
4738 if (xfs_mod_incore_sb(mp,
4739 XFS_SBS_FDBLOCKS,
4740 -(alen), rsvd)) {
4741 if (XFS_IS_QUOTA_ON(ip->i_mount))
4742 XFS_TRANS_UNRESERVE_BLKQUOTA(
4743 mp, NULL, ip,
4744 (long)alen);
4745 break;
4746 }
4747 }
4748
4749 if (xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
4750 -(indlen), rsvd)) {
4751 XFS_TRANS_UNRESERVE_BLKQUOTA(
4752 mp, NULL, ip, (long)alen);
4753 break;
4754 }
4755 ip->i_delayed_blks += alen;
4756 abno = NULLSTARTBLOCK(indlen);
4757 } else {
4758 /*
4759 * If first time, allocate and fill in
4760 * once-only bma fields.
4761 */
4762 if (bma.ip == NULL) {
4763 bma.tp = tp;
4764 bma.ip = ip;
4765 bma.prevp = &prev;
4766 bma.gotp = &got;
4767 bma.total = total;
4768 bma.userdata = 0;
4769 }
4770 /* Indicate if this is the first user data
4771 * in the file, or just any user data.
4772 */
4773 if (userdata) {
4774 bma.userdata = (aoff == 0) ?
4775 XFS_ALLOC_INITIAL_USER_DATA :
4776 XFS_ALLOC_USERDATA;
4777 }
4778 /*
4779 * Fill in changeable bma fields.
4780 */
4781 bma.eof = eof;
4782 bma.firstblock = *firstblock;
4783 bma.alen = alen;
4784 bma.off = aoff;
4785 bma.wasdel = wasdelay;
4786 bma.minlen = minlen;
4787 bma.low = flist->xbf_low;
4788 bma.minleft = minleft;
4789 /*
4790 * Only want to do the alignment at the
4791 * eof if it is userdata and allocation length
4792 * is larger than a stripe unit.
4793 */
4794 if (mp->m_dalign && alen >= mp->m_dalign &&
4795 userdata && whichfork == XFS_DATA_FORK) {
4796 if ((error = xfs_bmap_isaeof(ip, aoff,
4797 whichfork, &bma.aeof)))
4798 goto error0;
4799 } else
4800 bma.aeof = 0;
4801 /*
4802 * Call allocator.
4803 */
4804 if ((error = xfs_bmap_alloc(&bma)))
4805 goto error0;
4806 /*
4807 * Copy out result fields.
4808 */
4809 abno = bma.rval;
4810 if ((flist->xbf_low = bma.low))
4811 minleft = 0;
4812 alen = bma.alen;
4813 aoff = bma.off;
4814 ASSERT(*firstblock == NULLFSBLOCK ||
4815 XFS_FSB_TO_AGNO(mp, *firstblock) ==
4816 XFS_FSB_TO_AGNO(mp, bma.firstblock) ||
4817 (flist->xbf_low &&
4818 XFS_FSB_TO_AGNO(mp, *firstblock) <
4819 XFS_FSB_TO_AGNO(mp, bma.firstblock)));
4820 *firstblock = bma.firstblock;
4821 if (cur)
4822 cur->bc_private.b.firstblock =
4823 *firstblock;
4824 if (abno == NULLFSBLOCK)
4825 break;
4826 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4827 cur = xfs_btree_init_cursor(mp,
4828 tp, NULL, 0, XFS_BTNUM_BMAP,
4829 ip, whichfork);
4830 cur->bc_private.b.firstblock =
4831 *firstblock;
4832 cur->bc_private.b.flist = flist;
4833 }
4834 /*
4835 * Bump the number of extents we've allocated
4836 * in this call.
4837 */
4838 nallocs++;
4839 }
4840 if (cur)
4841 cur->bc_private.b.flags =
4842 wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
4843 got.br_startoff = aoff;
4844 got.br_startblock = abno;
4845 got.br_blockcount = alen;
4846 got.br_state = XFS_EXT_NORM; /* assume normal */
4847 /*
4848 * Determine state of extent, and the filesystem.
4849 * A wasdelay extent has been initialized, so
4850 * shouldn't be flagged as unwritten.
4851 */
4852 if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4853 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4854 got.br_state = XFS_EXT_UNWRITTEN;
4855 }
4856 error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
4857 firstblock, flist, &tmp_logflags, whichfork,
4858 rsvd);
4859 logflags |= tmp_logflags;
4860 if (error)
4861 goto error0;
4862 lastx = ifp->if_lastex;
4863 ep = &ifp->if_u1.if_extents[lastx];
4864 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4865 xfs_bmbt_get_all(ep, &got);
4866 ASSERT(got.br_startoff <= aoff);
4867 ASSERT(got.br_startoff + got.br_blockcount >=
4868 aoff + alen);
4869#ifdef DEBUG
4870 if (delay) {
4871 ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
4872 ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
4873 }
4874 ASSERT(got.br_state == XFS_EXT_NORM ||
4875 got.br_state == XFS_EXT_UNWRITTEN);
4876#endif
4877 /*
4878 * Fall down into the found allocated space case.
4879 */
4880 } else if (inhole) {
4881 /*
4882 * Reading in a hole.
4883 */
4884 mval->br_startoff = bno;
4885 mval->br_startblock = HOLESTARTBLOCK;
4886 mval->br_blockcount =
4887 XFS_FILBLKS_MIN(len, got.br_startoff - bno);
4888 mval->br_state = XFS_EXT_NORM;
4889 bno += mval->br_blockcount;
4890 len -= mval->br_blockcount;
4891 mval++;
4892 n++;
4893 continue;
4894 }
4895 /*
4896 * Then deal with the allocated space we found.
4897 */
4898 ASSERT(ep != NULL);
4899 if (trim && (got.br_startoff + got.br_blockcount > obno)) {
4900 if (obno > bno)
4901 bno = obno;
4902 ASSERT((bno >= obno) || (n == 0));
4903 ASSERT(bno < end);
4904 mval->br_startoff = bno;
4905 if (ISNULLSTARTBLOCK(got.br_startblock)) {
4906 ASSERT(!wr || delay);
4907 mval->br_startblock = DELAYSTARTBLOCK;
4908 } else
4909 mval->br_startblock =
4910 got.br_startblock +
4911 (bno - got.br_startoff);
4912 /*
4913 * Return the minimum of what we got and what we
4914			 * asked for as the length. We can use the len
4915 * variable here because it is modified below
4916 * and we could have been there before coming
4917 * here if the first part of the allocation
4918 * didn't overlap what was asked for.
4919 */
4920 mval->br_blockcount =
4921 XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
4922 (bno - got.br_startoff));
4923 mval->br_state = got.br_state;
4924 ASSERT(mval->br_blockcount <= len);
4925 } else {
4926 *mval = got;
4927 if (ISNULLSTARTBLOCK(mval->br_startblock)) {
4928 ASSERT(!wr || delay);
4929 mval->br_startblock = DELAYSTARTBLOCK;
4930 }
4931 }
4932
4933 /*
4934 * Check if writing previously allocated but
4935 * unwritten extents.
4936 */
4937 if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
4938 ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
4939 /*
4940 * Modify (by adding) the state flag, if writing.
4941 */
4942 ASSERT(mval->br_blockcount <= len);
4943 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
4944 cur = xfs_btree_init_cursor(mp,
4945 tp, NULL, 0, XFS_BTNUM_BMAP,
4946 ip, whichfork);
4947 cur->bc_private.b.firstblock =
4948 *firstblock;
4949 cur->bc_private.b.flist = flist;
4950 }
4951 mval->br_state = XFS_EXT_NORM;
4952 error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
4953 firstblock, flist, &tmp_logflags, whichfork,
4954 rsvd);
4955 logflags |= tmp_logflags;
4956 if (error)
4957 goto error0;
4958 lastx = ifp->if_lastex;
4959 ep = &ifp->if_u1.if_extents[lastx];
4960 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4961 xfs_bmbt_get_all(ep, &got);
4962 /*
4963 * We may have combined previously unwritten
4964 * space with written space, so generate
4965 * another request.
4966 */
4967 if (mval->br_blockcount < len)
4968 continue;
4969 }
4970
4971 ASSERT(!trim ||
4972 ((mval->br_startoff + mval->br_blockcount) <= end));
4973 ASSERT(!trim || (mval->br_blockcount <= len) ||
4974 (mval->br_startoff < obno));
4975 bno = mval->br_startoff + mval->br_blockcount;
4976 len = end - bno;
4977 if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
4978 ASSERT(mval->br_startblock == mval[-1].br_startblock);
4979 ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
4980 ASSERT(mval->br_state == mval[-1].br_state);
4981 mval[-1].br_blockcount = mval->br_blockcount;
4982 mval[-1].br_state = mval->br_state;
4983 } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
4984 mval[-1].br_startblock != DELAYSTARTBLOCK &&
4985 mval[-1].br_startblock != HOLESTARTBLOCK &&
4986 mval->br_startblock ==
4987 mval[-1].br_startblock + mval[-1].br_blockcount &&
4988 (stateless || mval[-1].br_state == mval->br_state)) {
4989 ASSERT(mval->br_startoff ==
4990 mval[-1].br_startoff + mval[-1].br_blockcount);
4991 mval[-1].br_blockcount += mval->br_blockcount;
4992 } else if (n > 0 &&
4993 mval->br_startblock == DELAYSTARTBLOCK &&
4994 mval[-1].br_startblock == DELAYSTARTBLOCK &&
4995 mval->br_startoff ==
4996 mval[-1].br_startoff + mval[-1].br_blockcount) {
4997 mval[-1].br_blockcount += mval->br_blockcount;
4998 mval[-1].br_state = mval->br_state;
4999 } else if (!((n == 0) &&
5000 ((mval->br_startoff + mval->br_blockcount) <=
5001 obno))) {
5002 mval++;
5003 n++;
5004 }
5005 /*
5006 * If we're done, stop now. Stop when we've allocated
5007 * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise
5008 * the transaction may get too big.
5009 */
5010 if (bno >= end || n >= *nmap || nallocs >= *nmap)
5011 break;
5012 /*
5013 * Else go on to the next record.
5014 */
5015 ep++;
5016 lastx++;
5017 if (lastx >= nextents) {
5018 eof = 1;
5019 prev = got;
5020 } else
5021 xfs_bmbt_get_all(ep, &got);
5022 }
5023 ifp->if_lastex = lastx;
5024 *nmap = n;
5025 /*
5026 * Transform from btree to extents, give it cur.
5027 */
5028 if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
5029 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5030 ASSERT(wr && cur);
5031 error = xfs_bmap_btree_to_extents(tp, ip, cur,
5032 &tmp_logflags, whichfork);
5033 logflags |= tmp_logflags;
5034 if (error)
5035 goto error0;
5036 }
5037 ASSERT(ifp->if_ext_max ==
5038 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
5040 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
5041 error = 0;
5042
5043error0:
5044 /*
5045 * Log everything. Do this after conversion, there's no point in
5046 * logging the extent list if we've converted to btree format.
5047 */
5048 if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
5049 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5050 logflags &= ~XFS_ILOG_FEXT(whichfork);
5051 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
5052 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5053 logflags &= ~XFS_ILOG_FBROOT(whichfork);
5054 /*
5055 * Log whatever the flags say, even if error. Otherwise we might miss
5056 * detecting a case where the data is changed, there's an error,
5057 * and it's not logged so we don't shutdown when we should.
5058 */
5059 if (logflags) {
5060 ASSERT(tp && wr);
5061 xfs_trans_log_inode(tp, ip, logflags);
5062 }
5063 if (cur) {
5064 if (!error) {
5065 ASSERT(*firstblock == NULLFSBLOCK ||
5066 XFS_FSB_TO_AGNO(mp, *firstblock) ==
5067 XFS_FSB_TO_AGNO(mp,
5068 cur->bc_private.b.firstblock) ||
5069 (flist->xbf_low &&
5070 XFS_FSB_TO_AGNO(mp, *firstblock) <
5071 XFS_FSB_TO_AGNO(mp,
5072 cur->bc_private.b.firstblock)));
5073 *firstblock = cur->bc_private.b.firstblock;
5074 }
5075 xfs_btree_del_cursor(cur,
5076 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5077 }
5078 if (!error)
5079 xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
5080 orig_nmap, *nmap);
5081 return error;
5082}
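
/*
 * Illustrative sketch (editorial addition, not part of the original
 * source): a read-only lookup through xfs_bmapi(), following the calling
 * convention used by xfs_getbmap() below. With a null transaction, no
 * XFS_BMAPI_WRITE flag, and null firstblock/flist arguments, xfs_bmapi()
 * only reports existing mappings: holes come back as HOLESTARTBLOCK and
 * delayed allocations as DELAYSTARTBLOCK. Assumes the caller already
 * holds the inode lock.
 */
STATIC int
example_read_mapping(
	xfs_inode_t	*ip,		/* locked incore inode */
	xfs_fileoff_t	offset_fsb,	/* file offset, in fsblocks */
	xfs_filblks_t	len_fsb)	/* length to map, in fsblocks */
{
	xfs_bmbt_irec_t	map[XFS_BMAP_MAX_NMAP];	/* mapping results */
	int		nmap;		/* in: map size, out: count */
	int		error;		/* error return value */
	int		i;		/* result index */

	nmap = XFS_BMAP_MAX_NMAP;
	error = xfs_bmapi(NULL, ip, offset_fsb, len_fsb, 0, NULL, 0,
			map, &nmap, NULL);
	if (error)
		return error;
	for (i = 0; i < nmap; i++) {
		if (map[i].br_startblock == HOLESTARTBLOCK)
			continue;	/* hole: no blocks behind this range */
		if (map[i].br_startblock == DELAYSTARTBLOCK)
			continue;	/* delalloc: no real block yet */
		/* map[i].br_startblock names a real fsblock here */
	}
	return 0;
}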
5083
5084/*
5085 * Map file blocks to filesystem blocks, simple version.
5086 * One block (extent) only, read-only.
5087 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
5088 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
5089 * was set and all the others were clear.
5090 */
5091int /* error */
5092xfs_bmapi_single(
5093 xfs_trans_t *tp, /* transaction pointer */
5094 xfs_inode_t *ip, /* incore inode */
5095 int whichfork, /* data or attr fork */
5096 xfs_fsblock_t *fsb, /* output: mapped block */
5097 xfs_fileoff_t bno) /* starting file offs. mapped */
5098{
5099 int eof; /* we've hit the end of extent list */
5100 int error; /* error return */
5101 xfs_bmbt_irec_t got; /* current extent list record */
5102 xfs_ifork_t *ifp; /* inode fork pointer */
5103 xfs_extnum_t lastx; /* last useful extent number */
5104 xfs_bmbt_irec_t prev; /* previous extent list record */
5105
5106 ifp = XFS_IFORK_PTR(ip, whichfork);
5107 if (unlikely(
5108 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
5109 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) {
5110 XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW,
5111 ip->i_mount);
5112 return XFS_ERROR(EFSCORRUPTED);
5113 }
5114 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
5115 return XFS_ERROR(EIO);
5116 XFS_STATS_INC(xs_blk_mapr);
5117 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5118 (error = xfs_iread_extents(tp, ip, whichfork)))
5119 return error;
5120 (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5121 &prev);
5122 /*
5123 * Reading past eof, act as though there's a hole
5124 * up to end.
5125 */
5126 if (eof || got.br_startoff > bno) {
5127 *fsb = NULLFSBLOCK;
5128 return 0;
5129 }
5130 ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
5131 ASSERT(bno < got.br_startoff + got.br_blockcount);
5132 *fsb = got.br_startblock + (bno - got.br_startoff);
5133 ifp->if_lastex = lastx;
5134 return 0;
5135}
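
/*
 * Illustrative sketch (editorial addition): typical use of
 * xfs_bmapi_single() to translate one file-relative block into a
 * filesystem block. NULLFSBLOCK in *fsb means the offset falls in a
 * hole (or past eof); the transaction pointer may be null for a pure
 * lookup, as in the function above.
 */
STATIC int
example_map_one_block(
	xfs_trans_t	*tp,	/* transaction pointer, may be null */
	xfs_inode_t	*ip,	/* locked incore inode */
	xfs_fileoff_t	bno)	/* file offset to translate */
{
	xfs_fsblock_t	fsb;	/* resulting filesystem block */
	int		error;	/* error return value */

	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, bno);
	if (error)
		return error;
	if (fsb == NULLFSBLOCK)
		return 0;	/* nothing mapped at bno */
	/* fsb now names the block backing file offset bno */
	return 0;
}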
5136
5137/*
5138 * Unmap (remove) blocks from a file.
5139 * If nexts is nonzero then the number of extents to remove is limited to
5140 * that value. *done is set once the whole requested range has been
5141 * unmapped; if the extent limit stops the operation early, it is left clear.
5142 */
5143int /* error */
5144xfs_bunmapi(
5145 xfs_trans_t *tp, /* transaction pointer */
5146 struct xfs_inode *ip, /* incore inode */
5147 xfs_fileoff_t bno, /* starting offset to unmap */
5148 xfs_filblks_t len, /* length to unmap in file */
5149 int flags, /* misc flags */
5150 xfs_extnum_t nexts, /* number of extents max */
5151 xfs_fsblock_t *firstblock, /* first allocated block
5152 controls a.g. for allocs */
5153 xfs_bmap_free_t *flist, /* i/o: list extents to free */
5154	int		*done)		/* out: set when whole range unmapped */
5155{
5156 xfs_btree_cur_t *cur; /* bmap btree cursor */
5157 xfs_bmbt_irec_t del; /* extent being deleted */
5158 int eof; /* is deleting at eof */
5159 xfs_bmbt_rec_t *ep; /* extent list entry pointer */
5160 int error; /* error return value */
5161 xfs_extnum_t extno; /* extent number in list */
5162 xfs_bmbt_irec_t got; /* current extent list entry */
5163 xfs_ifork_t *ifp; /* inode fork pointer */
5164 int isrt; /* freeing in rt area */
5165 xfs_extnum_t lastx; /* last extent index used */
5166 int logflags; /* transaction logging flags */
5167 xfs_extlen_t mod; /* rt extent offset */
5168 xfs_mount_t *mp; /* mount structure */
5169 xfs_extnum_t nextents; /* size of extent list */
5170 xfs_bmbt_irec_t prev; /* previous extent list entry */
5171 xfs_fileoff_t start; /* first file offset deleted */
5172 int tmp_logflags; /* partial logging flags */
5173 int wasdel; /* was a delayed alloc extent */
5174 int whichfork; /* data or attribute fork */
5175 int rsvd; /* OK to allocate reserved blocks */
5176 xfs_fsblock_t sum;
5177
5178 xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
5179 whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
5180 XFS_ATTR_FORK : XFS_DATA_FORK;
5181 ifp = XFS_IFORK_PTR(ip, whichfork);
5182 if (unlikely(
5183 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5184 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
5185 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
5186 ip->i_mount);
5187 return XFS_ERROR(EFSCORRUPTED);
5188 }
5189 mp = ip->i_mount;
5190 if (XFS_FORCED_SHUTDOWN(mp))
5191 return XFS_ERROR(EIO);
5192 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
5193 ASSERT(len > 0);
5194 ASSERT(nexts >= 0);
5195 ASSERT(ifp->if_ext_max ==
5196 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5197 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5198 (error = xfs_iread_extents(tp, ip, whichfork)))
5199 return error;
5200 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5201 if (nextents == 0) {
5202 *done = 1;
5203 return 0;
5204 }
5205 XFS_STATS_INC(xs_blk_unmap);
5206 isrt = (whichfork == XFS_DATA_FORK) &&
5207 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
5208 start = bno;
5209 bno = start + len - 1;
5210 ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
5211 &prev);
5212 /*
5213 * Check to see if the given block number is past the end of the
5214 * file, back up to the last block if so...
5215 */
5216 if (eof) {
5217 ep = &ifp->if_u1.if_extents[--lastx];
5218 xfs_bmbt_get_all(ep, &got);
5219 bno = got.br_startoff + got.br_blockcount - 1;
5220 }
5221 logflags = 0;
5222 if (ifp->if_flags & XFS_IFBROOT) {
5223 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
5224 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
5225 whichfork);
5226 cur->bc_private.b.firstblock = *firstblock;
5227 cur->bc_private.b.flist = flist;
5228 cur->bc_private.b.flags = 0;
5229 } else
5230 cur = NULL;
5231 extno = 0;
5232 while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
5233 (nexts == 0 || extno < nexts)) {
5234 /*
5235 * Is the found extent after a hole in which bno lives?
5236 * Just back up to the previous extent, if so.
5237 */
5238 if (got.br_startoff > bno) {
5239 if (--lastx < 0)
5240 break;
5241 ep--;
5242 xfs_bmbt_get_all(ep, &got);
5243 }
5244 /*
5245 * Is the last block of this extent before the range
5246 * we're supposed to delete? If so, we're done.
5247 */
5248 bno = XFS_FILEOFF_MIN(bno,
5249 got.br_startoff + got.br_blockcount - 1);
5250 if (bno < start)
5251 break;
5252 /*
5253 * Then deal with the (possibly delayed) allocated space
5254 * we found.
5255 */
5256 ASSERT(ep != NULL);
5257 del = got;
5258 wasdel = ISNULLSTARTBLOCK(del.br_startblock);
5259 if (got.br_startoff < start) {
5260 del.br_startoff = start;
5261 del.br_blockcount -= start - got.br_startoff;
5262 if (!wasdel)
5263 del.br_startblock += start - got.br_startoff;
5264 }
5265 if (del.br_startoff + del.br_blockcount > bno + 1)
5266 del.br_blockcount = bno + 1 - del.br_startoff;
5267 sum = del.br_startblock + del.br_blockcount;
5268 if (isrt &&
5269 (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
5270 /*
5271 * Realtime extent not lined up at the end.
5272 * The extent could have been split into written
5273 * and unwritten pieces, or we could just be
5274 * unmapping part of it. But we can't really
5275 * get rid of part of a realtime extent.
5276 */
5277 if (del.br_state == XFS_EXT_UNWRITTEN ||
5278 !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
5279 /*
5280 * This piece is unwritten, or we're not
5281 * using unwritten extents. Skip over it.
5282 */
5283 ASSERT(bno >= mod);
5284 bno -= mod > del.br_blockcount ?
5285 del.br_blockcount : mod;
5286 if (bno < got.br_startoff) {
5287 if (--lastx >= 0)
5288 xfs_bmbt_get_all(--ep, &got);
5289 }
5290 continue;
5291 }
5292 /*
5293 * It's written, turn it unwritten.
5294 * This is better than zeroing it.
5295 */
5296 ASSERT(del.br_state == XFS_EXT_NORM);
5297 ASSERT(xfs_trans_get_block_res(tp) > 0);
5298 /*
5299 * If this spans a realtime extent boundary,
5300 * chop it back to the start of the one we end at.
5301 */
5302 if (del.br_blockcount > mod) {
5303 del.br_startoff += del.br_blockcount - mod;
5304 del.br_startblock += del.br_blockcount - mod;
5305 del.br_blockcount = mod;
5306 }
5307 del.br_state = XFS_EXT_UNWRITTEN;
5308 error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
5309 firstblock, flist, &logflags, XFS_DATA_FORK, 0);
5310 if (error)
5311 goto error0;
5312 goto nodelete;
5313 }
5314 if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
5315 /*
5316 * Realtime extent is lined up at the end but not
5317 * at the front. We'll get rid of full extents if
5318 * we can.
5319 */
5320 mod = mp->m_sb.sb_rextsize - mod;
5321 if (del.br_blockcount > mod) {
5322 del.br_blockcount -= mod;
5323 del.br_startoff += mod;
5324 del.br_startblock += mod;
5325 } else if ((del.br_startoff == start &&
5326 (del.br_state == XFS_EXT_UNWRITTEN ||
5327 xfs_trans_get_block_res(tp) == 0)) ||
5328 !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
5329 /*
5330 * Can't make it unwritten. There isn't
5331 * a full extent here so just skip it.
5332 */
5333 ASSERT(bno >= del.br_blockcount);
5334 bno -= del.br_blockcount;
5335 if (bno < got.br_startoff) {
5336 if (--lastx >= 0)
5337 xfs_bmbt_get_all(--ep, &got);
5338 }
5339 continue;
5340 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
5341 /*
5342 * This one is already unwritten.
5343 * It must have a written left neighbor.
5344 * Unwrite the killed part of that one and
5345 * try again.
5346 */
5347 ASSERT(lastx > 0);
5348 xfs_bmbt_get_all(ep - 1, &prev);
5349 ASSERT(prev.br_state == XFS_EXT_NORM);
5350 ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
5351 ASSERT(del.br_startblock ==
5352 prev.br_startblock + prev.br_blockcount);
5353 if (prev.br_startoff < start) {
5354 mod = start - prev.br_startoff;
5355 prev.br_blockcount -= mod;
5356 prev.br_startblock += mod;
5357 prev.br_startoff = start;
5358 }
5359 prev.br_state = XFS_EXT_UNWRITTEN;
5360 error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
5361 &prev, firstblock, flist, &logflags,
5362 XFS_DATA_FORK, 0);
5363 if (error)
5364 goto error0;
5365 goto nodelete;
5366 } else {
5367 ASSERT(del.br_state == XFS_EXT_NORM);
5368 del.br_state = XFS_EXT_UNWRITTEN;
5369 error = xfs_bmap_add_extent(ip, lastx, &cur,
5370 &del, firstblock, flist, &logflags,
5371 XFS_DATA_FORK, 0);
5372 if (error)
5373 goto error0;
5374 goto nodelete;
5375 }
5376 }
5377 if (wasdel) {
5378 ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
5379 xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
5380 (int)del.br_blockcount, rsvd);
5381 /* Unreserve our quota space */
5382 XFS_TRANS_RESERVE_QUOTA_NBLKS(
5383 mp, NULL, ip, -((long)del.br_blockcount), 0,
5384 isrt ? XFS_QMOPT_RES_RTBLKS :
5385 XFS_QMOPT_RES_REGBLKS);
5386 ip->i_delayed_blks -= del.br_blockcount;
5387 if (cur)
5388 cur->bc_private.b.flags |=
5389 XFS_BTCUR_BPRV_WASDEL;
5390 } else if (cur)
5391 cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
5392 /*
5393 * If it's the case where the directory code is running
5394 * with no block reservation, and the deleted block is in
5395 * the middle of its extent, and the resulting insert
5396 * of an extent would cause transformation to btree format,
5397 * then reject it. The calling code will then swap
5398 * blocks around instead.
5399 * We have to do this now, rather than waiting for the
5400 * conversion to btree format, since the transaction
5401 * will be dirty.
5402 */
5403 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5404 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5405 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
5406 del.br_startoff > got.br_startoff &&
5407 del.br_startoff + del.br_blockcount <
5408 got.br_startoff + got.br_blockcount) {
5409 error = XFS_ERROR(ENOSPC);
5410 goto error0;
5411 }
5412 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
5413 &tmp_logflags, whichfork, rsvd);
5414 logflags |= tmp_logflags;
5415 if (error)
5416 goto error0;
5417 bno = del.br_startoff - 1;
5418nodelete:
5419 lastx = ifp->if_lastex;
5420 /*
5421 * If not done go on to the next (previous) record.
5422 * Reset ep in case the extents array was re-alloced.
5423 */
5424 ep = &ifp->if_u1.if_extents[lastx];
5425 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5426 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
5427 xfs_bmbt_get_startoff(ep) > bno) {
5428 lastx--;
5429 ep--;
5430 }
5431 if (lastx >= 0)
5432 xfs_bmbt_get_all(ep, &got);
5433 extno++;
5434 }
5435 }
5436 ifp->if_lastex = lastx;
5437 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5438 ASSERT(ifp->if_ext_max ==
5439 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5440 /*
5441 * Convert to a btree if necessary.
5442 */
5443 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5444 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5445 ASSERT(cur == NULL);
5446 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5447 &cur, 0, &tmp_logflags, whichfork);
5448 logflags |= tmp_logflags;
5449 if (error)
5450 goto error0;
5451 }
5452 /*
5453 * transform from btree to extents, give it cur
5454 */
5455 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
5456 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5457 ASSERT(cur != NULL);
5458 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5459 whichfork);
5460 logflags |= tmp_logflags;
5461 if (error)
5462 goto error0;
5463 }
5464 /*
5465 * transform from extents to local?
5466 */
5467 ASSERT(ifp->if_ext_max ==
5468 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5469 error = 0;
5470error0:
5471 /*
5472 * Log everything. Do this after conversion, there's no point in
5473 * logging the extent list if we've converted to btree format.
5474 */
5475 if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
5476 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
5477 logflags &= ~XFS_ILOG_FEXT(whichfork);
5478 else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
5479 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
5480 logflags &= ~XFS_ILOG_FBROOT(whichfork);
5481 /*
5482 * Log inode even in the error case, if the transaction
5483 * is dirty we'll need to shut down the filesystem.
5484 */
5485 if (logflags)
5486 xfs_trans_log_inode(tp, ip, logflags);
5487 if (cur) {
5488 if (!error) {
5489 *firstblock = cur->bc_private.b.firstblock;
5490 cur->bc_private.b.allocated = 0;
5491 }
5492 xfs_btree_del_cursor(cur,
5493 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5494 }
5495 return error;
5496}
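
/*
 * Illustrative sketch (editorial addition): the calling pattern
 * xfs_bunmapi() expects. The caller owns the transaction; this sketch
 * assumes tp is already allocated, reserved, and joined to the locked
 * inode, and glosses over the re-reservation a real caller (such as
 * truncate) performs between passes. Each pass removes a small batch
 * of extents, then xfs_bmap_finish() frees the accumulated blocks.
 */
STATIC int
example_unmap_range(
	xfs_trans_t	*tp,		/* set-up transaction */
	xfs_inode_t	*ip,		/* locked, joined incore inode */
	xfs_fileoff_t	start_fsb,	/* first fsblock to unmap */
	xfs_filblks_t	len_fsb)	/* number of fsblocks to unmap */
{
	xfs_fsblock_t	firstblock;	/* a.g.-controlling block */
	xfs_bmap_free_t	flist;		/* blocks to free at trans end */
	int		committed;	/* xfs_bmap_finish committed? */
	int		done;		/* set when range fully unmapped */
	int		error;		/* error return value */

	done = 0;
	while (!done) {
		XFS_BMAP_INIT(&flist, &firstblock);
		/* the batch size of 2 extents per pass is arbitrary */
		error = xfs_bunmapi(tp, ip, start_fsb, len_fsb, 0, 2,
				&firstblock, &flist, &done);
		if (error)
			goto error0;
		error = xfs_bmap_finish(&tp, &flist, firstblock,
				&committed);
		if (error)
			goto error0;
	}
	return 0;
error0:
	xfs_bmap_cancel(&flist);
	return error;
}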
5497
5498/*
5499 * Fcntl interface to xfs_bmapi.
5500 */
5501int /* error code */
5502xfs_getbmap(
5503 bhv_desc_t *bdp, /* XFS behavior descriptor*/
5504 struct getbmap *bmv, /* user bmap structure */
5505 void __user *ap, /* pointer to user's array */
5506 int interface) /* interface flags */
5507{
5508 __int64_t bmvend; /* last block requested */
5509 int error; /* return value */
5510 __int64_t fixlen; /* length for -1 case */
5511 int i; /* extent number */
5512 xfs_inode_t *ip; /* xfs incore inode pointer */
5513 vnode_t *vp; /* corresponding vnode */
5514 int lock; /* lock state */
5515 xfs_bmbt_irec_t *map; /* buffer for user's data */
5516 xfs_mount_t *mp; /* file system mount point */
5517 int nex; /* # of user extents can do */
5518 int nexleft; /* # of user extents left */
5519 int subnex; /* # of bmapi's can do */
5520 int nmap; /* number of map entries */
5521 struct getbmap out; /* output structure */
5522 int whichfork; /* data or attr fork */
5523 int prealloced; /* this is a file with
5524 * preallocated data space */
5525	int			sh_unwritten;	/* true if unwritten extents */
5526						/* are listed separately */
5527 int bmapi_flags; /* flags for xfs_bmapi */
5528 __int32_t oflags; /* getbmapx bmv_oflags field */
5529
5530 vp = BHV_TO_VNODE(bdp);
5531 ip = XFS_BHVTOI(bdp);
5532 mp = ip->i_mount;
5533
5534 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5535 sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
5536
5537	/* If the BMV_IF_NO_DMAPI_READ interface bit is specified, do not
5538 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
5539 * bit is set for the file, generate a read event in order
5540 * that the DMAPI application may do its thing before we return
5541 * the extents. Usually this means restoring user file data to
5542 * regions of the file that look like holes.
5543 *
5544 * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
5545 * BMV_IF_NO_DMAPI_READ so that read events are generated.
5546 * If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
5547 * could misinterpret holes in a DMAPI file as true holes,
5548 * when in fact they may represent offline user data.
5549 */
5550	if ((interface & BMV_IF_NO_DMAPI_READ) == 0
5551 && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)
5552 && whichfork == XFS_DATA_FORK) {
5553
5554 error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL);
5555 if (error)
5556 return XFS_ERROR(error);
5557 }
5558
5559 if (whichfork == XFS_ATTR_FORK) {
5560 if (XFS_IFORK_Q(ip)) {
5561 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
5562 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
5563 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
5564 return XFS_ERROR(EINVAL);
5565 } else if (unlikely(
5566 ip->i_d.di_aformat != 0 &&
5567 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
5568 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
5569 ip->i_mount);
5570 return XFS_ERROR(EFSCORRUPTED);
5571 }
5572 } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
5573 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
5574 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5575 return XFS_ERROR(EINVAL);
5576 if (whichfork == XFS_DATA_FORK) {
5577 if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) {
5578 prealloced = 1;
5579 fixlen = XFS_MAXIOFFSET(mp);
5580 } else {
5581 prealloced = 0;
5582 fixlen = ip->i_d.di_size;
5583 }
5584 } else {
5585 prealloced = 0;
5586 fixlen = 1LL << 32;
5587 }
5588
5589 if (bmv->bmv_length == -1) {
5590 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
5591		bmv->bmv_length = MAX((__int64_t)(fixlen - bmv->bmv_offset),
5592 (__int64_t)0);
5593 } else if (bmv->bmv_length < 0)
5594 return XFS_ERROR(EINVAL);
5595 if (bmv->bmv_length == 0) {
5596 bmv->bmv_entries = 0;
5597 return 0;
5598 }
5599 nex = bmv->bmv_count - 1;
5600 if (nex <= 0)
5601 return XFS_ERROR(EINVAL);
5602 bmvend = bmv->bmv_offset + bmv->bmv_length;
5603
5604 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5605
5606 if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
5607 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5608 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
5609 }
5610
5611 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
5612
5613 lock = xfs_ilock_map_shared(ip);
5614
5615 /*
5616 * Don't let nex be bigger than the number of extents
5617 * we can have assuming alternating holes and real extents.
5618 */
5619 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5620 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5621
5622 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5623 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
5624
5625 /*
5626 * Allocate enough space to handle "subnex" maps at a time.
5627 */
5628 subnex = 16;
5629 map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
5630
5631 bmv->bmv_entries = 0;
5632
5633 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
5634 error = 0;
5635 goto unlock_and_return;
5636 }
5637
5638 nexleft = nex;
5639
5640 do {
5641 nmap = (nexleft > subnex) ? subnex : nexleft;
5642 error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5643 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5644 bmapi_flags, NULL, 0, map, &nmap, NULL);
5645 if (error)
5646 goto unlock_and_return;
5647 ASSERT(nmap <= subnex);
5648
5649 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5650 nexleft--;
5651 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ?
5652 BMV_OF_PREALLOC : 0;
5653 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5654 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5655 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
5656 if (prealloced &&
5657 map[i].br_startblock == HOLESTARTBLOCK &&
5658 out.bmv_offset + out.bmv_length == bmvend) {
5659 /*
5660 * came to hole at end of file
5661 */
5662 goto unlock_and_return;
5663 } else {
5664 out.bmv_block =
5665 (map[i].br_startblock == HOLESTARTBLOCK) ?
5666 -1 :
5667 XFS_FSB_TO_DB(ip, map[i].br_startblock);
5668
5669 /* return either getbmap/getbmapx structure. */
5670 if (interface & BMV_IF_EXTENDED) {
5671 struct getbmapx outx;
5672
5673 GETBMAP_CONVERT(out,outx);
5674 outx.bmv_oflags = oflags;
5675 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5676 if (copy_to_user(ap, &outx,
5677 sizeof(outx))) {
5678 error = XFS_ERROR(EFAULT);
5679 goto unlock_and_return;
5680 }
5681 } else {
5682 if (copy_to_user(ap, &out,
5683 sizeof(out))) {
5684 error = XFS_ERROR(EFAULT);
5685 goto unlock_and_return;
5686 }
5687 }
5688 bmv->bmv_offset =
5689 out.bmv_offset + out.bmv_length;
5690 bmv->bmv_length = MAX((__int64_t)0,
5691 (__int64_t)(bmvend - bmv->bmv_offset));
5692 bmv->bmv_entries++;
5693 ap = (interface & BMV_IF_EXTENDED) ?
5694 (void __user *)
5695 ((struct getbmapx __user *)ap + 1) :
5696 (void __user *)
5697 ((struct getbmap __user *)ap + 1);
5698 }
5699 }
5700 } while (nmap && nexleft && bmv->bmv_length);
5701
5702unlock_and_return:
5703 xfs_iunlock_map_shared(ip, lock);
5704 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
5705
5706 kmem_free(map, subnex * sizeof(*map));
5707
5708 return error;
5709}
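
/*
 * Illustrative sketch (editorial addition): how a userspace program
 * might drive the interface above through ioctl(XFS_IOC_GETBMAP), the
 * "old behavior" interface named in the DMAPI comment. Element 0 of
 * the array is the request/progress header; the kernel fills
 * bmv_entries output records after it and advances bmv_offset and
 * bmv_length for the next call. Assumes <xfs/xfs_fs.h> (or the local
 * equivalent) supplies struct getbmap and XFS_IOC_GETBMAP.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>

static int
print_file_extents(int fd)
{
	struct getbmap	bmv[17];	/* header plus 16 output slots */
	int		i;

	memset(bmv, 0, sizeof(bmv));
	bmv[0].bmv_offset = 0;		/* start of file, in 512b units */
	bmv[0].bmv_length = -1;		/* -1 means "to end of file" */
	bmv[0].bmv_count = 17;		/* includes the header element */
	do {
		if (ioctl(fd, XFS_IOC_GETBMAP, bmv) < 0)
			return -1;
		for (i = 1; i <= bmv[0].bmv_entries; i++)
			printf("off %lld len %lld block %lld\n",
				(long long)bmv[i].bmv_offset,
				(long long)bmv[i].bmv_length,
				(long long)bmv[i].bmv_block);	/* -1: hole */
	} while (bmv[0].bmv_entries > 0);
	return 0;
}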
5710
5711/*
5712 * Check the last inode extent to determine whether this allocation will result
5713 * in blocks being allocated at the end of the file. When we allocate new data
5714 * blocks at the end of the file which do not start at the previous data block,
5715 * we will try to align the new blocks at stripe unit boundaries.
5716 */
5717int /* error */
5718xfs_bmap_isaeof(
5719 xfs_inode_t *ip, /* incore inode pointer */
5720 xfs_fileoff_t off, /* file offset in fsblocks */
5721 int whichfork, /* data or attribute fork */
5722 char *aeof) /* return value */
5723{
5724 int error; /* error return value */
5725 xfs_ifork_t *ifp; /* inode fork pointer */
5726 xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
5727 xfs_extnum_t nextents; /* size of extent list */
5728 xfs_bmbt_irec_t s; /* expanded extent list entry */
5729
5730 ASSERT(whichfork == XFS_DATA_FORK);
5731 ifp = XFS_IFORK_PTR(ip, whichfork);
5732 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5733 (error = xfs_iread_extents(NULL, ip, whichfork)))
5734 return error;
5735 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5736 if (nextents == 0) {
5737 *aeof = 1;
5738 return 0;
5739 }
5740 /*
5741 * Go to the last extent
5742 */
5743 lastrec = &ifp->if_u1.if_extents[nextents - 1];
5744 xfs_bmbt_get_all(lastrec, &s);
5745 /*
5746 * Check we are allocating in the last extent (for delayed allocations)
5747 * or past the last extent for non-delayed allocations.
5748 */
5749 *aeof = (off >= s.br_startoff &&
5750 off < s.br_startoff + s.br_blockcount &&
5751 ISNULLSTARTBLOCK(s.br_startblock)) ||
5752 off >= s.br_startoff + s.br_blockcount;
5753 return 0;
5754}
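
/*
 * Worked example (editorial note): if the last extent is a delayed
 * allocation covering file blocks [100, 110), then off 105 yields
 * *aeof = 1 (allocating inside the delalloc tail), off 110 yields
 * *aeof = 1 (past the last extent), and off 95 yields *aeof = 0.
 */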
5755
5756/*
5757 * Check if the endoff is outside the last extent. If so the caller will grow
5758 * the allocation to a stripe unit boundary.
5759 */
5760int /* error */
5761xfs_bmap_eof(
5762 xfs_inode_t *ip, /* incore inode pointer */
5763 xfs_fileoff_t endoff, /* file offset in fsblocks */
5764 int whichfork, /* data or attribute fork */
5765 int *eof) /* result value */
5766{
5767 xfs_fsblock_t blockcount; /* extent block count */
5768 int error; /* error return value */
5769 xfs_ifork_t *ifp; /* inode fork pointer */
5770 xfs_bmbt_rec_t *lastrec; /* extent list entry pointer */
5771 xfs_extnum_t nextents; /* size of extent list */
5772 xfs_fileoff_t startoff; /* extent starting file offset */
5773
5774 ASSERT(whichfork == XFS_DATA_FORK);
5775 ifp = XFS_IFORK_PTR(ip, whichfork);
5776 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5777 (error = xfs_iread_extents(NULL, ip, whichfork)))
5778 return error;
5779 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5780 if (nextents == 0) {
5781 *eof = 1;
5782 return 0;
5783 }
5784 /*
5785 * Go to the last extent
5786 */
5787 lastrec = &ifp->if_u1.if_extents[nextents - 1];
5788 startoff = xfs_bmbt_get_startoff(lastrec);
5789 blockcount = xfs_bmbt_get_blockcount(lastrec);
5790 *eof = endoff >= startoff + blockcount;
5791 return 0;
5792}
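
/*
 * Worked example (editorial note): with a single extent covering file
 * blocks [0, 8), endoff 8 yields *eof = 1 while endoff 7 yields
 * *eof = 0, so only an allocation extending past the last extent
 * triggers the caller's stripe-unit alignment.
 */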
5793
5794#ifdef DEBUG
5795/*
5796 * Check that the extents list for the inode ip is in the right order.
5797 */
5798STATIC void
5799xfs_bmap_check_extents(
5800 xfs_inode_t *ip, /* incore inode pointer */
5801 int whichfork) /* data or attr fork */
5802{
5803 xfs_bmbt_rec_t *base; /* base of extents list */
5804 xfs_bmbt_rec_t *ep; /* current extent entry */
5805 xfs_ifork_t *ifp; /* inode fork pointer */
5806 xfs_extnum_t nextents; /* number of extents in list */
5807
5808 ifp = XFS_IFORK_PTR(ip, whichfork);
5809 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
5810 base = ifp->if_u1.if_extents;
5811 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
5812 for (ep = base; ep < &base[nextents - 1]; ep++) {
5813 xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
5814 (void *)(ep + 1));
5815 }
5816}
5817
5818STATIC
5819xfs_buf_t *
5820xfs_bmap_get_bp(
5821 xfs_btree_cur_t *cur,
5822 xfs_fsblock_t bno)
5823{
5824 int i;
5825 xfs_buf_t *bp;
5826
5827 if (!cur)
5828		return NULL;
5829
5830 bp = NULL;
5831	for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
5832 bp = cur->bc_bufs[i];
5833 if (!bp) break;
5834 if (XFS_BUF_ADDR(bp) == bno)
5835 break; /* Found it */
5836 }
5837 if (i == XFS_BTREE_MAXLEVELS)
5838 bp = NULL;
5839
5840 if (!bp) { /* Chase down all the log items to see if the bp is there */
5841 xfs_log_item_chunk_t *licp;
5842 xfs_trans_t *tp;
5843
5844 tp = cur->bc_tp;
5845 licp = &tp->t_items;
5846 while (!bp && licp != NULL) {
5847 if (XFS_LIC_ARE_ALL_FREE(licp)) {
5848 licp = licp->lic_next;
5849 continue;
5850 }
5851 for (i = 0; i < licp->lic_unused; i++) {
5852 xfs_log_item_desc_t *lidp;
5853 xfs_log_item_t *lip;
5854 xfs_buf_log_item_t *bip;
5855 xfs_buf_t *lbp;
5856
5857 if (XFS_LIC_ISFREE(licp, i)) {
5858 continue;
5859 }
5860
5861 lidp = XFS_LIC_SLOT(licp, i);
5862 lip = lidp->lid_item;
5863 if (lip->li_type != XFS_LI_BUF)
5864 continue;
5865
5866 bip = (xfs_buf_log_item_t *)lip;
5867 lbp = bip->bli_buf;
5868
5869 if (XFS_BUF_ADDR(lbp) == bno) {
5870 bp = lbp;
5871 break; /* Found it */
5872 }
5873 }
5874 licp = licp->lic_next;
5875 }
5876 }
5877	return bp;
5878}
5879
5880void
5881xfs_check_block(
5882 xfs_bmbt_block_t *block,
5883 xfs_mount_t *mp,
5884 int root,
5885 short sz)
5886{
5887 int i, j, dmxr;
5888 xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */
5889 xfs_bmbt_key_t *prevp, *keyp;
5890
5891 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
5892
5893 prevp = NULL;
5894	for (i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
5895 dmxr = mp->m_bmap_dmxr[0];
5896
5897 if (root) {
5898 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
5899 } else {
5900 keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize,
5901 xfs_bmbt, block, i, dmxr);
5902 }
5903
5904 if (prevp) {
5905 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
5906 }
5907 prevp = keyp;
5908
5909 /*
5910 * Compare the block numbers to see if there are dups.
5911 */
5912
5913 if (root) {
5914 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
5915 } else {
5916 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
5917 xfs_bmbt, block, i, dmxr);
5918 }
5919 for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) {
5920 if (root) {
5921 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
5922 } else {
5923 thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
5924 xfs_bmbt, block, j, dmxr);
5925 }
5926 if (INT_GET(*thispa, ARCH_CONVERT) ==
5927 INT_GET(*pp, ARCH_CONVERT)) {
5928 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
5929 __FUNCTION__, j, i,
5930 INT_GET(*thispa, ARCH_CONVERT));
5931 panic("%s: ptrs are equal in node\n",
5932 __FUNCTION__);
5933 }
5934 }
5935 }
5936}
5937
5938/*
5939 * Check that the extents for the inode ip are in the right order in all
5940 * btree leaves.
5941 */
5942
5943STATIC void
5944xfs_bmap_check_leaf_extents(
5945 xfs_btree_cur_t *cur, /* btree cursor or null */
5946 xfs_inode_t *ip, /* incore inode pointer */
5947 int whichfork) /* data or attr fork */
5948{
5949 xfs_bmbt_block_t *block; /* current btree block */
5950 xfs_fsblock_t bno; /* block # of "block" */
5951 xfs_buf_t *bp; /* buffer for "block" */
5952 int error; /* error return value */
5953	xfs_extnum_t		i = 0;	/* index into the extents list */
5954 xfs_ifork_t *ifp; /* fork structure */
5955 int level; /* btree level, for checking */
5956 xfs_mount_t *mp; /* file system mount structure */
5957 xfs_bmbt_ptr_t *pp; /* pointer to block address */
5958 xfs_bmbt_rec_t *ep, *lastp; /* extent pointers in block entry */
5959 int bp_release = 0;
5960
5961 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
5962 return;
5963 }
5964
5965 bno = NULLFSBLOCK;
5966 mp = ip->i_mount;
5967 ifp = XFS_IFORK_PTR(ip, whichfork);
5968 block = ifp->if_broot;
5969 /*
5970	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
5971 */
5972 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
5973 level = INT_GET(block->bb_level, ARCH_CONVERT);
5974 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
5975 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
5976 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
5977 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
5978 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
5979 bno = INT_GET(*pp, ARCH_CONVERT);
5980 /*
5981 * Go down the tree until leaf level is reached, following the first
5982 * pointer (leftmost) at each level.
5983 */
5984 while (level-- > 0) {
5985 /* See if buf is in cur first */
5986 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
5987 if (bp) {
5988 bp_release = 0;
5989 } else {
5990 bp_release = 1;
5991 }
5992 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
5993 XFS_BMAP_BTREE_REF)))
5994 goto error_norelse;
5995 block = XFS_BUF_TO_BMBT_BLOCK(bp);
5996 XFS_WANT_CORRUPTED_GOTO(
5997 XFS_BMAP_SANITY_CHECK(mp, block, level),
5998 error0);
5999 if (level == 0)
6000 break;
6001
6002 /*
6003 * Check this block for basic sanity (increasing keys and
6004 * no duplicate blocks).
6005 */
6006
6007 xfs_check_block(block, mp, 0, 0);
6008 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
6009 1, mp->m_bmap_dmxr[1]);
6010 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
6011 bno = INT_GET(*pp, ARCH_CONVERT);
6012 if (bp_release) {
6013 bp_release = 0;
6014 xfs_trans_brelse(NULL, bp);
6015 }
6016 }
6017
6018 /*
6019 * Here with bp and block set to the leftmost leaf node in the tree.
6020 */
6021 i = 0;
6022
6023 /*
6024 * Loop over all leaf nodes checking that all extents are in the right order.
6025 */
6026 lastp = NULL;
6027 for (;;) {
6028 xfs_bmbt_rec_t *frp;
6029 xfs_fsblock_t nextbno;
6030 xfs_extnum_t num_recs;
6031
6032
6033 num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
6034
6035 /*
6036		 * Get the block number of the next leaf block, if any.
6037 */
6038
6039 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6040
6041 /*
6042 * Check all the extents to make sure they are OK.
6043 * If we had a previous block, the last entry should
6044 * conform with the first entry in this one.
6045 */
6046
6047 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
6048 block, 1, mp->m_bmap_dmxr[0]);
6049
6050		for (ep = frp; ep < frp + (num_recs - 1); ep++) {
6051 if (lastp) {
6052 xfs_btree_check_rec(XFS_BTNUM_BMAP,
6053 (void *)lastp, (void *)ep);
6054 }
6055 xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
6056 (void *)(ep + 1));
6057 }
6058 lastp = frp + num_recs - 1; /* For the next iteration */
6059
6060 i += num_recs;
6061 if (bp_release) {
6062 bp_release = 0;
6063 xfs_trans_brelse(NULL, bp);
6064 }
6065 bno = nextbno;
6066 /*
6067 * If we've reached the end, stop.
6068 */
6069 if (bno == NULLFSBLOCK)
6070 break;
6071
6072 bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
6073 if (bp) {
6074 bp_release = 0;
6075 } else {
6076 bp_release = 1;
6077 }
6078 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6079 XFS_BMAP_BTREE_REF)))
6080 goto error_norelse;
6081 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6082 }
6083 if (bp_release) {
6084 bp_release = 0;
6085 xfs_trans_brelse(NULL, bp);
6086 }
6087 return;
6088
6089error0:
6090 cmn_err(CE_WARN, "%s: at error0", __FUNCTION__);
6091 if (bp_release)
6092 xfs_trans_brelse(NULL, bp);
6093error_norelse:
6094 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
6095		__FUNCTION__, i);
6096 panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__);
6097 return;
6098}
6099#endif
6100
6101/*
6102 * Count fsblocks of the given fork.
6103 */
6104int /* error */
6105xfs_bmap_count_blocks(
6106 xfs_trans_t *tp, /* transaction pointer */
6107 xfs_inode_t *ip, /* incore inode */
6108 int whichfork, /* data or attr fork */
6109 int *count) /* out: count of blocks */
6110{
6111 xfs_bmbt_block_t *block; /* current btree block */
6112 xfs_fsblock_t bno; /* block # of "block" */
6113 xfs_ifork_t *ifp; /* fork structure */
6114 int level; /* btree level, for checking */
6115 xfs_mount_t *mp; /* file system mount structure */
6116 xfs_bmbt_ptr_t *pp; /* pointer to block address */
6117
6118 bno = NULLFSBLOCK;
6119 mp = ip->i_mount;
6120 ifp = XFS_IFORK_PTR(ip, whichfork);
6121 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
6122 if (unlikely(xfs_bmap_count_leaves(ifp->if_u1.if_extents,
6123 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
6124 count) < 0)) {
6125 XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
6126 XFS_ERRLEVEL_LOW, mp);
6127 return XFS_ERROR(EFSCORRUPTED);
6128 }
6129 return 0;
6130 }
6131
6132 /*
6133	 * Root level must use XFS_BMAP_BROOT_PTR_ADDR macro to get ptr out.
6134 */
6135 block = ifp->if_broot;
6136 ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
6137 level = INT_GET(block->bb_level, ARCH_CONVERT);
6138 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
6139 ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
6140 ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
6141 ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
6142 bno = INT_GET(*pp, ARCH_CONVERT);
6143
6144 if (unlikely(xfs_bmap_count_tree(mp, tp, bno, level, count) < 0)) {
6145 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
6146 mp);
6147 return XFS_ERROR(EFSCORRUPTED);
6148 }
6149
6150 return 0;
6151}
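
/*
 * Illustrative sketch (editorial addition): counting the fsblocks
 * behind the data fork with xfs_bmap_count_blocks(). Note the function
 * only adds to *count, so the caller zeroes it first; a null
 * transaction is fine for a pure read under the inode lock.
 */
STATIC int
example_count_data_blocks(
	xfs_inode_t	*ip,	/* locked incore inode */
	int		*count)	/* out: blocks behind the data fork */
{
	*count = 0;
	return xfs_bmap_count_blocks(NULL, ip, XFS_DATA_FORK, count);
}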
6152
6153/*
6154 * Recursively walks each level of a btree
6155 * to count total fsblocks in use.
6156 */
6157int /* error */
6158xfs_bmap_count_tree(
6159 xfs_mount_t *mp, /* file system mount point */
6160 xfs_trans_t *tp, /* transaction pointer */
6161 xfs_fsblock_t blockno, /* file system block number */
6162 int levelin, /* level in btree */
6163 int *count) /* Count of blocks */
6164{
6165 int error;
6166 xfs_buf_t *bp, *nbp;
6167 int level = levelin;
6168 xfs_bmbt_ptr_t *pp;
6169 xfs_fsblock_t bno = blockno;
6170 xfs_fsblock_t nextbno;
6171 xfs_bmbt_block_t *block, *nextblock;
6172 int numrecs;
6173 xfs_bmbt_rec_t *frp;
6174
6175 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6176 return error;
6177 *count += 1;
6178 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6179
6180 if (--level) {
6181		/* Not at the node level above the leaves; count this level of nodes */
6182 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6183 while (nextbno != NULLFSBLOCK) {
6184 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6185 0, &nbp, XFS_BMAP_BTREE_REF)))
6186 return error;
6187 *count += 1;
6188 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
6189 nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT);
6190 xfs_trans_brelse(tp, nbp);
6191 }
6192
6193 /* Dive to the next level */
6194 pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
6195 xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
6196 bno = INT_GET(*pp, ARCH_CONVERT);
6197 if (unlikely((error =
6198 xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0)) {
6199 xfs_trans_brelse(tp, bp);
6200 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
6201 XFS_ERRLEVEL_LOW, mp);
6202 return XFS_ERROR(EFSCORRUPTED);
6203 }
6204 xfs_trans_brelse(tp, bp);
6205 } else {
6206 /* count all level 1 nodes and their leaves */
6207 for (;;) {
6208 nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
6209 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
6210 frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
6211 xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
6212 if (unlikely(xfs_bmap_count_leaves(frp, numrecs, count) < 0)) {
6213 xfs_trans_brelse(tp, bp);
6214 XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
6215 XFS_ERRLEVEL_LOW, mp);
6216 return XFS_ERROR(EFSCORRUPTED);
6217 }
6218 xfs_trans_brelse(tp, bp);
6219 if (nextbno == NULLFSBLOCK)
6220 break;
6221 bno = nextbno;
6222 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
6223 XFS_BMAP_BTREE_REF)))
6224 return error;
6225 *count += 1;
6226 block = XFS_BUF_TO_BMBT_BLOCK(bp);
6227 }
6228 }
6229 return 0;
6230}
6231
6232/*
6233 * Count leaf blocks given a pointer to an extent list.
6234 */
6235int
6236xfs_bmap_count_leaves(
6237 xfs_bmbt_rec_t *frp,
6238 int numrecs,
6239 int *count)
6240{
6241 int b;
6242
6243 for ( b = 1; b <= numrecs; b++, frp++)
6244 *count += xfs_bmbt_disk_get_blockcount(frp);
6245 return 0;
6246}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
new file mode 100644
index 000000000000..f1bc22fb26ae
--- /dev/null
+++ b/fs/xfs/xfs_bmap.h
@@ -0,0 +1,379 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BMAP_H__
33#define __XFS_BMAP_H__
34
35struct getbmap;
36struct xfs_bmbt_irec;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40
41/*
42 * List of extents to be free "later".
43 * The list is kept sorted on xbf_startblock.
44 */
45typedef struct xfs_bmap_free_item
46{
47 xfs_fsblock_t xbfi_startblock;/* starting fs block number */
48 xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
49 struct xfs_bmap_free_item *xbfi_next; /* link to next entry */
50} xfs_bmap_free_item_t;
51
52/*
53 * Header for free extent list.
54 */
55typedef struct xfs_bmap_free
56{
57 xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
58 int xbf_count; /* count of items on list */
59 int xbf_low; /* kludge: alloc in low mode */
60} xfs_bmap_free_t;
61
62#define XFS_BMAP_MAX_NMAP 4
63
64/*
65 * Flags for xfs_bmapi
66 */
67#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */
68#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */
72#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */
73#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */
74#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */
75#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */
76#define	XFS_BMAPI_IGSTATE	0x200	/* Ignore extent state; */
77						/* combine contiguous space */
78#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */
79
80#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG)
81int xfs_bmapi_aflag(int w);
82#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w)
83#else
84#define XFS_BMAPI_AFLAG(w) ((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0)
85#endif
86
87/*
88 * Special values for xfs_bmbt_irec_t br_startblock field.
89 */
90#define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL)
91#define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL)
92
93#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT)
94void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp);
95#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp)
96#else
97#define XFS_BMAP_INIT(flp,fbp) \
98 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
99 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK)
100#endif
101
102/*
103 * Argument structure for xfs_bmap_alloc.
104 */
105typedef struct xfs_bmalloca {
106 xfs_fsblock_t firstblock; /* i/o first block allocated */
107 xfs_fsblock_t rval; /* starting block of new extent */
108 xfs_fileoff_t off; /* offset in file filling in */
109 struct xfs_trans *tp; /* transaction pointer */
110 struct xfs_inode *ip; /* incore inode pointer */
111 struct xfs_bmbt_irec *prevp; /* extent before the new one */
112 struct xfs_bmbt_irec *gotp; /* extent after, or delayed */
113 xfs_extlen_t alen; /* i/o length asked/allocated */
114 xfs_extlen_t total; /* total blocks needed for xaction */
115	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
116 xfs_extlen_t minleft; /* amount must be left after alloc */
117 char eof; /* set if allocating past last extent */
118 char wasdel; /* replacing a delayed allocation */
119 char userdata;/* set if is user data */
120 char low; /* low on space, using seq'l ags */
121 char aeof; /* allocated space at eof */
122} xfs_bmalloca_t;
123
124#ifdef __KERNEL__
125
126#if defined(XFS_BMAP_TRACE)
127/*
128 * Trace operations for bmap extent tracing
129 */
130#define XFS_BMAP_KTRACE_DELETE 1
131#define XFS_BMAP_KTRACE_INSERT 2
132#define XFS_BMAP_KTRACE_PRE_UP 3
133#define XFS_BMAP_KTRACE_POST_UP 4
134
135#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */
136#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */
137extern ktrace_t *xfs_bmap_trace_buf;
138
139/*
140 * Add bmap trace insert entries for all the contents of the extent list.
141 */
142void
143xfs_bmap_trace_exlist(
144 char *fname, /* function name */
145 struct xfs_inode *ip, /* incore inode pointer */
146 xfs_extnum_t cnt, /* count of entries in list */
147 int whichfork); /* data or attr fork */
148#else
149#define xfs_bmap_trace_exlist(f,ip,c,w)
150#endif
151
152/*
153 * Convert inode from non-attributed to attributed.
154 * Must not be in a transaction, ip must not be locked.
155 */
156int /* error code */
157xfs_bmap_add_attrfork(
158 struct xfs_inode *ip, /* incore inode pointer */
159 int rsvd); /* flag for reserved block allocation */
160
161/*
162 * Add the extent to the list of extents to be free at transaction end.
163 * The list is maintained sorted (by block number).
164 */
165void
166xfs_bmap_add_free(
167 xfs_fsblock_t bno, /* fs block number of extent */
168 xfs_filblks_t len, /* length of extent */
169 xfs_bmap_free_t *flist, /* list of extents */
170 struct xfs_mount *mp); /* mount point structure */
171
172/*
173 * Routine to clean up the free list data structure when
174 * an error occurs during a transaction.
175 */
176void
177xfs_bmap_cancel(
178 xfs_bmap_free_t *flist); /* free list to clean up */
179
180/*
181 * Compute and fill in the value of the maximum depth of a bmap btree
182 * in this filesystem. Done once, during mount.
183 */
184void
185xfs_bmap_compute_maxlevels(
186 struct xfs_mount *mp, /* file system mount structure */
187 int whichfork); /* data or attr fork */
188
189/*
190 * Routine to be called at transaction's end by the caller of xfs_bmapi
191 * or xfs_bunmapi. Frees all the extents that need freeing, which must be done
192 * last due to locking considerations.
193 *
194 * Return 1 if the given transaction was committed and a new one allocated,
195 * and 0 otherwise.
196 */
197int /* error */
198xfs_bmap_finish(
199 struct xfs_trans **tp, /* transaction pointer addr */
200 xfs_bmap_free_t *flist, /* i/o: list extents to free */
201 xfs_fsblock_t firstblock, /* controlled a.g. for allocs */
202 int *committed); /* xact committed or not */
203
204/*
205 * Returns the file-relative block number of the first unused block in the file.
206 * This is the lowest-address hole if the file has holes, else the first block
207 * past the end of file.
208 */
209int /* error */
210xfs_bmap_first_unused(
211 struct xfs_trans *tp, /* transaction pointer */
212 struct xfs_inode *ip, /* incore inode */
213 xfs_extlen_t len, /* size of hole to find */
214 xfs_fileoff_t *unused, /* unused block num */
215 int whichfork); /* data or attr fork */
216
217/*
218 * Returns the file-relative block number of the last block + 1 before
219 * last_block (input value) in the file.
220 * This is not based on i_size, it is based on the extent list.
221 * Returns 0 for local files, as they do not have an extent list.
222 */
223int /* error */
224xfs_bmap_last_before(
225 struct xfs_trans *tp, /* transaction pointer */
226 struct xfs_inode *ip, /* incore inode */
227 xfs_fileoff_t *last_block, /* last block */
228 int whichfork); /* data or attr fork */
229
230/*
231 * Returns the file-relative block number of the first block past eof in
232 * the file. This is not based on i_size, it is based on the extent list.
233 * Returns 0 for local files, as they do not have an extent list.
234 */
235int /* error */
236xfs_bmap_last_offset(
237 struct xfs_trans *tp, /* transaction pointer */
238 struct xfs_inode *ip, /* incore inode */
239 xfs_fileoff_t *unused, /* last block num */
240 int whichfork); /* data or attr fork */
241
242/*
243 * Returns whether the selected fork of the inode has exactly one
244 * block or not. For the data fork we check this matches di_size,
245 * implying the file's range is 0..bsize-1.
246 */
247int
248xfs_bmap_one_block(
249 struct xfs_inode *ip, /* incore inode */
250 int whichfork); /* data or attr fork */
251
252/*
253 * Read in the extents to iu_extents.
254 * All inode fields are set up by caller, we just traverse the btree
255 * and copy the records in.
256 */
257int /* error */
258xfs_bmap_read_extents(
259 struct xfs_trans *tp, /* transaction pointer */
260 struct xfs_inode *ip, /* incore inode */
261 int whichfork); /* data or attr fork */
262
263/*
264 * Map file blocks to filesystem blocks.
265 * File range is given by the bno/len pair.
266 * Adds blocks to the file if this is a write ("flags & XFS_BMAPI_WRITE"
267 * set) into a hole or past EOF.
268 * Only allocates blocks from a single allocation group,
269 * to avoid locking problems.
270 * The returned value in "firstblock" from the first call in a transaction
271 * must be remembered and presented to subsequent calls in "firstblock".
272 * An upper bound for the number of blocks to be allocated is supplied to
273 * the first call in "total"; if no allocation group has that many free
274 * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
275 */
276int /* error */
277xfs_bmapi(
278 struct xfs_trans *tp, /* transaction pointer */
279 struct xfs_inode *ip, /* incore inode */
280 xfs_fileoff_t bno, /* starting file offs. mapped */
281 xfs_filblks_t len, /* length to map in file */
282 int flags, /* XFS_BMAPI_... */
283 xfs_fsblock_t *firstblock, /* first allocated block
284 controls a.g. for allocs */
285 xfs_extlen_t total, /* total blocks needed */
286 struct xfs_bmbt_irec *mval, /* output: map values */
287 int *nmap, /* i/o: mval size/count */
288 xfs_bmap_free_t *flist); /* i/o: list extents to free */
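
/*
 * Editor's example (not part of the original header): a read-only
 * mapping lookup. Read callers conventionally pass a null transaction,
 * null firstblock and null free list; ip, offset_fsb and len_fsb are
 * placeholders.
 */
#if 0	/* illustrative sketch only, not compiled */
	xfs_bmbt_irec_t	map[2];		/* output mappings */
	int		nmap = 2;	/* in: size of map[]; out: used */

	error = xfs_bmapi(NULL, ip, offset_fsb, len_fsb, 0,
			NULL, 0, map, &nmap, NULL);
	/* on success, map[0 .. nmap-1] describe the file range */
#endif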
289
290/*
291 * Map file blocks to filesystem blocks, simple version.
292 * One block only, read-only.
293 * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
294 * For the other flag values, the effect is as if XFS_BMAPI_METADATA
295 * was set and all the others were clear.
296 */
297int /* error */
298xfs_bmapi_single(
299 struct xfs_trans *tp, /* transaction pointer */
300 struct xfs_inode *ip, /* incore inode */
301 int whichfork, /* data or attr fork */
302 xfs_fsblock_t *fsb, /* output: mapped block */
303 xfs_fileoff_t bno); /* starting file offs. mapped */
304
305/*
306 * Unmap (remove) blocks from a file.
307 * If nexts is nonzero then the number of extents to remove is limited to
308 * that value. If not all extents in the block range can be removed then
309 * *done is set.
310 */
311int /* error */
312xfs_bunmapi(
313 struct xfs_trans *tp, /* transaction pointer */
314 struct xfs_inode *ip, /* incore inode */
315 xfs_fileoff_t bno, /* starting offset to unmap */
316 xfs_filblks_t len, /* length to unmap in file */
317 int flags, /* XFS_BMAPI_... */
318 xfs_extnum_t nexts, /* number of extents max */
319 xfs_fsblock_t *firstblock, /* first allocated block
320 controls a.g. for allocs */
321 xfs_bmap_free_t *flist, /* i/o: list extents to free */
322 int *done); /* set if not done yet */
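
/*
 * Editor's example (not part of the original header): unmapping a file
 * range a few extents at a time, modelled on the truncate path.
 * XFS_ITRUNC_MAX_EXTENTS is from xfs_inode.h; tp, ip, first_unmap_block,
 * unmap_len, firstfsb, free_list and committed are placeholders.
 */
#if 0	/* illustrative sketch only, not compiled */
	int	done = 0;

	while (!done) {
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len,
				0, XFS_ITRUNC_MAX_EXTENTS, &firstfsb,
				&free_list, &done);
		if (error)
			break;	/* real callers xfs_bmap_cancel() here */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb,
				&committed);
		if (error)
			break;
	}
#endif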
323
324/*
325 * Fcntl interface to xfs_bmapi.
326 */
327int /* error code */
328xfs_getbmap(
329 bhv_desc_t *bdp, /* XFS behavior descriptor */
330 struct getbmap *bmv, /* user bmap structure */
331 void __user *ap, /* pointer to user's array */
332 int iflags); /* interface flags */
333
334/*
335 * Check the last inode extent to determine whether this allocation will result
336 * in blocks being allocated at the end of the file. When we allocate new data
337 * blocks at the end of the file which do not start at the previous data block,
338 * we will try to align the new blocks at stripe unit boundaries.
339 */
340int
341xfs_bmap_isaeof(
342 struct xfs_inode *ip,
343 xfs_fileoff_t off,
344 int whichfork,
345 char *aeof);
346
347/*
348 * Check if the endoff is outside the last extent. If so, the caller will
349 * grow the allocation to a stripe unit boundary.
350 */
351int
352xfs_bmap_eof(
353 struct xfs_inode *ip,
354 xfs_fileoff_t endoff,
355 int whichfork,
356 int *eof);
357
358/*
359 * Count fsblocks of the given fork.
360 */
361int
362xfs_bmap_count_blocks(
363 xfs_trans_t *tp,
364 struct xfs_inode *ip,
365 int whichfork,
366 int *count);
367
368/*
369 * Check an extent list, which has just been read, for
370 * any bit in the extent flag field.
371 */
372int
373xfs_check_nostate_extents(
374 xfs_bmbt_rec_t *ep,
375 xfs_extnum_t num);
376
377#endif /* __KERNEL__ */
378
379#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..163305a79fcc
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -0,0 +1,2807 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_itable.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_alloc.h"
59#include "xfs_bit.h"
60#include "xfs_bmap.h"
61#include "xfs_error.h"
62#include "xfs_quota.h"
63
64#if defined(XFS_BMBT_TRACE)
65ktrace_t *xfs_bmbt_trace_buf;
66#endif
67
68/*
69 * Prototypes for internal btree functions.
70 */
71
72
73STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
74STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
75STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
76STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
77STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
78STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
79 xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
80STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
81
82
83#if defined(XFS_BMBT_TRACE)
84
85static char ARGS[] = "args";
86static char ENTRY[] = "entry";
87static char ERROR[] = "error";
88#undef EXIT
89static char EXIT[] = "exit";
90
91/*
92 * Add a trace buffer entry for the arguments given to the routine,
93 * generic form.
94 */
95STATIC void
96xfs_bmbt_trace_enter(
97 char *func,
98 xfs_btree_cur_t *cur,
99 char *s,
100 int type,
101 int line,
102 __psunsigned_t a0,
103 __psunsigned_t a1,
104 __psunsigned_t a2,
105 __psunsigned_t a3,
106 __psunsigned_t a4,
107 __psunsigned_t a5,
108 __psunsigned_t a6,
109 __psunsigned_t a7,
110 __psunsigned_t a8,
111 __psunsigned_t a9,
112 __psunsigned_t a10)
113{
114 xfs_inode_t *ip;
115 int whichfork;
116
117 ip = cur->bc_private.b.ip;
118 whichfork = cur->bc_private.b.whichfork;
119 ktrace_enter(xfs_bmbt_trace_buf,
120 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
121 (void *)func, (void *)s, (void *)ip, (void *)cur,
122 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
123 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
124 (void *)a8, (void *)a9, (void *)a10);
125 ASSERT(ip->i_btrace);
126 ktrace_enter(ip->i_btrace,
127 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
128 (void *)func, (void *)s, (void *)ip, (void *)cur,
129 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
130 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
131 (void *)a8, (void *)a9, (void *)a10);
132}
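
/*
 * Editor's note: the first slot of each ktrace entry above packs three
 * small values into a single word. A standalone model of that
 * encode/decode convention (plain C, no kernel types; names are
 * illustrative only):
 */
#if 0	/* illustrative sketch only, not compiled */
static unsigned long
trace_word_pack(int type, int whichfork, int line)
{
	/* type in bits 0-7, fork in bits 8-15, line number in bits 16+ */
	return (unsigned long)type | ((unsigned long)whichfork << 8) |
	       ((unsigned long)line << 16);
}

static void
trace_word_unpack(unsigned long w, int *type, int *whichfork, int *line)
{
	*type = (int)(w & 0xff);
	*whichfork = (int)((w >> 8) & 0xff);
	*line = (int)(w >> 16);
}
#endif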
133/*
134 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
135 */
136STATIC void
137xfs_bmbt_trace_argbi(
138 char *func,
139 xfs_btree_cur_t *cur,
140 xfs_buf_t *b,
141 int i,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
145 (__psunsigned_t)b, i, 0, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
152 */
153STATIC void
154xfs_bmbt_trace_argbii(
155 char *func,
156 xfs_btree_cur_t *cur,
157 xfs_buf_t *b,
158 int i0,
159 int i1,
160 int line)
161{
162 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
163 (__psunsigned_t)b, i0, i1, 0,
164 0, 0, 0, 0,
165 0, 0, 0);
166}
167
168/*
169 * Add a trace buffer entry for arguments, for 3 block-length args
170 * and an integer arg.
171 */
172STATIC void
173xfs_bmbt_trace_argfffi(
174 char *func,
175 xfs_btree_cur_t *cur,
176 xfs_dfiloff_t o,
177 xfs_dfsbno_t b,
178 xfs_dfilblks_t i,
179 int j,
180 int line)
181{
182 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
183 o >> 32, (int)o, b >> 32, (int)b,
184 i >> 32, (int)i, (int)j, 0,
185 0, 0, 0);
186}
187
188/*
189 * Add a trace buffer entry for arguments, for one integer arg.
190 */
191STATIC void
192xfs_bmbt_trace_argi(
193 char *func,
194 xfs_btree_cur_t *cur,
195 int i,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
199 i, 0, 0, 0,
200 0, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, key.
206 */
207STATIC void
208xfs_bmbt_trace_argifk(
209 char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_key_t *k,
214 int line)
215{
216 xfs_dfsbno_t d;
217 xfs_dfiloff_t o;
218
219 d = (xfs_dfsbno_t)f;
220 o = INT_GET(k->br_startoff, ARCH_CONVERT);
221 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
222 i, d >> 32, (int)d, o >> 32,
223 (int)o, 0, 0, 0,
224 0, 0, 0);
225}
226
227/*
228 * Add a trace buffer entry for arguments, for int, fsblock, rec.
229 */
230STATIC void
231xfs_bmbt_trace_argifr(
232 char *func,
233 xfs_btree_cur_t *cur,
234 int i,
235 xfs_fsblock_t f,
236 xfs_bmbt_rec_t *r,
237 int line)
238{
239 xfs_dfsbno_t b;
240 xfs_dfilblks_t c;
241 xfs_dfsbno_t d;
242 xfs_dfiloff_t o;
243 xfs_bmbt_irec_t s;
244
245 d = (xfs_dfsbno_t)f;
246 xfs_bmbt_disk_get_all(r, &s);
247 o = (xfs_dfiloff_t)s.br_startoff;
248 b = (xfs_dfsbno_t)s.br_startblock;
249 c = s.br_blockcount;
250 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
251 i, d >> 32, (int)d, o >> 32,
252 (int)o, b >> 32, (int)b, c >> 32,
253 (int)c, 0, 0);
254}
255
256/*
257 * Add a trace buffer entry for arguments, for int, key.
258 */
259STATIC void
260xfs_bmbt_trace_argik(
261 char *func,
262 xfs_btree_cur_t *cur,
263 int i,
264 xfs_bmbt_key_t *k,
265 int line)
266{
267 xfs_dfiloff_t o;
268
269 o = INT_GET(k->br_startoff, ARCH_CONVERT);
270 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIK, line,
271 i, o >> 32, (int)o, 0,
272 0, 0, 0, 0,
273 0, 0, 0);
274}
275
276/*
277 * Add a trace buffer entry for the cursor/operation.
278 */
279STATIC void
280xfs_bmbt_trace_cursor(
281 char *func,
282 xfs_btree_cur_t *cur,
283 char *s,
284 int line)
285{
286 xfs_bmbt_rec_t r;
287
288 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
289 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
290 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
291 cur->bc_private.b.allocated,
292 INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
293 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
294 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
295 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
296 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
297}
298
299#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
300 xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
301#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
302 xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
303#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
304 xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
305#define XFS_BMBT_TRACE_ARGI(c,i) \
306 xfs_bmbt_trace_argi(fname, c, i, __LINE__)
307#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \
308 xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
309#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
310 xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
311#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
312 xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
313#define XFS_BMBT_TRACE_CURSOR(c,s) \
314 xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
315#else
316#define XFS_BMBT_TRACE_ARGBI(c,b,i)
317#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
318#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
319#define XFS_BMBT_TRACE_ARGI(c,i)
320#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
321#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
322#define XFS_BMBT_TRACE_ARGIK(c,i,k)
323#define XFS_BMBT_TRACE_CURSOR(c,s)
324#endif /* XFS_BMBT_TRACE */
325
326
327/*
328 * Internal functions.
329 */
330
331/*
332 * Delete record pointed to by cur/level.
333 */
334STATIC int /* error */
335xfs_bmbt_delrec(
336 xfs_btree_cur_t *cur,
337 int level,
338 int *stat) /* success/failure */
339{
340 xfs_bmbt_block_t *block; /* bmap btree block */
341 xfs_fsblock_t bno; /* fs-relative block number */
342 xfs_buf_t *bp; /* buffer for block */
343 int error; /* error return value */
344#ifdef XFS_BMBT_TRACE
345 static char fname[] = "xfs_bmbt_delrec";
346#endif
347 int i; /* loop counter */
348 int j; /* temp state */
349 xfs_bmbt_key_t key; /* bmap btree key */
350 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
351 xfs_fsblock_t lbno; /* left sibling block number */
352 xfs_buf_t *lbp; /* left buffer pointer */
353 xfs_bmbt_block_t *left; /* left btree block */
354 xfs_bmbt_key_t *lkp; /* left btree key */
355 xfs_bmbt_ptr_t *lpp; /* left address pointer */
356 int lrecs=0; /* left record count */
357 xfs_bmbt_rec_t *lrp; /* left record pointer */
358 xfs_mount_t *mp; /* file system mount point */
359 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
360 int ptr; /* key/record index */
361 xfs_fsblock_t rbno; /* right sibling block number */
362 xfs_buf_t *rbp; /* right buffer pointer */
363 xfs_bmbt_block_t *right; /* right btree block */
364 xfs_bmbt_key_t *rkp; /* right btree key */
365 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
366 xfs_bmbt_ptr_t *rpp; /* right address pointer */
367 xfs_bmbt_block_t *rrblock; /* right-right btree block */
368 xfs_buf_t *rrbp; /* right-right buffer pointer */
369 int rrecs=0; /* right record count */
370 xfs_bmbt_rec_t *rrp; /* right record pointer */
371 xfs_btree_cur_t *tcur; /* temporary btree cursor */
372 int numrecs; /* temporary numrec count */
373 int numlrecs, numrrecs;
374
375 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
376 XFS_BMBT_TRACE_ARGI(cur, level);
377 ptr = cur->bc_ptrs[level];
378 tcur = (xfs_btree_cur_t *)0;
379 if (ptr == 0) {
380 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
381 *stat = 0;
382 return 0;
383 }
384 block = xfs_bmbt_get_block(cur, level, &bp);
385 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
386#ifdef DEBUG
387 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
388 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
389 goto error0;
390 }
391#endif
392 if (ptr > numrecs) {
393 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
394 *stat = 0;
395 return 0;
396 }
397 XFS_STATS_INC(xs_bmbt_delrec);
398 if (level > 0) {
399 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
400 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
401#ifdef DEBUG
402 for (i = ptr; i < numrecs; i++) {
403 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
404 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
405 goto error0;
406 }
407 }
408#endif
409 if (ptr < numrecs) {
410 memmove(&kp[ptr - 1], &kp[ptr],
411 (numrecs - ptr) * sizeof(*kp));
412 memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */
413 (numrecs - ptr) * sizeof(*pp));
414 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
415 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
416 }
417 } else {
418 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
419 if (ptr < numrecs) {
420 memmove(&rp[ptr - 1], &rp[ptr],
421 (numrecs - ptr) * sizeof(*rp));
422 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
423 }
424 if (ptr == 1) {
425 INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp));
426 kp = &key;
427 }
428 }
429 numrecs--;
430 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
431 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
432 /*
433 * If we're at the root level:
434 * First, shrink the root block in-memory.
435 * Then try to get rid of the next level down.
436 * If we can't, then there's nothing left to do.
437 */
438 if (level == cur->bc_nlevels - 1) {
439 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
440 cur->bc_private.b.whichfork);
441 if ((error = xfs_bmbt_killroot(cur))) {
442 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
443 goto error0;
444 }
445 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
446 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
447 goto error0;
448 }
449 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
450 *stat = 1;
451 return 0;
452 }
453 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
454 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
455 goto error0;
456 }
457 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
458 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
459 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
460 goto error0;
461 }
462 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
463 *stat = 1;
464 return 0;
465 }
466 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
467 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
468 /*
469 * The root has only one child; we need a chance to copy its contents
470 * into the root and then delete the child. We can't go up to the next
471 * level; there's nothing to delete there.
472 */
473 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
474 level == cur->bc_nlevels - 2) {
475 if ((error = xfs_bmbt_killroot(cur))) {
476 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
477 goto error0;
478 }
479 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
480 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
481 goto error0;
482 }
483 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
484 *stat = 1;
485 return 0;
486 }
487 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
488 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
489 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
490 goto error0;
491 }
492 bno = NULLFSBLOCK;
493 if (rbno != NULLFSBLOCK) {
494 i = xfs_btree_lastrec(tcur, level);
495 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
496 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
497 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
498 goto error0;
499 }
500 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
501 i = xfs_btree_lastrec(tcur, level);
502 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
503 rbp = tcur->bc_bufs[level];
504 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
505#ifdef DEBUG
506 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
507 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
508 goto error0;
509 }
510#endif
511 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
512 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
513 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
514 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
515 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
516 goto error0;
517 }
518 if (i) {
519 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
520 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
521 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
522 tcur = NULL;
523 if (level > 0) {
524 if ((error = xfs_bmbt_decrement(cur,
525 level, &i))) {
526 XFS_BMBT_TRACE_CURSOR(cur,
527 ERROR);
528 goto error0;
529 }
530 }
531 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
532 *stat = 1;
533 return 0;
534 }
535 }
536 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
537 if (lbno != NULLFSBLOCK) {
538 i = xfs_btree_firstrec(tcur, level);
539 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
540 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
541 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
542 goto error0;
543 }
544 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
545 }
546 }
547 if (lbno != NULLFSBLOCK) {
548 i = xfs_btree_firstrec(tcur, level);
549 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
550 /*
551 * decrement to the last record in the left sibling block
552 */
553 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
554 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
555 goto error0;
556 }
557 i = xfs_btree_firstrec(tcur, level);
558 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
559 lbp = tcur->bc_bufs[level];
560 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
561#ifdef DEBUG
562 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
563 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
564 goto error0;
565 }
566#endif
567 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
568 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
569 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
570 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
571 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
572 goto error0;
573 }
574 if (i) {
575 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
576 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
577 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
578 tcur = NULL;
579 if (level == 0)
580 cur->bc_ptrs[0]++;
581 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
582 *stat = 1;
583 return 0;
584 }
585 }
586 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
587 }
588 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
589 tcur = NULL;
590 mp = cur->bc_mp;
591 ASSERT(bno != NULLFSBLOCK);
592 if (lbno != NULLFSBLOCK &&
593 lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
594 rbno = bno;
595 right = block;
596 rbp = bp;
597 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
598 XFS_BMAP_BTREE_REF))) {
599 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
600 goto error0;
601 }
602 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
603 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
604 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
605 goto error0;
606 }
607 } else if (rbno != NULLFSBLOCK &&
608 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
609 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
610 lbno = bno;
611 left = block;
612 lbp = bp;
613 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
614 XFS_BMAP_BTREE_REF))) {
615 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
616 goto error0;
617 }
618 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
619 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
620 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
621 goto error0;
622 }
623 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
624 } else {
625 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
626 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
627 goto error0;
628 }
629 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
630 *stat = 1;
631 return 0;
632 }
633 numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
634 numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
635 if (level > 0) {
636 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
637 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
638 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
639 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
640#ifdef DEBUG
641 for (i = 0; i < numrrecs; i++) {
642 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
643 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
644 goto error0;
645 }
646 }
647#endif
648 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
649 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
650 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
651 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
652 } else {
653 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
654 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
655 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
656 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
657 }
658 INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs);
659 left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
660 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
661 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
662 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
663 INT_GET(left->bb_rightsib, ARCH_CONVERT),
664 0, &rrbp, XFS_BMAP_BTREE_REF))) {
665 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
666 goto error0;
667 }
668 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
669 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
670 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
671 goto error0;
672 }
673 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
674 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
675 }
676 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
677 cur->bc_private.b.flist, mp);
678 cur->bc_private.b.ip->i_d.di_nblocks--;
679 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
680 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
681 XFS_TRANS_DQ_BCOUNT, -1L);
682 xfs_trans_binval(cur->bc_tp, rbp);
683 if (bp != lbp) {
684 cur->bc_bufs[level] = lbp;
685 cur->bc_ptrs[level] += lrecs;
686 cur->bc_ra[level] = 0;
687 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
688 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
689 goto error0;
690 }
691 if (level > 0)
692 cur->bc_ptrs[level]--;
693 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
694 *stat = 2;
695 return 0;
696
697error0:
698 if (tcur)
699 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
700 return error;
701}
702
703#ifdef DEBUG
704/*
705 * Get the data from the pointed-to record.
706 */
707int
708xfs_bmbt_get_rec(
709 xfs_btree_cur_t *cur,
710 xfs_fileoff_t *off,
711 xfs_fsblock_t *bno,
712 xfs_filblks_t *len,
713 xfs_exntst_t *state,
714 int *stat)
715{
716 xfs_bmbt_block_t *block;
717 xfs_buf_t *bp;
718#ifdef DEBUG
719 int error;
720#endif
721 int ptr;
722 xfs_bmbt_rec_t *rp;
723
724 block = xfs_bmbt_get_block(cur, 0, &bp);
725 ptr = cur->bc_ptrs[0];
726#ifdef DEBUG
727 if ((error = xfs_btree_check_lblock(cur, block, 0, bp)))
728 return error;
729#endif
730 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
731 *stat = 0;
732 return 0;
733 }
734 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
735 *off = xfs_bmbt_disk_get_startoff(rp);
736 *bno = xfs_bmbt_disk_get_startblock(rp);
737 *len = xfs_bmbt_disk_get_blockcount(rp);
738 *state = xfs_bmbt_disk_get_state(rp);
739 *stat = 1;
740 return 0;
741}
742#endif
743
744/*
745 * Insert one record/level. Return information to the caller
746 * allowing the next level up to proceed if necessary.
747 */
748STATIC int /* error */
749xfs_bmbt_insrec(
750 xfs_btree_cur_t *cur,
751 int level,
752 xfs_fsblock_t *bnop,
753 xfs_bmbt_rec_t *recp,
754 xfs_btree_cur_t **curp,
755 int *stat) /* no-go/done/continue */
756{
757 xfs_bmbt_block_t *block; /* bmap btree block */
758 xfs_buf_t *bp; /* buffer for block */
759 int error; /* error return value */
760#ifdef XFS_BMBT_TRACE
761 static char fname[] = "xfs_bmbt_insrec";
762#endif
763 int i; /* loop index */
764 xfs_bmbt_key_t key; /* bmap btree key */
765 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
766 int logflags; /* inode logging flags */
767 xfs_fsblock_t nbno; /* new block number */
768 struct xfs_btree_cur *ncur; /* new btree cursor */
769 xfs_bmbt_key_t nkey; /* new btree key value */
770 xfs_bmbt_rec_t nrec; /* new record */
771 int optr; /* old key/record index */
772 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
773 int ptr; /* key/record index */
774 xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
775 int numrecs;
776
777 ASSERT(level < cur->bc_nlevels);
778 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
779 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
780 ncur = (xfs_btree_cur_t *)0;
781 INT_SET(key.br_startoff, ARCH_CONVERT,
782 xfs_bmbt_disk_get_startoff(recp));
783 optr = ptr = cur->bc_ptrs[level];
784 if (ptr == 0) {
785 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
786 *stat = 0;
787 return 0;
788 }
789 XFS_STATS_INC(xs_bmbt_insrec);
790 block = xfs_bmbt_get_block(cur, level, &bp);
791 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
792#ifdef DEBUG
793 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
794 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
795 return error;
796 }
797 if (ptr <= numrecs) {
798 if (level == 0) {
799 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
800 xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
801 } else {
802 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
803 xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
804 }
805 }
806#endif
807 nbno = NULLFSBLOCK;
808 if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
809 if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
810 /*
811 * A root block, that can be made bigger.
812 */
813 xfs_iroot_realloc(cur->bc_private.b.ip, 1,
814 cur->bc_private.b.whichfork);
815 block = xfs_bmbt_get_block(cur, level, &bp);
816 } else if (level == cur->bc_nlevels - 1) {
817 if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
818 *stat == 0) {
819 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
820 return error;
821 }
822 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
823 logflags);
824 block = xfs_bmbt_get_block(cur, level, &bp);
825 } else {
826 if ((error = xfs_bmbt_rshift(cur, level, &i))) {
827 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
828 return error;
829 }
830 if (i) {
831 /* nothing */
832 } else {
833 if ((error = xfs_bmbt_lshift(cur, level, &i))) {
834 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
835 return error;
836 }
837 if (i) {
838 optr = ptr = cur->bc_ptrs[level];
839 } else {
840 if ((error = xfs_bmbt_split(cur, level,
841 &nbno, &nkey, &ncur,
842 &i))) {
843 XFS_BMBT_TRACE_CURSOR(cur,
844 ERROR);
845 return error;
846 }
847 if (i) {
848 block = xfs_bmbt_get_block(
849 cur, level, &bp);
850#ifdef DEBUG
851 if ((error =
852 xfs_btree_check_lblock(cur,
853 block, level, bp))) {
854 XFS_BMBT_TRACE_CURSOR(
855 cur, ERROR);
856 return error;
857 }
858#endif
859 ptr = cur->bc_ptrs[level];
860 xfs_bmbt_disk_set_allf(&nrec,
861 nkey.br_startoff, 0, 0,
862 XFS_EXT_NORM);
863 } else {
864 XFS_BMBT_TRACE_CURSOR(cur,
865 EXIT);
866 *stat = 0;
867 return 0;
868 }
869 }
870 }
871 }
872 }
873 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
874 if (level > 0) {
875 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
876 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
877#ifdef DEBUG
878 for (i = numrecs; i >= ptr; i--) {
879 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
880 level))) {
881 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
882 return error;
883 }
884 }
885#endif
886 memmove(&kp[ptr], &kp[ptr - 1],
887 (numrecs - ptr + 1) * sizeof(*kp));
888 memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
889 (numrecs - ptr + 1) * sizeof(*pp));
890#ifdef DEBUG
891 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
892 level))) {
893 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
894 return error;
895 }
896#endif
897 kp[ptr - 1] = key;
898 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
899 numrecs++;
900 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
901 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
902 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
903 } else {
904 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
905 memmove(&rp[ptr], &rp[ptr - 1],
906 (numrecs - ptr + 1) * sizeof(*rp));
907 rp[ptr - 1] = *recp;
908 numrecs++;
909 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
910 xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
911 }
912 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
913#ifdef DEBUG
914 if (ptr < numrecs) {
915 if (level == 0)
916 xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
917 rp + ptr);
918 else
919 xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
920 kp + ptr);
921 }
922#endif
923 if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
924 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
925 return error;
926 }
927 *bnop = nbno;
928 if (nbno != NULLFSBLOCK) {
929 *recp = nrec;
930 *curp = ncur;
931 }
932 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
933 *stat = 1;
934 return 0;
935}
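
/*
 * Editor's note: xfs_bmbt_insrec() reports a split back to its caller
 * through *bnop, *recp and *curp, and the caller feeds those into the
 * next level up. A simplified sketch of that driving loop; the real
 * version is xfs_bmbt_insert(), later in this file, with error and
 * corruption checks that are omitted here:
 */
#if 0	/* illustrative sketch only, not compiled */
	level = 0;
	nbno = NULLFSBLOCK;
	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
	ncur = NULL;
	pcur = cur;
	do {
		error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec,
				&ncur, &i);
		if (ncur) {	/* a split produced a new cursor */
			pcur = ncur;
			ncur = NULL;
		}
	} while (nbno != NULLFSBLOCK);	/* until nothing propagates up */
#endif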
936
937STATIC int
938xfs_bmbt_killroot(
939 xfs_btree_cur_t *cur)
940{
941 xfs_bmbt_block_t *block;
942 xfs_bmbt_block_t *cblock;
943 xfs_buf_t *cbp;
944 xfs_bmbt_key_t *ckp;
945 xfs_bmbt_ptr_t *cpp;
946#ifdef DEBUG
947 int error;
948#endif
949#ifdef XFS_BMBT_TRACE
950 static char fname[] = "xfs_bmbt_killroot";
951#endif
952 int i;
953 xfs_bmbt_key_t *kp;
954 xfs_inode_t *ip;
955 xfs_ifork_t *ifp;
956 int level;
957 xfs_bmbt_ptr_t *pp;
958
959 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
960 level = cur->bc_nlevels - 1;
961 ASSERT(level >= 1);
962 /*
963 * Don't deal with the case where the root block needs to be a leaf.
964 * We're just going to turn the thing back into extents anyway.
965 */
966 if (level == 1) {
967 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
968 return 0;
969 }
970 block = xfs_bmbt_get_block(cur, level, &cbp);
971 /*
972 * Give up if the root has multiple children.
973 */
974 if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) {
975 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
976 return 0;
977 }
978 /*
979 * Only do this if the next level down will fit.
980 * Then the child's data is copied up into the inode root,
981 * and it is the child block, not the root, that gets freed.
982 */
983 cbp = cur->bc_bufs[level - 1];
984 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
985 if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
986 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
987 return 0;
988 }
989 ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
990 ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
991 ip = cur->bc_private.b.ip;
992 ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
993 ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
994 XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
995 i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
996 if (i) {
997 xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
998 block = ifp->if_broot;
999 }
1000 INT_MOD(block->bb_numrecs, ARCH_CONVERT, i);
1001 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
1002 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
1003 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
1004 memcpy(kp, ckp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
1005 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
1006 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
1007#ifdef DEBUG
1008 for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
1009 if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
1010 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1011 return error;
1012 }
1013 }
1014#endif
1015 memcpy(pp, cpp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
1016 xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
1017 cur->bc_private.b.flist, cur->bc_mp);
1018 ip->i_d.di_nblocks--;
1019 XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
1020 XFS_TRANS_DQ_BCOUNT, -1L);
1021 xfs_trans_binval(cur->bc_tp, cbp);
1022 cur->bc_bufs[level - 1] = NULL;
1023 INT_MOD(block->bb_level, ARCH_CONVERT, -1);
1024 xfs_trans_log_inode(cur->bc_tp, ip,
1025 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 cur->bc_nlevels--;
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028 return 0;
1029}
1030
1031/*
1032 * Log key values from the btree block.
1033 */
1034STATIC void
1035xfs_bmbt_log_keys(
1036 xfs_btree_cur_t *cur,
1037 xfs_buf_t *bp,
1038 int kfirst,
1039 int klast)
1040{
1041#ifdef XFS_BMBT_TRACE
1042 static char fname[] = "xfs_bmbt_log_keys";
1043#endif
1044 xfs_trans_t *tp;
1045
1046 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1047 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
1048 tp = cur->bc_tp;
1049 if (bp) {
1050 xfs_bmbt_block_t *block;
1051 int first;
1052 xfs_bmbt_key_t *kp;
1053 int last;
1054
1055 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1056 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
1057 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
1058 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
1059 xfs_trans_log_buf(tp, bp, first, last);
1060 } else {
1061 xfs_inode_t *ip;
1062
1063 ip = cur->bc_private.b.ip;
1064 xfs_trans_log_inode(tp, ip,
1065 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1066 }
1067 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1068}
1069
1070/*
1071 * Log pointer values from the btree block.
1072 */
1073STATIC void
1074xfs_bmbt_log_ptrs(
1075 xfs_btree_cur_t *cur,
1076 xfs_buf_t *bp,
1077 int pfirst,
1078 int plast)
1079{
1080#ifdef XFS_BMBT_TRACE
1081 static char fname[] = "xfs_bmbt_log_ptrs";
1082#endif
1083 xfs_trans_t *tp;
1084
1085 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1086 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1087 tp = cur->bc_tp;
1088 if (bp) {
1089 xfs_bmbt_block_t *block;
1090 int first;
1091 int last;
1092 xfs_bmbt_ptr_t *pp;
1093
1094 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1095 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1096 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1097 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1098 xfs_trans_log_buf(tp, bp, first, last);
1099 } else {
1100 xfs_inode_t *ip;
1101
1102 ip = cur->bc_private.b.ip;
1103 xfs_trans_log_inode(tp, ip,
1104 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1105 }
1106 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1107}
1108
1109/*
1110 * Look up the record. The cursor is made to point to it, based on dir.
1111 */
1112STATIC int /* error */
1113xfs_bmbt_lookup(
1114 xfs_btree_cur_t *cur,
1115 xfs_lookup_t dir,
1116 int *stat) /* success/failure */
1117{
1118 xfs_bmbt_block_t *block=NULL;
1119 xfs_buf_t *bp;
1120 xfs_daddr_t d;
1121 xfs_sfiloff_t diff;
1122 int error; /* error return value */
1123#ifdef XFS_BMBT_TRACE
1124 static char fname[] = "xfs_bmbt_lookup";
1125#endif
1126 xfs_fsblock_t fsbno=0;
1127 int high;
1128 int i;
1129 int keyno=0;
1130 xfs_bmbt_key_t *kkbase=NULL;
1131 xfs_bmbt_key_t *kkp;
1132 xfs_bmbt_rec_t *krbase=NULL;
1133 xfs_bmbt_rec_t *krp;
1134 int level;
1135 int low;
1136 xfs_mount_t *mp;
1137 xfs_bmbt_ptr_t *pp;
1138 xfs_bmbt_irec_t *rp;
1139 xfs_fileoff_t startoff;
1140 xfs_trans_t *tp;
1141
1142 XFS_STATS_INC(xs_bmbt_lookup);
1143 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1144 XFS_BMBT_TRACE_ARGI(cur, (int)dir);
1145 tp = cur->bc_tp;
1146 mp = cur->bc_mp;
1147 rp = &cur->bc_rec.b;
1148 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1149 if (level < cur->bc_nlevels - 1) {
1150 d = XFS_FSB_TO_DADDR(mp, fsbno);
1151 bp = cur->bc_bufs[level];
1152 if (bp && XFS_BUF_ADDR(bp) != d)
1153 bp = (xfs_buf_t *)0;
1154 if (!bp) {
1155 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1156 0, &bp, XFS_BMAP_BTREE_REF))) {
1157 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1158 return error;
1159 }
1160 xfs_btree_setbuf(cur, level, bp);
1161 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1162 if ((error = xfs_btree_check_lblock(cur, block,
1163 level, bp))) {
1164 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1165 return error;
1166 }
1167 } else
1168 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1169 } else
1170 block = xfs_bmbt_get_block(cur, level, &bp);
1171 if (diff == 0)
1172 keyno = 1;
1173 else {
1174 if (level > 0)
1175 kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
1176 else
1177 krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
1178 low = 1;
1179 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
1180 ASSERT(level == 0);
1181 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1182 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1183 *stat = 0;
1184 return 0;
1185 }
1186 while (low <= high) {
1187 XFS_STATS_INC(xs_bmbt_compare);
1188 keyno = (low + high) >> 1;
1189 if (level > 0) {
1190 kkp = kkbase + keyno - 1;
1191 startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
1192 } else {
1193 krp = krbase + keyno - 1;
1194 startoff = xfs_bmbt_disk_get_startoff(krp);
1195 }
1196 diff = (xfs_sfiloff_t)
1197 (startoff - rp->br_startoff);
1198 if (diff < 0)
1199 low = keyno + 1;
1200 else if (diff > 0)
1201 high = keyno - 1;
1202 else
1203 break;
1204 }
1205 }
1206 if (level > 0) {
1207 if (diff > 0 && --keyno < 1)
1208 keyno = 1;
1209 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1210#ifdef DEBUG
1211 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
1212 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1213 return error;
1214 }
1215#endif
1216 fsbno = INT_GET(*pp, ARCH_CONVERT);
1217 cur->bc_ptrs[level] = keyno;
1218 }
1219 }
1220 if (dir != XFS_LOOKUP_LE && diff < 0) {
1221 keyno++;
1222 /*
1223 * If this is a GE search and we went off the end of the block, but it's
1224 * not the last block, we're in the wrong block.
1225 */
1226 if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1227 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
1228 cur->bc_ptrs[0] = keyno;
1229 if ((error = xfs_bmbt_increment(cur, 0, &i))) {
1230 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1231 return error;
1232 }
1233 XFS_WANT_CORRUPTED_RETURN(i == 1);
1234 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1235 *stat = 1;
1236 return 0;
1237 }
1238 }
1239 else if (dir == XFS_LOOKUP_LE && diff > 0)
1240 keyno--;
1241 cur->bc_ptrs[0] = keyno;
1242 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1243 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1244 *stat = 0;
1245 } else {
1246 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1247 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1248 }
1249 return 0;
1250}
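
/*
 * Editor's note: a minimal, self-contained model of the search
 * convention used above: binary search over 1-based entries, then an
 * LE-style adjustment of the final index. Plain C, no kernel types;
 * names are illustrative only.
 */
#if 0	/* illustrative sketch only, not compiled */
/* Return the 1-based index of the last entry in sorted a[1..n] <= want. */
static int
lookup_le(const long *a, int n, long want)
{
	int	low = 1, high = n, keyno = 0;
	long	diff = 1;

	if (n == 0)
		return 0;
	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = a[keyno] - want;
		if (diff < 0)
			low = keyno + 1;	/* too small, go right */
		else if (diff > 0)
			high = keyno - 1;	/* too big, go left */
		else
			break;			/* exact match */
	}
	if (diff > 0)
		keyno--;	/* a[keyno] was too big; step back one */
	return keyno;		/* 0 means no entry is <= want */
}
#endif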
1251
1252/*
1253 * Move 1 record left from cur/level if possible.
1254 * Update cur to reflect the new path.
1255 */
1256STATIC int /* error */
1257xfs_bmbt_lshift(
1258 xfs_btree_cur_t *cur,
1259 int level,
1260 int *stat) /* success/failure */
1261{
1262 int error; /* error return value */
1263#ifdef XFS_BMBT_TRACE
1264 static char fname[] = "xfs_bmbt_lshift";
1265#endif
1266#ifdef DEBUG
1267 int i; /* loop counter */
1268#endif
1269 xfs_bmbt_key_t key; /* bmap btree key */
1270 xfs_buf_t *lbp; /* left buffer pointer */
1271 xfs_bmbt_block_t *left; /* left btree block */
1272 xfs_bmbt_key_t *lkp=NULL; /* left btree key */
1273 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1274 int lrecs; /* left record count */
1275 xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
1276 xfs_mount_t *mp; /* file system mount point */
1277 xfs_buf_t *rbp; /* right buffer pointer */
1278 xfs_bmbt_block_t *right; /* right btree block */
1279 xfs_bmbt_key_t *rkp=NULL; /* right btree key */
1280 xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
1281 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1282 int rrecs; /* right record count */
1283
1284 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1285 XFS_BMBT_TRACE_ARGI(cur, level);
1286 if (level == cur->bc_nlevels - 1) {
1287 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1288 *stat = 0;
1289 return 0;
1290 }
1291 rbp = cur->bc_bufs[level];
1292 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1293#ifdef DEBUG
1294 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1295 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1296 return error;
1297 }
1298#endif
1299 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
1300 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1301 *stat = 0;
1302 return 0;
1303 }
1304 if (cur->bc_ptrs[level] <= 1) {
1305 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1306 *stat = 0;
1307 return 0;
1308 }
1309 mp = cur->bc_mp;
1310 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0,
1311 &lbp, XFS_BMAP_BTREE_REF))) {
1312 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1313 return error;
1314 }
1315 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1316 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1317 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1318 return error;
1319 }
1320 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1321 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1322 *stat = 0;
1323 return 0;
1324 }
1325 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1326 if (level > 0) {
1327 lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
1328 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1329 *lkp = *rkp;
1330 xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
1331 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
1332 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1333#ifdef DEBUG
1334 if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
1335 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1336 return error;
1337 }
1338#endif
1339 *lpp = *rpp; /* INT_: direct copy */
1340 xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
1341 } else {
1342 lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
1343 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1344 *lrp = *rrp;
1345 xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
1346 }
1347 INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
1348 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1349#ifdef DEBUG
1350 if (level > 0)
1351 xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
1352 else
1353 xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
1354#endif
1355 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1;
1356 INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs);
1357 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1358 if (level > 0) {
1359#ifdef DEBUG
1360 for (i = 0; i < rrecs; i++) {
1361 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1362 level))) {
1363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1364 return error;
1365 }
1366 }
1367#endif
1368 memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
1369 memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
1370 xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
1371 xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
1372 } else {
1373 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
1374 xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
1375 INT_SET(key.br_startoff, ARCH_CONVERT,
1376 xfs_bmbt_disk_get_startoff(rrp));
1377 rkp = &key;
1378 }
1379 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
1380 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1381 return error;
1382 }
1383 cur->bc_ptrs[level]--;
1384 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1385 *stat = 1;
1386 return 0;
1387}
1388
1389/*
1390 * Move 1 record right from cur/level if possible.
1391 * Update cur to reflect the new path.
1392 */
1393STATIC int /* error */
1394xfs_bmbt_rshift(
1395 xfs_btree_cur_t *cur,
1396 int level,
1397 int *stat) /* success/failure */
1398{
1399 int error; /* error return value */
1400#ifdef XFS_BMBT_TRACE
1401 static char fname[] = "xfs_bmbt_rshift";
1402#endif
1403 int i; /* loop counter */
1404 xfs_bmbt_key_t key; /* bmap btree key */
1405 xfs_buf_t *lbp; /* left buffer pointer */
1406 xfs_bmbt_block_t *left; /* left btree block */
1407 xfs_bmbt_key_t *lkp; /* left btree key */
1408 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1409 xfs_bmbt_rec_t *lrp; /* left record pointer */
1410 xfs_mount_t *mp; /* file system mount point */
1411 xfs_buf_t *rbp; /* right buffer pointer */
1412 xfs_bmbt_block_t *right; /* right btree block */
1413 xfs_bmbt_key_t *rkp; /* right btree key */
1414 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1415 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1416 struct xfs_btree_cur *tcur; /* temporary btree cursor */
1417
1418 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1419 XFS_BMBT_TRACE_ARGI(cur, level);
1420 if (level == cur->bc_nlevels - 1) {
1421 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1422 *stat = 0;
1423 return 0;
1424 }
1425 lbp = cur->bc_bufs[level];
1426 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1427#ifdef DEBUG
1428 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1429 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1430 return error;
1431 }
1432#endif
1433 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
1434 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1435 *stat = 0;
1436 return 0;
1437 }
1438 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1439 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1440 *stat = 0;
1441 return 0;
1442 }
1443 mp = cur->bc_mp;
1444 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
1445 &rbp, XFS_BMAP_BTREE_REF))) {
1446 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1447 return error;
1448 }
1449 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1450 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1451 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1452 return error;
1453 }
1454 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1455 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1456 *stat = 0;
1457 return 0;
1458 }
1459 if (level > 0) {
1460 lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1461 lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1462 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1463 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1464#ifdef DEBUG
1465 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1466 if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
1467 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1468 return error;
1469 }
1470 }
1471#endif
1472 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1473 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1474#ifdef DEBUG
1475 if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
1476 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1477 return error;
1478 }
1479#endif
1480 *rkp = *lkp;
1481 *rpp = *lpp; /* INT_: direct copy */
1482 xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1483 xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1484 } else {
1485 lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1486 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1487 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1488 *rrp = *lrp;
1489 xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1490 INT_SET(key.br_startoff, ARCH_CONVERT,
1491 xfs_bmbt_disk_get_startoff(rrp));
1492 rkp = &key;
1493 }
1494 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1495 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1496 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1497#ifdef DEBUG
1498 if (level > 0)
1499 xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
1500 else
1501 xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
1502#endif
1503 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1504 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
1505 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1506 return error;
1507 }
1508 i = xfs_btree_lastrec(tcur, level);
1509 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1510 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
1511 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1512 goto error1;
1513 }
1514 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1515 if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
1516 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1517 goto error1;
1518 }
1519 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1520 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1521 *stat = 1;
1522 return 0;
1523error0:
1524 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1525error1:
1526 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1527 return error;
1528}
1529
1530/*
1531 * Determine the extent state.
1532 */
1533/* ARGSUSED */
1534STATIC xfs_exntst_t
1535xfs_extent_state(
1536 xfs_filblks_t blks,
1537 int extent_flag)
1538{
1539 if (extent_flag) {
1540 ASSERT(blks != 0); /* saved for DMIG */
1541 return XFS_EXT_UNWRITTEN;
1542 }
1543 return XFS_EXT_NORM;
1544}
1545
1546
1547/*
1548 * Split cur/level block in half.
1549 * Return new block number and its first record (to be inserted into parent).
1550 */
1551STATIC int /* error */
1552xfs_bmbt_split(
1553 xfs_btree_cur_t *cur,
1554 int level,
1555 xfs_fsblock_t *bnop,
1556 xfs_bmbt_key_t *keyp,
1557 xfs_btree_cur_t **curp,
1558 int *stat) /* success/failure */
1559{
1560 xfs_alloc_arg_t args; /* block allocation args */
1561 int error; /* error return value */
1562#ifdef XFS_BMBT_TRACE
1563 static char fname[] = "xfs_bmbt_split";
1564#endif
1565 int i; /* loop counter */
1566 xfs_fsblock_t lbno; /* left sibling block number */
1567 xfs_buf_t *lbp; /* left buffer pointer */
1568 xfs_bmbt_block_t *left; /* left btree block */
1569 xfs_bmbt_key_t *lkp; /* left btree key */
1570 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1571 xfs_bmbt_rec_t *lrp; /* left record pointer */
1572 xfs_buf_t *rbp; /* right buffer pointer */
1573 xfs_bmbt_block_t *right; /* right btree block */
1574 xfs_bmbt_key_t *rkp; /* right btree key */
1575 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1576 xfs_bmbt_block_t *rrblock; /* right-right btree block */
1577 xfs_buf_t *rrbp; /* right-right buffer pointer */
1578 xfs_bmbt_rec_t *rrp; /* right record pointer */
1579
1580 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1581 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
1582 args.tp = cur->bc_tp;
1583 args.mp = cur->bc_mp;
1584 lbp = cur->bc_bufs[level];
1585 lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
1586 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1587 args.fsbno = cur->bc_private.b.firstblock;
1588 if (args.fsbno == NULLFSBLOCK) {
1589 args.fsbno = lbno;
1590 args.type = XFS_ALLOCTYPE_START_BNO;
1591 } else if (cur->bc_private.b.flist->xbf_low)
1592 args.type = XFS_ALLOCTYPE_FIRST_AG;
1593 else
1594 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1595 args.mod = args.minleft = args.alignment = args.total = args.isfl =
1596 args.userdata = args.minalignslop = 0;
1597 args.minlen = args.maxlen = args.prod = 1;
1598 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
1599 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
1600 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1601 return XFS_ERROR(ENOSPC);
1602 }
1603 if ((error = xfs_alloc_vextent(&args))) {
1604 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1605 return error;
1606 }
1607 if (args.fsbno == NULLFSBLOCK) {
1608 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1609 *stat = 0;
1610 return 0;
1611 }
1612 ASSERT(args.len == 1);
1613 cur->bc_private.b.firstblock = args.fsbno;
1614 cur->bc_private.b.allocated++;
1615 cur->bc_private.b.ip->i_d.di_nblocks++;
1616 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
1617 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
1618 XFS_TRANS_DQ_BCOUNT, 1L);
1619 rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
1620 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1621#ifdef DEBUG
1622 if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
1623 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1624 return error;
1625 }
1626#endif
1627 INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
1628 right->bb_level = left->bb_level; /* INT_: direct copy */
1629 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1630 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1631 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1632 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1633 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1634 if (level > 0) {
1635 lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
1636 lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
1637 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1638 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1639#ifdef DEBUG
1640 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1641 if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
1642 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1643 return error;
1644 }
1645 }
1646#endif
1647 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1648 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1649 xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1650 xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1651 keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
1652 } else {
1653 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1654 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1655 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1656 xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1657 keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp);
1658 }
1659 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1660 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1661 INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno);
1662 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1663 xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
1664 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1665 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
1666 if ((error = xfs_btree_read_bufl(args.mp, args.tp,
1667 INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
1668 XFS_BMAP_BTREE_REF))) {
1669 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1670 return error;
1671 }
1672 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
1673 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
1674 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1675 return error;
1676 }
1677 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno);
1678 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
1679 }
1680 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1681 xfs_btree_setbuf(cur, level, rbp);
1682 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1683 }
1684 if (level + 1 < cur->bc_nlevels) {
1685 if ((error = xfs_btree_dup_cursor(cur, curp))) {
1686 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1687 return error;
1688 }
1689 (*curp)->bc_ptrs[level + 1]++;
1690 }
1691 *bnop = args.fsbno;
1692 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1693 *stat = 1;
1694 return 0;
1695}
1696
1697
1698/*
1699 * Update keys for the record.
1700 */
1701STATIC int
1702xfs_bmbt_updkey(
1703 xfs_btree_cur_t *cur,
1704 xfs_bmbt_key_t *keyp, /* on-disk format */
1705 int level)
1706{
1707 xfs_bmbt_block_t *block;
1708 xfs_buf_t *bp;
1709#ifdef DEBUG
1710 int error;
1711#endif
1712#ifdef XFS_BMBT_TRACE
1713 static char fname[] = "xfs_bmbt_updkey";
1714#endif
1715 xfs_bmbt_key_t *kp;
1716 int ptr;
1717
1718 ASSERT(level >= 1);
1719 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1720 XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
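	/*
	 * A key needs rewriting only at levels where this record is the
	 * leftmost entry (ptr == 1); the climb stops at the first level
	 * where it is not.
	 */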
1721 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1722 block = xfs_bmbt_get_block(cur, level, &bp);
1723#ifdef DEBUG
1724 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1725 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1726 return error;
1727 }
1728#endif
1729 ptr = cur->bc_ptrs[level];
1730 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
1731 *kp = *keyp;
1732 xfs_bmbt_log_keys(cur, bp, ptr, ptr);
1733 }
1734 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1735 return 0;
1736}
1737
1738/*
1739 * Convert on-disk form of btree root to in-memory form.
1740 */
1741void
1742xfs_bmdr_to_bmbt(
1743 xfs_bmdr_block_t *dblock,
1744 int dblocklen,
1745 xfs_bmbt_block_t *rblock,
1746 int rblocklen)
1747{
1748 int dmxr;
1749 xfs_bmbt_key_t *fkp;
1750 xfs_bmbt_ptr_t *fpp;
1751 xfs_bmbt_key_t *tkp;
1752 xfs_bmbt_ptr_t *tpp;
1753
1754 INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
1755 rblock->bb_level = dblock->bb_level; /* both in on-disk format */
1756 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
1757 rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */
1758 INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
1759 INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
1760 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
1761 fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
1762 tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
1763 fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
1764 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
1765 dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
1766 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1767 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
1768}
1769
1770/*
1771 * Decrement cursor by one record at the level.
1772 * For nonzero levels the leaf-ward information is untouched.
1773 */
1774int /* error */
1775xfs_bmbt_decrement(
1776 xfs_btree_cur_t *cur,
1777 int level,
1778 int *stat) /* success/failure */
1779{
1780 xfs_bmbt_block_t *block;
1781 xfs_buf_t *bp;
1782 int error; /* error return value */
1783#ifdef XFS_BMBT_TRACE
1784 static char fname[] = "xfs_bmbt_decrement";
1785#endif
1786 xfs_fsblock_t fsbno;
1787 int lev;
1788 xfs_mount_t *mp;
1789 xfs_trans_t *tp;
1790
1791 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1792 XFS_BMBT_TRACE_ARGI(cur, level);
1793 ASSERT(level < cur->bc_nlevels);
1794 if (level < cur->bc_nlevels - 1)
1795 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1796 if (--cur->bc_ptrs[level] > 0) {
1797 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1798 *stat = 1;
1799 return 0;
1800 }
1801 block = xfs_bmbt_get_block(cur, level, &bp);
1802#ifdef DEBUG
1803 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1804 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1805 return error;
1806 }
1807#endif
1808 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
1809 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1810 *stat = 0;
1811 return 0;
1812 }
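	/*
	 * March up the tree until a level is found where the cursor can
	 * step left within its block; the loop further below then walks
	 * back down the rightmost path of that left-hand subtree.
	 */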
1813 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1814 if (--cur->bc_ptrs[lev] > 0)
1815 break;
1816 if (lev < cur->bc_nlevels - 1)
1817 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1818 }
1819 if (lev == cur->bc_nlevels) {
1820 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1821 *stat = 0;
1822 return 0;
1823 }
1824 tp = cur->bc_tp;
1825 mp = cur->bc_mp;
1826 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1827 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1828 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1829 XFS_BMAP_BTREE_REF))) {
1830 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1831 return error;
1832 }
1833 lev--;
1834 xfs_btree_setbuf(cur, lev, bp);
1835 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1836 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
1837 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1838 return error;
1839 }
1840 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1841 }
1842 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1843 *stat = 1;
1844 return 0;
1845}
1846
1847/*
1848 * Delete the record pointed to by cur.
1849 */
1850int /* error */
1851xfs_bmbt_delete(
1852 xfs_btree_cur_t *cur,
1853 int *stat) /* success/failure */
1854{
1855 int error; /* error return value */
1856#ifdef XFS_BMBT_TRACE
1857 static char fname[] = "xfs_bmbt_delete";
1858#endif
1859 int i;
1860 int level;
1861
1862 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
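	/*
	 * xfs_bmbt_delrec returns i == 2 when it joined two blocks at
	 * this level, which leaves a stale pointer to be deleted from
	 * the next level up; keep iterating until no join happens.
	 */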
1863 for (level = 0, i = 2; i == 2; level++) {
1864 if ((error = xfs_bmbt_delrec(cur, level, &i))) {
1865 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1866 return error;
1867 }
1868 }
1869 if (i == 0) {
1870 for (level = 1; level < cur->bc_nlevels; level++) {
1871 if (cur->bc_ptrs[level] == 0) {
1872 if ((error = xfs_bmbt_decrement(cur, level,
1873 &i))) {
1874 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1875 return error;
1876 }
1877 break;
1878 }
1879 }
1880 }
1881 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1882 *stat = i;
1883 return 0;
1884}
1885
1886/*
1887 * Convert a compressed bmap extent record to an uncompressed form.
1888 * This code must be in sync with the routines xfs_bmbt_get_startoff,
1889 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
1890 */
1891
1892STATIC __inline__ void
1893__xfs_bmbt_get_all(
1894 __uint64_t l0,
1895 __uint64_t l1,
1896 xfs_bmbt_irec_t *s)
1897{
1898 int ext_flag;
1899 xfs_exntst_t st;
1900
1901 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
1902 s->br_startoff = ((xfs_fileoff_t)l0 &
1903 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1904#if XFS_BIG_BLKNOS
1905 s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
1906 (((xfs_fsblock_t)l1) >> 21);
1907#else
1908#ifdef DEBUG
1909 {
1910 xfs_dfsbno_t b;
1911
1912 b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
1913 (((xfs_dfsbno_t)l1) >> 21);
1914 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
1915 s->br_startblock = (xfs_fsblock_t)b;
1916 }
1917#else /* !DEBUG */
1918 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
1919#endif /* DEBUG */
1920#endif /* XFS_BIG_BLKNOS */
1921 s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
1922 /* This is xfs_extent_state() in-line */
1923 if (ext_flag) {
1924 ASSERT(s->br_blockcount != 0); /* saved for DMIG */
1925 st = XFS_EXT_UNWRITTEN;
1926 } else
1927 st = XFS_EXT_NORM;
1928 s->br_state = st;
1929}
1930
1931void
1932xfs_bmbt_get_all(
1933 xfs_bmbt_rec_t *r,
1934 xfs_bmbt_irec_t *s)
1935{
1936 __xfs_bmbt_get_all(r->l0, r->l1, s);
1937}
1938
1939/*
1940 * Get the block pointer for the given level of the cursor.
1941 * Fill in the buffer pointer, if applicable.
1942 */
1943xfs_bmbt_block_t *
1944xfs_bmbt_get_block(
1945 xfs_btree_cur_t *cur,
1946 int level,
1947 xfs_buf_t **bpp)
1948{
1949 xfs_ifork_t *ifp;
1950 xfs_bmbt_block_t *rval;
1951
1952 if (level < cur->bc_nlevels - 1) {
1953 *bpp = cur->bc_bufs[level];
1954 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1955 } else {
1956 *bpp = NULL;
1957 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1958 cur->bc_private.b.whichfork);
1959 rval = ifp->if_broot;
1960 }
1961 return rval;
1962}
1963
1964/*
 1965 * Extract the blockcount field from an in-memory bmap extent record.
1966 */
1967xfs_filblks_t
1968xfs_bmbt_get_blockcount(
1969 xfs_bmbt_rec_t *r)
1970{
1971 return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
1972}
1973
1974/*
 1975 * Extract the startblock field from an in-memory bmap extent record.
1976 */
1977xfs_fsblock_t
1978xfs_bmbt_get_startblock(
1979 xfs_bmbt_rec_t *r)
1980{
1981#if XFS_BIG_BLKNOS
1982 return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
1983 (((xfs_fsblock_t)r->l1) >> 21);
1984#else
1985#ifdef DEBUG
1986 xfs_dfsbno_t b;
1987
1988 b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
1989 (((xfs_dfsbno_t)r->l1) >> 21);
1990 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
1991 return (xfs_fsblock_t)b;
1992#else /* !DEBUG */
1993 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
1994#endif /* DEBUG */
1995#endif /* XFS_BIG_BLKNOS */
1996}
1997
1998/*
 1999 * Extract the startoff field from an in-memory bmap extent record.
2000 */
2001xfs_fileoff_t
2002xfs_bmbt_get_startoff(
2003 xfs_bmbt_rec_t *r)
2004{
2005 return ((xfs_fileoff_t)r->l0 &
2006 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
2007}
2008
2009xfs_exntst_t
2010xfs_bmbt_get_state(
2011 xfs_bmbt_rec_t *r)
2012{
2013 int ext_flag;
2014
2015 ext_flag = (int)((r->l0) >> (64 - BMBT_EXNTFLAG_BITLEN));
2016 return xfs_extent_state(xfs_bmbt_get_blockcount(r),
2017 ext_flag);
2018}
2019
2020#if __BYTE_ORDER != __BIG_ENDIAN
2021/* Endian flipping versions of the bmbt extraction functions */
2022void
2023xfs_bmbt_disk_get_all(
2024 xfs_bmbt_rec_t *r,
2025 xfs_bmbt_irec_t *s)
2026{
2027 __uint64_t l0, l1;
2028
2029 l0 = INT_GET(r->l0, ARCH_CONVERT);
2030 l1 = INT_GET(r->l1, ARCH_CONVERT);
2031
2032 __xfs_bmbt_get_all(l0, l1, s);
2033}
2034
2035/*
 2036 * Extract the blockcount field from an on-disk bmap extent record.
2037 */
2038xfs_filblks_t
2039xfs_bmbt_disk_get_blockcount(
2040 xfs_bmbt_rec_t *r)
2041{
2042 return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21));
2043}
2044
2045/*
 2046 * Extract the startblock field from an on-disk bmap extent record.
2047 */
2048xfs_fsblock_t
2049xfs_bmbt_disk_get_startblock(
2050 xfs_bmbt_rec_t *r)
2051{
2052#if XFS_BIG_BLKNOS
2053 return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
2054 (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2055#else
2056#ifdef DEBUG
2057 xfs_dfsbno_t b;
2058
2059 b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
2060 (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2061 ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
2062 return (xfs_fsblock_t)b;
2063#else /* !DEBUG */
2064 return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
2065#endif /* DEBUG */
2066#endif /* XFS_BIG_BLKNOS */
2067}
2068
2069/*
2070 * Extract the startoff field from a disk format bmap extent record.
2071 */
2072xfs_fileoff_t
2073xfs_bmbt_disk_get_startoff(
2074 xfs_bmbt_rec_t *r)
2075{
2076 return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) &
2077 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
2078}
2079
2080xfs_exntst_t
2081xfs_bmbt_disk_get_state(
2082 xfs_bmbt_rec_t *r)
2083{
2084 int ext_flag;
2085
2086 ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN));
2087 return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r),
2088 ext_flag);
2089}
2090#endif
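/*
 * On big-endian hosts the on-disk and in-core record layouts are
 * identical, so xfs_bmap_btree.h simply #defines the xfs_bmbt_disk_get_*
 * names to the in-core accessors above and this section is not built.
 */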
2091
2092
2093/*
2094 * Increment cursor by one record at the level.
2095 * For nonzero levels the leaf-ward information is untouched.
2096 */
2097int /* error */
2098xfs_bmbt_increment(
2099 xfs_btree_cur_t *cur,
2100 int level,
2101 int *stat) /* success/failure */
2102{
2103 xfs_bmbt_block_t *block;
2104 xfs_buf_t *bp;
2105 int error; /* error return value */
2106#ifdef XFS_BMBT_TRACE
2107 static char fname[] = "xfs_bmbt_increment";
2108#endif
2109 xfs_fsblock_t fsbno;
2110 int lev;
2111 xfs_mount_t *mp;
2112 xfs_trans_t *tp;
2113
2114 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2115 XFS_BMBT_TRACE_ARGI(cur, level);
2116 ASSERT(level < cur->bc_nlevels);
2117 if (level < cur->bc_nlevels - 1)
2118 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
2119 block = xfs_bmbt_get_block(cur, level, &bp);
2120#ifdef DEBUG
2121 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
2122 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2123 return error;
2124 }
2125#endif
2126 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
2127 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2128 *stat = 1;
2129 return 0;
2130 }
2131 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
2132 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2133 *stat = 0;
2134 return 0;
2135 }
2136 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
2137 block = xfs_bmbt_get_block(cur, lev, &bp);
2138#ifdef DEBUG
2139 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2140 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2141 return error;
2142 }
2143#endif
2144 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
2145 break;
2146 if (lev < cur->bc_nlevels - 1)
2147 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2148 }
2149 if (lev == cur->bc_nlevels) {
2150 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2151 *stat = 0;
2152 return 0;
2153 }
2154 tp = cur->bc_tp;
2155 mp = cur->bc_mp;
2156 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2157 fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
2158 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2159 XFS_BMAP_BTREE_REF))) {
2160 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2161 return error;
2162 }
2163 lev--;
2164 xfs_btree_setbuf(cur, lev, bp);
2165 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2166 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2167 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2168 return error;
2169 }
2170 cur->bc_ptrs[lev] = 1;
2171 }
2172 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2173 *stat = 1;
2174 return 0;
2175}
2176
2177/*
2178 * Insert the current record at the point referenced by cur.
2179 */
2180int /* error */
2181xfs_bmbt_insert(
2182 xfs_btree_cur_t *cur,
2183 int *stat) /* success/failure */
2184{
2185 int error; /* error return value */
2186#ifdef XFS_BMBT_TRACE
2187 static char fname[] = "xfs_bmbt_insert";
2188#endif
2189 int i;
2190 int level;
2191 xfs_fsblock_t nbno;
2192 xfs_btree_cur_t *ncur;
2193 xfs_bmbt_rec_t nrec;
2194 xfs_btree_cur_t *pcur;
2195
2196 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2197 level = 0;
2198 nbno = NULLFSBLOCK;
2199 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2200 ncur = (xfs_btree_cur_t *)0;
2201 pcur = cur;
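	/*
	 * Work up the tree: each xfs_bmbt_insrec call may split its
	 * level, handing back the new sibling's block number in nbno,
	 * the record to insert one level up in nrec, and possibly a new
	 * cursor (ncur) to continue with; the loop ends once a level
	 * absorbs the insert without splitting (nbno == NULLFSBLOCK).
	 */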
2202 do {
2203 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
2204 &i))) {
2205 if (pcur != cur)
2206 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2207 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2208 return error;
2209 }
2210 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2211 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2212 cur->bc_nlevels = pcur->bc_nlevels;
2213 cur->bc_private.b.allocated +=
2214 pcur->bc_private.b.allocated;
2215 pcur->bc_private.b.allocated = 0;
2216 ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
2217 (cur->bc_private.b.ip->i_d.di_flags &
2218 XFS_DIFLAG_REALTIME));
2219 cur->bc_private.b.firstblock =
2220 pcur->bc_private.b.firstblock;
2221 ASSERT(cur->bc_private.b.flist ==
2222 pcur->bc_private.b.flist);
2223 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2224 }
2225 if (ncur) {
2226 pcur = ncur;
2227 ncur = (xfs_btree_cur_t *)0;
2228 }
2229 } while (nbno != NULLFSBLOCK);
2230 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2231 *stat = i;
2232 return 0;
2233error0:
2234 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2235 return error;
2236}
2237
2238/*
2239 * Log fields from the btree block header.
2240 */
2241void
2242xfs_bmbt_log_block(
2243 xfs_btree_cur_t *cur,
2244 xfs_buf_t *bp,
2245 int fields)
2246{
2247 int first;
2248#ifdef XFS_BMBT_TRACE
2249 static char fname[] = "xfs_bmbt_log_block";
2250#endif
2251 int last;
2252 xfs_trans_t *tp;
2253 static const short offsets[] = {
2254 offsetof(xfs_bmbt_block_t, bb_magic),
2255 offsetof(xfs_bmbt_block_t, bb_level),
2256 offsetof(xfs_bmbt_block_t, bb_numrecs),
2257 offsetof(xfs_bmbt_block_t, bb_leftsib),
2258 offsetof(xfs_bmbt_block_t, bb_rightsib),
2259 sizeof(xfs_bmbt_block_t)
2260 };
2261
2262 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2263 XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
2264 tp = cur->bc_tp;
2265 if (bp) {
2266 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
2267 &last);
2268 xfs_trans_log_buf(tp, bp, first, last);
2269 } else
2270 xfs_trans_log_inode(tp, cur->bc_private.b.ip,
2271 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
2272 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2273}
2274
2275/*
2276 * Log record values from the btree block.
2277 */
2278void
2279xfs_bmbt_log_recs(
2280 xfs_btree_cur_t *cur,
2281 xfs_buf_t *bp,
2282 int rfirst,
2283 int rlast)
2284{
2285 xfs_bmbt_block_t *block;
2286 int first;
2287#ifdef XFS_BMBT_TRACE
2288 static char fname[] = "xfs_bmbt_log_recs";
2289#endif
2290 int last;
2291 xfs_bmbt_rec_t *rp;
2292 xfs_trans_t *tp;
2293
2294 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2295 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2296 ASSERT(bp);
2297 tp = cur->bc_tp;
2298 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2299 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2300 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2301 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2302 xfs_trans_log_buf(tp, bp, first, last);
2303 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2304}
2305
2306int /* error */
2307xfs_bmbt_lookup_eq(
2308 xfs_btree_cur_t *cur,
2309 xfs_fileoff_t off,
2310 xfs_fsblock_t bno,
2311 xfs_filblks_t len,
2312 int *stat) /* success/failure */
2313{
2314 cur->bc_rec.b.br_startoff = off;
2315 cur->bc_rec.b.br_startblock = bno;
2316 cur->bc_rec.b.br_blockcount = len;
2317 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2318}
2319
2320int /* error */
2321xfs_bmbt_lookup_ge(
2322 xfs_btree_cur_t *cur,
2323 xfs_fileoff_t off,
2324 xfs_fsblock_t bno,
2325 xfs_filblks_t len,
2326 int *stat) /* success/failure */
2327{
2328 cur->bc_rec.b.br_startoff = off;
2329 cur->bc_rec.b.br_startblock = bno;
2330 cur->bc_rec.b.br_blockcount = len;
2331 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2332}
2333
2334int /* error */
2335xfs_bmbt_lookup_le(
2336 xfs_btree_cur_t *cur,
2337 xfs_fileoff_t off,
2338 xfs_fsblock_t bno,
2339 xfs_filblks_t len,
2340 int *stat) /* success/failure */
2341{
2342 cur->bc_rec.b.br_startoff = off;
2343 cur->bc_rec.b.br_startblock = bno;
2344 cur->bc_rec.b.br_blockcount = len;
2345 return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat);
2346}
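
A minimal caller sketch (hypothetical, not part of this patch; the helper
name and error handling are illustrative) of the usual pattern built on
these wrappers: position the cursor on an exact record, then rewrite it.

	STATIC int				/* error */
	xfs_example_rewrite_extent(
		xfs_btree_cur_t	*cur,		/* btree cursor */
		xfs_bmbt_irec_t	*orec,		/* record to find */
		xfs_bmbt_irec_t	*nrec)		/* values to store */
	{
		int		error;		/* error return value */
		int		i;		/* 1 == exact match found */

		if ((error = xfs_bmbt_lookup_eq(cur, orec->br_startoff,
				orec->br_startblock, orec->br_blockcount, &i)))
			return error;
		if (i != 1)
			return XFS_ERROR(EFSCORRUPTED);
		return xfs_bmbt_update(cur, nrec->br_startoff,
				nrec->br_startblock, nrec->br_blockcount,
				nrec->br_state);
	}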
2347
2348/*
2349 * Give the bmap btree a new root block. Copy the old broot contents
2350 * down into a real block and make the broot point to it.
2351 */
2352int /* error */
2353xfs_bmbt_newroot(
2354 xfs_btree_cur_t *cur, /* btree cursor */
2355 int *logflags, /* logging flags for inode */
2356 int *stat) /* return status - 0 fail */
2357{
2358 xfs_alloc_arg_t args; /* allocation arguments */
2359 xfs_bmbt_block_t *block; /* bmap btree block */
2360 xfs_buf_t *bp; /* buffer for block */
2361 xfs_bmbt_block_t *cblock; /* child btree block */
2362 xfs_bmbt_key_t *ckp; /* child key pointer */
2363 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2364 int error; /* error return code */
2365#ifdef XFS_BMBT_TRACE
2366 static char fname[] = "xfs_bmbt_newroot";
2367#endif
2368#ifdef DEBUG
2369 int i; /* loop counter */
2370#endif
2371 xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
2372 int level; /* btree level */
2373 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
2374
2375 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2376 level = cur->bc_nlevels - 1;
2377 block = xfs_bmbt_get_block(cur, level, &bp);
2378 /*
2379 * Copy the root into a real block.
2380 */
2381 args.mp = cur->bc_mp;
2382 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
2383 args.tp = cur->bc_tp;
2384 args.fsbno = cur->bc_private.b.firstblock;
2385 args.mod = args.minleft = args.alignment = args.total = args.isfl =
2386 args.userdata = args.minalignslop = 0;
2387 args.minlen = args.maxlen = args.prod = 1;
2388 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
2389 if (args.fsbno == NULLFSBLOCK) {
2390#ifdef DEBUG
2391 if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
2392 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2393 return error;
2394 }
2395#endif
2396 args.fsbno = INT_GET(*pp, ARCH_CONVERT);
2397 args.type = XFS_ALLOCTYPE_START_BNO;
2398 } else if (args.wasdel)
2399 args.type = XFS_ALLOCTYPE_FIRST_AG;
2400 else
2401 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2402 if ((error = xfs_alloc_vextent(&args))) {
2403 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2404 return error;
2405 }
2406 if (args.fsbno == NULLFSBLOCK) {
2407 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2408 *stat = 0;
2409 return 0;
2410 }
2411 ASSERT(args.len == 1);
2412 cur->bc_private.b.firstblock = args.fsbno;
2413 cur->bc_private.b.allocated++;
2414 cur->bc_private.b.ip->i_d.di_nblocks++;
2415 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
2416 XFS_TRANS_DQ_BCOUNT, 1L);
2417 bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
2418 cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
2419 *cblock = *block;
2420 INT_MOD(block->bb_level, ARCH_CONVERT, +1);
2421 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
2422 cur->bc_nlevels++;
2423 cur->bc_ptrs[level + 1] = 1;
2424 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
2425 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
2426 memcpy(ckp, kp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
2427 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2428#ifdef DEBUG
2429 for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
2430 if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
2431 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2432 return error;
2433 }
2434 }
2435#endif
2436 memcpy(cpp, pp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
2437#ifdef DEBUG
2438 if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
2439 level))) {
2440 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2441 return error;
2442 }
2443#endif
2444 INT_SET(*pp, ARCH_CONVERT, args.fsbno);
2445 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT),
2446 cur->bc_private.b.whichfork);
2447 xfs_btree_setbuf(cur, level, bp);
2448 /*
2449 * Do all this logging at the end so that
2450 * the root is at the right level.
2451 */
2452 xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
2453 xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
2454 xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
2455 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2458 *stat = 1;
2459 return 0;
2460}
2461
2462/*
2463 * Set all the fields in a bmap extent record from the uncompressed form.
2464 */
2465void
2466xfs_bmbt_set_all(
2467 xfs_bmbt_rec_t *r,
2468 xfs_bmbt_irec_t *s)
2469{
2470 int extent_flag;
2471
2472 ASSERT((s->br_state == XFS_EXT_NORM) ||
2473 (s->br_state == XFS_EXT_UNWRITTEN));
2474 extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
2475 ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
2476 ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
2477#if XFS_BIG_BLKNOS
2478 ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
2479 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2480 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2481 ((xfs_bmbt_rec_base_t)s->br_startblock >> 43);
2482 r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2483 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2484 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2485#else /* !XFS_BIG_BLKNOS */
2486 if (ISNULLSTARTBLOCK(s->br_startblock)) {
2487 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2488 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2489 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2490 r->l1 = XFS_MASK64HI(11) |
2491 ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2492 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2493 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2494 } else {
2495 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2496 ((xfs_bmbt_rec_base_t)s->br_startoff << 9);
2497 r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2498 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2499 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2500 }
2501#endif /* XFS_BIG_BLKNOS */
2502}
2503
2504/*
2505 * Set all the fields in a bmap extent record from the arguments.
2506 */
2507void
2508xfs_bmbt_set_allf(
2509 xfs_bmbt_rec_t *r,
2510 xfs_fileoff_t o,
2511 xfs_fsblock_t b,
2512 xfs_filblks_t c,
2513 xfs_exntst_t v)
2514{
2515 int extent_flag;
2516
2517 ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
2518 extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
2519 ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
2520 ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
2521#if XFS_BIG_BLKNOS
2522 ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
2523 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2524 ((xfs_bmbt_rec_base_t)o << 9) |
2525 ((xfs_bmbt_rec_base_t)b >> 43);
2526 r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
2527 ((xfs_bmbt_rec_base_t)c &
2528 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2529#else /* !XFS_BIG_BLKNOS */
2530 if (ISNULLSTARTBLOCK(b)) {
2531 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2532 ((xfs_bmbt_rec_base_t)o << 9) |
2533 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2534 r->l1 = XFS_MASK64HI(11) |
2535 ((xfs_bmbt_rec_base_t)b << 21) |
2536 ((xfs_bmbt_rec_base_t)c &
2537 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2538 } else {
2539 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2540 ((xfs_bmbt_rec_base_t)o << 9);
2541 r->l1 = ((xfs_bmbt_rec_base_t)b << 21) |
2542 ((xfs_bmbt_rec_base_t)c &
2543 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2544 }
2545#endif /* XFS_BIG_BLKNOS */
2546}
2547
2548#if __BYTE_ORDER != __BIG_ENDIAN
2549/*
2550 * Set all the fields in a bmap extent record from the uncompressed form.
2551 */
2552void
2553xfs_bmbt_disk_set_all(
2554 xfs_bmbt_rec_t *r,
2555 xfs_bmbt_irec_t *s)
2556{
2557 int extent_flag;
2558
2559 ASSERT((s->br_state == XFS_EXT_NORM) ||
2560 (s->br_state == XFS_EXT_UNWRITTEN));
2561 extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
2562 ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
2563 ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
2564#if XFS_BIG_BLKNOS
2565 ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
2566 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2567 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2568 ((xfs_bmbt_rec_base_t)s->br_startblock >> 43));
2569 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2570 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2571 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2572#else /* !XFS_BIG_BLKNOS */
2573 if (ISNULLSTARTBLOCK(s->br_startblock)) {
2574 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2575 ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
2576 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2577 INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
2578 ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2579 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2580 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2581 } else {
2582 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2583 ((xfs_bmbt_rec_base_t)s->br_startoff << 9));
2584 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
2585 ((xfs_bmbt_rec_base_t)s->br_blockcount &
2586 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2587 }
2588#endif /* XFS_BIG_BLKNOS */
2589}
2590
2591/*
2592 * Set all the fields in a disk format bmap extent record from the arguments.
2593 */
2594void
2595xfs_bmbt_disk_set_allf(
2596 xfs_bmbt_rec_t *r,
2597 xfs_fileoff_t o,
2598 xfs_fsblock_t b,
2599 xfs_filblks_t c,
2600 xfs_exntst_t v)
2601{
2602 int extent_flag;
2603
2604 ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
2605 extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
2606 ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
2607 ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
2608#if XFS_BIG_BLKNOS
2609 ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
2610 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2611 ((xfs_bmbt_rec_base_t)o << 9) |
2612 ((xfs_bmbt_rec_base_t)b >> 43));
2613 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
2614 ((xfs_bmbt_rec_base_t)c &
2615 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2616#else /* !XFS_BIG_BLKNOS */
2617 if (ISNULLSTARTBLOCK(b)) {
2618 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2619 ((xfs_bmbt_rec_base_t)o << 9) |
2620 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2621 INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
2622 ((xfs_bmbt_rec_base_t)b << 21) |
2623 ((xfs_bmbt_rec_base_t)c &
2624 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2625 } else {
2626 INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
2627 ((xfs_bmbt_rec_base_t)o << 9));
2628 INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
2629 ((xfs_bmbt_rec_base_t)c &
2630 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
2631 }
2632#endif /* XFS_BIG_BLKNOS */
2633}
2634#endif
2635
2636/*
2637 * Set the blockcount field in a bmap extent record.
2638 */
2639void
2640xfs_bmbt_set_blockcount(
2641 xfs_bmbt_rec_t *r,
2642 xfs_filblks_t v)
2643{
2644 ASSERT((v & XFS_MASK64HI(43)) == 0);
2645 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
2646 (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
2647}
2648
2649/*
2650 * Set the startblock field in a bmap extent record.
2651 */
2652void
2653xfs_bmbt_set_startblock(
2654 xfs_bmbt_rec_t *r,
2655 xfs_fsblock_t v)
2656{
2657#if XFS_BIG_BLKNOS
2658 ASSERT((v & XFS_MASK64HI(12)) == 0);
2659 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
2660 (xfs_bmbt_rec_base_t)(v >> 43);
2661 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
2662 (xfs_bmbt_rec_base_t)(v << 21);
2663#else /* !XFS_BIG_BLKNOS */
2664 if (ISNULLSTARTBLOCK(v)) {
2665 r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2666 r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
2667 ((xfs_bmbt_rec_base_t)v << 21) |
2668 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2669 } else {
2670 r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
2671 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
2672 (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
2673 }
2674#endif /* XFS_BIG_BLKNOS */
2675}
2676
2677/*
2678 * Set the startoff field in a bmap extent record.
2679 */
2680void
2681xfs_bmbt_set_startoff(
2682 xfs_bmbt_rec_t *r,
2683 xfs_fileoff_t v)
2684{
2685 ASSERT((v & XFS_MASK64HI(9)) == 0);
2686 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
2687 ((xfs_bmbt_rec_base_t)v << 9) |
2688 (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
2689}
2690
2691/*
2692 * Set the extent state field in a bmap extent record.
2693 */
2694void
2695xfs_bmbt_set_state(
2696 xfs_bmbt_rec_t *r,
2697 xfs_exntst_t v)
2698{
2699 ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
2700 if (v == XFS_EXT_NORM)
2701 r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
2702 else
2703 r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
2704}
2705
2706/*
2707 * Convert in-memory form of btree root to on-disk form.
2708 */
2709void
2710xfs_bmbt_to_bmdr(
2711 xfs_bmbt_block_t *rblock,
2712 int rblocklen,
2713 xfs_bmdr_block_t *dblock,
2714 int dblocklen)
2715{
2716 int dmxr;
2717 xfs_bmbt_key_t *fkp;
2718 xfs_bmbt_ptr_t *fpp;
2719 xfs_bmbt_key_t *tkp;
2720 xfs_bmbt_ptr_t *tpp;
2721
2722 ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC);
2723 ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
2724 ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
2725 ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
2726 dblock->bb_level = rblock->bb_level; /* both in on-disk format */
2727 dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */
2728 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
2729 fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
2730 tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
2731 fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
2732 tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
2733 dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
2734 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2735 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
2736}
2737
2738/*
2739 * Update the record to the passed values.
2740 */
2741int
2742xfs_bmbt_update(
2743 xfs_btree_cur_t *cur,
2744 xfs_fileoff_t off,
2745 xfs_fsblock_t bno,
2746 xfs_filblks_t len,
2747 xfs_exntst_t state)
2748{
2749 xfs_bmbt_block_t *block;
2750 xfs_buf_t *bp;
2751 int error;
2752#ifdef XFS_BMBT_TRACE
2753 static char fname[] = "xfs_bmbt_update";
2754#endif
2755 xfs_bmbt_key_t key;
2756 int ptr;
2757 xfs_bmbt_rec_t *rp;
2758
2759 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2760 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2761 (xfs_dfilblks_t)len, (int)state);
2762 block = xfs_bmbt_get_block(cur, 0, &bp);
2763#ifdef DEBUG
2764 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2765 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2766 return error;
2767 }
2768#endif
2769 ptr = cur->bc_ptrs[0];
2770 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2771 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2772 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2773 if (ptr > 1) {
2774 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2775 return 0;
2776 }
2777 INT_SET(key.br_startoff, ARCH_CONVERT, off);
2778 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2779 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2780 return error;
2781 }
2782 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2783 return 0;
2784}
2785
2786/*
2787 * Check an extent list, which has just been read, for
2788 * any bit in the extent flag field. ASSERT on debug
2789 * kernels, as this condition should not occur.
 2790 * Return an error condition (1) if any flags are found,
2791 * otherwise return 0.
2792 */
2793
2794int
2795xfs_check_nostate_extents(
2796 xfs_bmbt_rec_t *ep,
2797 xfs_extnum_t num)
2798{
2799 for (; num > 0; num--, ep++) {
2800 if ((ep->l0 >>
2801 (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
2802 ASSERT(0);
2803 return 1;
2804 }
2805 }
2806 return 0;
2807}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
new file mode 100644
index 000000000000..843ff12b4bf2
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -0,0 +1,701 @@
1/*
2 * Copyright (c) 2000,2002-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BMAP_BTREE_H__
33#define __XFS_BMAP_BTREE_H__
34
35#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
36
37struct xfs_btree_cur;
38struct xfs_btree_lblock;
39struct xfs_mount;
40struct xfs_inode;
41
42/*
43 * Bmap root header, on-disk form only.
44 */
45typedef struct xfs_bmdr_block
46{
47 __uint16_t bb_level; /* 0 is a leaf */
48 __uint16_t bb_numrecs; /* current # of data records */
49} xfs_bmdr_block_t;
50
51/*
52 * Bmap btree record and extent descriptor.
53 * For 32-bit kernels,
54 * l0:31 is an extent flag (value 1 indicates non-normal).
55 * l0:0-30 and l1:9-31 are startoff.
56 * l1:0-8, l2:0-31, and l3:21-31 are startblock.
57 * l3:0-20 are blockcount.
58 * For 64-bit kernels,
59 * l0:63 is an extent flag (value 1 indicates non-normal).
60 * l0:9-62 are startoff.
61 * l0:0-8 and l1:21-63 are startblock.
62 * l1:0-20 are blockcount.
63 */
64
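
To make the 64-bit layout above concrete, here is a stand-alone sketch of
the packing arithmetic (illustrative only: it uses plain stdint types and
a local mask helper rather than the kernel's xfs_* typedefs and
XFS_MASK64LO):

	#include <stdint.h>

	struct ext {				/* mirrors xfs_bmbt_irec_t */
		uint64_t	startoff;	/* 54 significant bits */
		uint64_t	startblock;	/* 52 significant bits */
		uint64_t	blockcount;	/* 21 significant bits */
		int		unwritten;	/* extent flag */
	};

	#define LOMASK(n)	(((uint64_t)1 << (n)) - 1)

	static void pack(const struct ext *e, uint64_t *l0, uint64_t *l1)
	{
		*l0 = ((uint64_t)e->unwritten << 63) |		/* l0:63 */
		      ((e->startoff & LOMASK(54)) << 9) |	/* l0:9-62 */
		      (e->startblock >> 43);			/* l0:0-8 */
		*l1 = ((e->startblock & LOMASK(43)) << 21) |	/* l1:21-63 */
		      (e->blockcount & LOMASK(21));		/* l1:0-20 */
	}

	static void unpack(uint64_t l0, uint64_t l1, struct ext *e)
	{
		e->unwritten  = (int)(l0 >> 63);
		e->startoff   = (l0 & LOMASK(63)) >> 9;
		e->startblock = ((l0 & LOMASK(9)) << 43) | (l1 >> 21);
		e->blockcount = l1 & LOMASK(21);
	}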
65#if __BYTE_ORDER == __LITTLE_ENDIAN
66
67#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
68#define BMBT_EXNTFLAG_BITOFF 0
69#define BMBT_EXNTFLAG_BITLEN 1
70#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN)
71#define BMBT_STARTOFF_BITLEN 54
72#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN)
73#define BMBT_STARTBLOCK_BITLEN 52
74#define BMBT_BLOCKCOUNT_BITOFF \
75 (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN)
76#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF)
77
78#else
79
80#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */
81#define BMBT_EXNTFLAG_BITOFF 63
82#define BMBT_EXNTFLAG_BITLEN 1
83#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN)
84#define BMBT_STARTOFF_BITLEN 54
85#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */
86#define BMBT_STARTBLOCK_BITLEN 52
87#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */
88#define BMBT_BLOCKCOUNT_BITLEN 21
89
90#endif
91
92
93#define BMBT_USE_64 1
94
95typedef struct xfs_bmbt_rec_32
96{
97 __uint32_t l0, l1, l2, l3;
98} xfs_bmbt_rec_32_t;
99typedef struct xfs_bmbt_rec_64
100{
101 __uint64_t l0, l1;
102} xfs_bmbt_rec_64_t;
103
104typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
105typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
106
107/*
108 * Values and macros for delayed-allocation startblock fields.
109 */
110#define STARTBLOCKVALBITS 17
111#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
112#define DSTARTBLOCKMASKBITS (15 + 20)
113#define STARTBLOCKMASK \
114 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
115#define DSTARTBLOCKMASK \
116 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
117#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK)
118int isnullstartblock(xfs_fsblock_t x);
119#define ISNULLSTARTBLOCK(x) isnullstartblock(x)
120#else
121#define ISNULLSTARTBLOCK(x) (((x) & STARTBLOCKMASK) == STARTBLOCKMASK)
122#endif
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK)
124int isnulldstartblock(xfs_dfsbno_t x);
125#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x)
126#else
127#define ISNULLDSTARTBLOCK(x) (((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK)
128#endif
129#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK)
130xfs_fsblock_t nullstartblock(int k);
131#define NULLSTARTBLOCK(k) nullstartblock(k)
132#else
133#define NULLSTARTBLOCK(k) \
134 ((ASSERT(k < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k)))
135#endif
136#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL)
137xfs_filblks_t startblockval(xfs_fsblock_t x);
138#define STARTBLOCKVAL(x) startblockval(x)
139#else
140#define STARTBLOCKVAL(x) ((xfs_filblks_t)((x) & ~STARTBLOCKMASK))
141#endif
142
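In words: a delayed-allocation extent has no real disk block yet, so the
startblock field is overloaded. Setting every mask bit marks the encoding,
and the low STARTBLOCKVALBITS bits carry a small per-extent value that
STARTBLOCKVAL() recovers. A stand-alone sketch of the same arithmetic,
assuming the XFS_BIG_BLKNOS case (35 mask bits):

	#include <assert.h>
	#include <stdint.h>

	#define VALBITS		17	/* STARTBLOCKVALBITS */
	#define MASKBITS	35	/* 15 + 20; 35 + 17 == 52-bit field */
	#define SBMASK		((((uint64_t)1 << MASKBITS) - 1) << VALBITS)

	static uint64_t nullstartblock(int k)	/* NULLSTARTBLOCK(k) */
	{
		assert(k < (1 << VALBITS));
		return SBMASK | (uint64_t)k;
	}

	static int isnullstartblock(uint64_t x)	/* ISNULLSTARTBLOCK(x) */
	{
		return (x & SBMASK) == SBMASK;
	}

	static uint64_t startblockval(uint64_t x) /* STARTBLOCKVAL(x) */
	{
		return x & ~SBMASK;
	}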
143/*
144 * Possible extent formats.
145 */
146typedef enum {
147 XFS_EXTFMT_NOSTATE = 0,
148 XFS_EXTFMT_HASSTATE
149} xfs_exntfmt_t;
150
151/*
152 * Possible extent states.
153 */
154typedef enum {
155 XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
156 XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
157} xfs_exntst_t;
158
159/*
160 * Extent state and extent format macros.
161 */
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE )
163xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip);
164#define XFS_EXTFMT_INODE(x) xfs_extfmt_inode(x)
165#else
166#define XFS_EXTFMT_INODE(x) \
167 (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
168 XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
169#endif
170#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
171
172/*
173 * Incore version of above.
174 */
175typedef struct xfs_bmbt_irec
176{
177 xfs_fileoff_t br_startoff; /* starting file offset */
178 xfs_fsblock_t br_startblock; /* starting block number */
179 xfs_filblks_t br_blockcount; /* number of blocks */
180 xfs_exntst_t br_state; /* extent state */
181} xfs_bmbt_irec_t;
182
183/*
184 * Key structure for non-leaf levels of the tree.
185 */
186typedef struct xfs_bmbt_key
187{
188 xfs_dfiloff_t br_startoff; /* starting file offset */
189} xfs_bmbt_key_t, xfs_bmdr_key_t;
190
191typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */
192 /* btree block header type */
193typedef struct xfs_btree_lblock xfs_bmbt_block_t;
194
195#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
196xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp);
197#define XFS_BUF_TO_BMBT_BLOCK(bp) xfs_buf_to_bmbt_block(bp)
198#else
199#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp)))
200#endif
201
202#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
203int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur);
204#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) xfs_bmap_rblock_dsize(lev,cur)
205#else
206#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
207#endif
208#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
209int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur);
210#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) xfs_bmap_rblock_isize(lev,cur)
211#else
212#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
213 ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
214 (cur)->bc_private.b.whichfork)->if_broot_bytes)
215#endif
216#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE)
217int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur);
218#define XFS_BMAP_IBLOCK_SIZE(lev,cur) xfs_bmap_iblock_size(lev,cur)
219#else
220#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
221#endif
222
223#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE)
224int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur);
225#define XFS_BMAP_BLOCK_DSIZE(lev,cur) xfs_bmap_block_dsize(lev,cur)
226#else
227#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
228 ((lev) == (cur)->bc_nlevels - 1 ? \
229 XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \
230 XFS_BMAP_IBLOCK_SIZE(lev,cur))
231#endif
232#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE)
233int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur);
234#define XFS_BMAP_BLOCK_ISIZE(lev,cur) xfs_bmap_block_isize(lev,cur)
235#else
236#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
237 ((lev) == (cur)->bc_nlevels - 1 ? \
238 XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \
239 XFS_BMAP_IBLOCK_SIZE(lev,cur))
240#endif
241
242#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
243int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur);
244#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) xfs_bmap_block_dmaxrecs(lev,cur)
245#else
246#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
247 ((lev) == (cur)->bc_nlevels - 1 ? \
248 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
249 xfs_bmdr, (lev) == 0) : \
250 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
251#endif
252#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
253int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur);
254#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) xfs_bmap_block_imaxrecs(lev,cur)
255#else
256#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
257 ((lev) == (cur)->bc_nlevels - 1 ? \
258 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
259 xfs_bmbt, (lev) == 0) : \
260 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
261#endif
262
263#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
264int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur);
265#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) xfs_bmap_block_dminrecs(lev,cur)
266#else
267#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
268 ((lev) == (cur)->bc_nlevels - 1 ? \
269 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
270 xfs_bmdr, (lev) == 0) : \
271 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
274int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur);
275#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) xfs_bmap_block_iminrecs(lev,cur)
276#else
277#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
278 ((lev) == (cur)->bc_nlevels - 1 ? \
279 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
280 xfs_bmbt, (lev) == 0) : \
281 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
282#endif
283
284#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR)
285xfs_bmbt_rec_t *
286xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
287#define XFS_BMAP_REC_DADDR(bb,i,cur) xfs_bmap_rec_daddr(bb,i,cur)
288#else
289#define XFS_BMAP_REC_DADDR(bb,i,cur) \
290 XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \
291 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
292 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
293 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
294#endif
295#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR)
296xfs_bmbt_rec_t *
297xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
298#define XFS_BMAP_REC_IADDR(bb,i,cur) xfs_bmap_rec_iaddr(bb,i,cur)
299#else
300#define XFS_BMAP_REC_IADDR(bb,i,cur) \
301 XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \
302 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
303 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
304 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
305#endif
306
307#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR)
308xfs_bmbt_key_t *
309xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
310#define XFS_BMAP_KEY_DADDR(bb,i,cur) xfs_bmap_key_daddr(bb,i,cur)
311#else
312#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
313 XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \
314 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
315 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
316 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
317#endif
318#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR)
319xfs_bmbt_key_t *
320xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
321#define XFS_BMAP_KEY_IADDR(bb,i,cur) xfs_bmap_key_iaddr(bb,i,cur)
322#else
323#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
324 XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \
325 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
326 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
327 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
328#endif
329
330#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR)
331xfs_bmbt_ptr_t *
332xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
333#define XFS_BMAP_PTR_DADDR(bb,i,cur) xfs_bmap_ptr_daddr(bb,i,cur)
334#else
335#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
336 XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \
337 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
338 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
339 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
340#endif
341#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR)
342xfs_bmbt_ptr_t *
343xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
344#define XFS_BMAP_PTR_IADDR(bb,i,cur) xfs_bmap_ptr_iaddr(bb,i,cur)
345#else
346#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
347 XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \
348 INT_GET((bb)->bb_level, ARCH_CONVERT), cur), \
349 xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
350 INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
351#endif
352
353/*
354 * These are to be used when we know the size of the block and
355 * we don't have a cursor.
356 */
357#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
358xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz);
359#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) xfs_bmap_broot_rec_addr(bb,i,sz)
360#else
361#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
362 XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
363#endif
364#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
365xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz);
366#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) xfs_bmap_broot_key_addr(bb,i,sz)
367#else
368#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
369 XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
370#endif
371#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
372xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz);
373#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) xfs_bmap_broot_ptr_addr(bb,i,sz)
374#else
375#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
376 XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
377#endif
378
379#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS)
380int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb);
381#define XFS_BMAP_BROOT_NUMRECS(bb) xfs_bmap_broot_numrecs(bb)
382#else
383#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
384#endif
385#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS)
386int xfs_bmap_broot_maxrecs(int sz);
387#define XFS_BMAP_BROOT_MAXRECS(sz) xfs_bmap_broot_maxrecs(sz)
388#else
389#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
390#endif
391#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
392int xfs_bmap_broot_space_calc(int nrecs);
393#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) xfs_bmap_broot_space_calc(nrecs)
394#else
395#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
396 ((int)(sizeof(xfs_bmbt_block_t) + \
397 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
398#endif
399#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE)
400int xfs_bmap_broot_space(xfs_bmdr_block_t *bb);
401#define XFS_BMAP_BROOT_SPACE(bb) xfs_bmap_broot_space(bb)
402#else
403#define XFS_BMAP_BROOT_SPACE(bb) \
404 XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
405#endif
406#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC)
407int xfs_bmdr_space_calc(int nrecs);
408#define XFS_BMDR_SPACE_CALC(nrecs) xfs_bmdr_space_calc(nrecs)
409#else
410#define XFS_BMDR_SPACE_CALC(nrecs) \
411 ((int)(sizeof(xfs_bmdr_block_t) + \
412 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
413#endif
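
As a worked example of the two formulas above, assuming 8-byte keys and
pointers and the 4-byte xfs_bmdr_block_t defined earlier in this header:
XFS_BMDR_SPACE_CALC(10) = 4 + 10 * (8 + 8) = 164 bytes for a 10-record
on-disk root, and XFS_BMAP_BROOT_SPACE_CALC(10) differs only by using the
in-core block header size instead.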
414
415/*
416 * Maximum number of bmap btree levels.
417 */
418#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS)
419int xfs_bm_maxlevels(struct xfs_mount *mp, int w);
420#define XFS_BM_MAXLEVELS(mp,w) xfs_bm_maxlevels(mp,w)
421#else
422#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[w])
423#endif
424
425#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK)
426int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb,
427 int level);
428#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
429 xfs_bmap_sanity_check(mp,bb,level)
430#else
431#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
432 (INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \
433 INT_GET((bb)->bb_level, ARCH_CONVERT) == level && \
434 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \
435 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0])
436#endif
437
438
439#ifdef __KERNEL__
440
441#if defined(XFS_BMBT_TRACE)
442/*
443 * Trace buffer entry types.
444 */
445#define XFS_BMBT_KTRACE_ARGBI 1
446#define XFS_BMBT_KTRACE_ARGBII 2
447#define XFS_BMBT_KTRACE_ARGFFFI 3
448#define XFS_BMBT_KTRACE_ARGI 4
449#define XFS_BMBT_KTRACE_ARGIFK 5
450#define XFS_BMBT_KTRACE_ARGIFR 6
451#define XFS_BMBT_KTRACE_ARGIK 7
452#define XFS_BMBT_KTRACE_CUR 8
453
454#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
455#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
456extern ktrace_t *xfs_bmbt_trace_buf;
457#endif
458
459/*
460 * Prototypes for xfs_bmap.c to call.
461 */
462
463void
464xfs_bmdr_to_bmbt(
465 xfs_bmdr_block_t *,
466 int,
467 xfs_bmbt_block_t *,
468 int);
469
470int
471xfs_bmbt_decrement(
472 struct xfs_btree_cur *,
473 int,
474 int *);
475
476int
477xfs_bmbt_delete(
478 struct xfs_btree_cur *,
479 int *);
480
481void
482xfs_bmbt_get_all(
483 xfs_bmbt_rec_t *r,
484 xfs_bmbt_irec_t *s);
485
486xfs_bmbt_block_t *
487xfs_bmbt_get_block(
488 struct xfs_btree_cur *cur,
489 int level,
490 struct xfs_buf **bpp);
491
492xfs_filblks_t
493xfs_bmbt_get_blockcount(
494 xfs_bmbt_rec_t *r);
495
496xfs_fsblock_t
497xfs_bmbt_get_startblock(
498 xfs_bmbt_rec_t *r);
499
500xfs_fileoff_t
501xfs_bmbt_get_startoff(
502 xfs_bmbt_rec_t *r);
503
504xfs_exntst_t
505xfs_bmbt_get_state(
506 xfs_bmbt_rec_t *r);
507
508#if __BYTE_ORDER != __BIG_ENDIAN
509void
510xfs_bmbt_disk_get_all(
511 xfs_bmbt_rec_t *r,
512 xfs_bmbt_irec_t *s);
513
514xfs_exntst_t
515xfs_bmbt_disk_get_state(
516 xfs_bmbt_rec_t *r);
517
518xfs_filblks_t
519xfs_bmbt_disk_get_blockcount(
520 xfs_bmbt_rec_t *r);
521
522xfs_fsblock_t
523xfs_bmbt_disk_get_startblock(
524 xfs_bmbt_rec_t *r);
525
526xfs_fileoff_t
527xfs_bmbt_disk_get_startoff(
528 xfs_bmbt_rec_t *r);
529
530#else
531#define xfs_bmbt_disk_get_all(r, s) \
532 xfs_bmbt_get_all(r, s)
533#define xfs_bmbt_disk_get_state(r) \
534 xfs_bmbt_get_state(r)
535#define xfs_bmbt_disk_get_blockcount(r) \
536 xfs_bmbt_get_blockcount(r)
537#define xfs_bmbt_disk_get_startblock(r) \
538 xfs_bmbt_get_blockcount(r)
539#define xfs_bmbt_disk_get_startoff(r) \
540 xfs_bmbt_get_startoff(r)
541#endif
542
543int
544xfs_bmbt_increment(
545 struct xfs_btree_cur *,
546 int,
547 int *);
548
549int
550xfs_bmbt_insert(
551 struct xfs_btree_cur *,
552 int *);
553
554void
555xfs_bmbt_log_block(
556 struct xfs_btree_cur *,
557 struct xfs_buf *,
558 int);
559
560void
561xfs_bmbt_log_recs(
562 struct xfs_btree_cur *,
563 struct xfs_buf *,
564 int,
565 int);
566
567int
568xfs_bmbt_lookup_eq(
569 struct xfs_btree_cur *,
570 xfs_fileoff_t,
571 xfs_fsblock_t,
572 xfs_filblks_t,
573 int *);
574
575int
576xfs_bmbt_lookup_ge(
577 struct xfs_btree_cur *,
578 xfs_fileoff_t,
579 xfs_fsblock_t,
580 xfs_filblks_t,
581 int *);
582
583int
584xfs_bmbt_lookup_le(
585 struct xfs_btree_cur *,
586 xfs_fileoff_t,
587 xfs_fsblock_t,
588 xfs_filblks_t,
589 int *);
590
591/*
592 * Give the bmap btree a new root block. Copy the old broot contents
593 * down into a real block and make the broot point to it.
594 */
595int /* error */
596xfs_bmbt_newroot(
597 struct xfs_btree_cur *cur, /* btree cursor */
598 int *logflags, /* logging flags for inode */
599 int *stat); /* return status - 0 fail */
600
601void
602xfs_bmbt_set_all(
603 xfs_bmbt_rec_t *r,
604 xfs_bmbt_irec_t *s);
605
606void
607xfs_bmbt_set_allf(
608 xfs_bmbt_rec_t *r,
609 xfs_fileoff_t o,
610 xfs_fsblock_t b,
611 xfs_filblks_t c,
612 xfs_exntst_t v);
613
614void
615xfs_bmbt_set_blockcount(
616 xfs_bmbt_rec_t *r,
617 xfs_filblks_t v);
618
619void
620xfs_bmbt_set_startblock(
621 xfs_bmbt_rec_t *r,
622 xfs_fsblock_t v);
623
624void
625xfs_bmbt_set_startoff(
626 xfs_bmbt_rec_t *r,
627 xfs_fileoff_t v);
628
629void
630xfs_bmbt_set_state(
631 xfs_bmbt_rec_t *r,
632 xfs_exntst_t v);
633
634#if __BYTE_ORDER != __BIG_ENDIAN
635void
636xfs_bmbt_disk_set_all(
637 xfs_bmbt_rec_t *r,
638 xfs_bmbt_irec_t *s);
639
640void
641xfs_bmbt_disk_set_allf(
642 xfs_bmbt_rec_t *r,
643 xfs_fileoff_t o,
644 xfs_fsblock_t b,
645 xfs_filblks_t c,
646 xfs_exntst_t v);
647#else
648#define xfs_bmbt_disk_set_all(r, s) \
649 xfs_bmbt_set_all(r, s)
650#define xfs_bmbt_disk_set_allf(r, o, b, c, v) \
651 xfs_bmbt_set_allf(r, o, b, c, v)
652#endif
653
654void
655xfs_bmbt_to_bmdr(
656 xfs_bmbt_block_t *,
657 int,
658 xfs_bmdr_block_t *,
659 int);
660
661int
662xfs_bmbt_update(
663 struct xfs_btree_cur *,
664 xfs_fileoff_t,
665 xfs_fsblock_t,
666 xfs_filblks_t,
667 xfs_exntst_t);
668
669#ifdef DEBUG
670/*
671 * Get the data from the pointed-to record.
672 */
673int
674xfs_bmbt_get_rec(
675 struct xfs_btree_cur *,
676 xfs_fileoff_t *,
677 xfs_fsblock_t *,
678 xfs_filblks_t *,
679 xfs_exntst_t *,
680 int *);
681#endif
682
683
684/*
685 * Search an extent list for the extent which includes block
686 * bno.
687 */
688xfs_bmbt_rec_t *
689xfs_bmap_do_search_extents(
690 xfs_bmbt_rec_t *,
691 xfs_extnum_t,
692 xfs_extnum_t,
693 xfs_fileoff_t,
694 int *,
695 xfs_extnum_t *,
696 xfs_bmbt_irec_t *,
697 xfs_bmbt_irec_t *);
698
699#endif /* __KERNEL__ */
700
701#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
new file mode 100644
index 000000000000..9dd22dd95487
--- /dev/null
+++ b/fs/xfs/xfs_btree.c
@@ -0,0 +1,949 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains common code for the space manager's btree implementations.
35 */
36
37#include "xfs.h"
38
39#include "xfs_macros.h"
40#include "xfs_types.h"
41#include "xfs_inum.h"
42#include "xfs_log.h"
43#include "xfs_trans.h"
44#include "xfs_sb.h"
45#include "xfs_ag.h"
46#include "xfs_dir.h"
47#include "xfs_dir2.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_alloc_btree.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_ialloc_btree.h"
53#include "xfs_btree.h"
54#include "xfs_ialloc.h"
55#include "xfs_attr_sf.h"
56#include "xfs_dir_sf.h"
57#include "xfs_dir2_sf.h"
58#include "xfs_dinode.h"
59#include "xfs_inode.h"
60#include "xfs_bit.h"
61#include "xfs_error.h"
62
63/*
64 * Cursor allocation zone.
65 */
66kmem_zone_t *xfs_btree_cur_zone;
67
68/*
69 * Btree magic numbers.
70 */
71const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
72{
73 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
74};
75
76/*
77 * Prototypes for internal routines.
78 */
79
80/*
81 * Checking routine: return maxrecs for the block.
82 */
83STATIC int /* number of records fitting in block */
84xfs_btree_maxrecs(
85 xfs_btree_cur_t *cur, /* btree cursor */
86 xfs_btree_block_t *block);/* generic btree block pointer */
87
88/*
89 * Internal routines.
90 */
91
92/*
93 * Checking routine: return maxrecs for the block.
94 */
95STATIC int /* number of records fitting in block */
96xfs_btree_maxrecs(
97 xfs_btree_cur_t *cur, /* btree cursor */
98 xfs_btree_block_t *block) /* generic btree block pointer */
99{
100 switch (cur->bc_btnum) {
101 case XFS_BTNUM_BNO:
102 case XFS_BTNUM_CNT:
103 return (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
104 case XFS_BTNUM_BMAP:
105 return (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
106 case XFS_BTNUM_INO:
107 return (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
108 default:
109 ASSERT(0);
110 return 0;
111 }
112}
113
114/*
115 * External routines.
116 */
117
118#ifdef DEBUG
119/*
120 * Debug routine: check that block header is ok.
121 */
122void
123xfs_btree_check_block(
124 xfs_btree_cur_t *cur, /* btree cursor */
125 xfs_btree_block_t *block, /* generic btree block pointer */
126 int level, /* level of the btree block */
127 xfs_buf_t *bp) /* buffer containing block, if any */
128{
129 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
130 xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
131 bp);
132 else
133 xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
134 bp);
135}
136
137/*
138 * Debug routine: check that keys are in the right order.
139 */
140void
141xfs_btree_check_key(
142 xfs_btnum_t btnum, /* btree identifier */
143 void *ak1, /* pointer to left (lower) key */
144 void *ak2) /* pointer to right (higher) key */
145{
146 switch (btnum) {
147 case XFS_BTNUM_BNO: {
148 xfs_alloc_key_t *k1;
149 xfs_alloc_key_t *k2;
150
151 k1 = ak1;
152 k2 = ak2;
153 ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT));
154 break;
155 }
156 case XFS_BTNUM_CNT: {
157 xfs_alloc_key_t *k1;
158 xfs_alloc_key_t *k2;
159
160 k1 = ak1;
161 k2 = ak2;
162 ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) ||
163 (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) &&
164 INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)));
165 break;
166 }
167 case XFS_BTNUM_BMAP: {
168 xfs_bmbt_key_t *k1;
169 xfs_bmbt_key_t *k2;
170
171 k1 = ak1;
172 k2 = ak2;
173 ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
174 break;
175 }
176 case XFS_BTNUM_INO: {
177 xfs_inobt_key_t *k1;
178 xfs_inobt_key_t *k2;
179
180 k1 = ak1;
181 k2 = ak2;
182 ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
183 break;
184 }
185 default:
186 ASSERT(0);
187 }
188}
189#endif /* DEBUG */
190
191/*
192 * Checking routine: check that long form block header is ok.
193 */
194/* ARGSUSED */
195int /* error (0 or EFSCORRUPTED) */
196xfs_btree_check_lblock(
197 xfs_btree_cur_t *cur, /* btree cursor */
198 xfs_btree_lblock_t *block, /* btree long form block pointer */
199 int level, /* level of the btree block */
200 xfs_buf_t *bp) /* buffer for block, if any */
201{
202 int lblock_ok; /* block passes checks */
203 xfs_mount_t *mp; /* file system mount point */
204
205 mp = cur->bc_mp;
206 lblock_ok =
207 INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
208 INT_GET(block->bb_level, ARCH_CONVERT) == level &&
209 INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
210 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
211 block->bb_leftsib &&
212 (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO ||
213 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT))) &&
214 block->bb_rightsib &&
215 (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO ||
216 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT)));
217 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
218 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
219 if (bp)
220 xfs_buftrace("LBTREE ERROR", bp);
221 XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
222 mp);
223 return XFS_ERROR(EFSCORRUPTED);
224 }
225 return 0;
226}
227
228/*
229 * Checking routine: check that (long) pointer is ok.
230 */
231int /* error (0 or EFSCORRUPTED) */
232xfs_btree_check_lptr(
233 xfs_btree_cur_t *cur, /* btree cursor */
234 xfs_dfsbno_t ptr, /* btree block disk address */
235 int level) /* btree block level */
236{
237 xfs_mount_t *mp; /* file system mount point */
238
239 mp = cur->bc_mp;
240 XFS_WANT_CORRUPTED_RETURN(
241 level > 0 &&
242 ptr != NULLDFSBNO &&
243 XFS_FSB_SANITY_CHECK(mp, ptr));
244 return 0;
245}
246
247#ifdef DEBUG
248/*
249 * Debug routine: check that records are in the right order.
250 */
251void
252xfs_btree_check_rec(
253 xfs_btnum_t btnum, /* btree identifier */
254 void *ar1, /* pointer to left (lower) record */
255 void *ar2) /* pointer to right (higher) record */
256{
257 switch (btnum) {
258 case XFS_BTNUM_BNO: {
259 xfs_alloc_rec_t *r1;
260 xfs_alloc_rec_t *r2;
261
262 r1 = ar1;
263 r2 = ar2;
264 ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <=
265 INT_GET(r2->ar_startblock, ARCH_CONVERT));
266 break;
267 }
268 case XFS_BTNUM_CNT: {
269 xfs_alloc_rec_t *r1;
270 xfs_alloc_rec_t *r2;
271
272 r1 = ar1;
273 r2 = ar2;
274 ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) ||
275 (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) &&
276 INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT)));
277 break;
278 }
279 case XFS_BTNUM_BMAP: {
280 xfs_bmbt_rec_t *r1;
281 xfs_bmbt_rec_t *r2;
282
283 r1 = ar1;
284 r2 = ar2;
285 ASSERT(xfs_bmbt_disk_get_startoff(r1) +
286 xfs_bmbt_disk_get_blockcount(r1) <=
287 xfs_bmbt_disk_get_startoff(r2));
288 break;
289 }
290 case XFS_BTNUM_INO: {
291 xfs_inobt_rec_t *r1;
292 xfs_inobt_rec_t *r2;
293
294 r1 = ar1;
295 r2 = ar2;
296 ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
297 INT_GET(r2->ir_startino, ARCH_CONVERT));
298 break;
299 }
300 default:
301 ASSERT(0);
302 }
303}
304#endif /* DEBUG */
305
306/*
307 * Checking routine: check that block header is ok.
308 */
309/* ARGSUSED */
310int /* error (0 or EFSCORRUPTED) */
311xfs_btree_check_sblock(
312 xfs_btree_cur_t *cur, /* btree cursor */
313 xfs_btree_sblock_t *block, /* btree short form block pointer */
314 int level, /* level of the btree block */
315 xfs_buf_t *bp) /* buffer containing block */
316{
317 xfs_buf_t *agbp; /* buffer for ag. freespace struct */
318 xfs_agf_t *agf; /* ag. freespace structure */
319 xfs_agblock_t agflen; /* native ag. freespace length */
320 int sblock_ok; /* block passes checks */
321
322 agbp = cur->bc_private.a.agbp;
323 agf = XFS_BUF_TO_AGF(agbp);
324 agflen = INT_GET(agf->agf_length, ARCH_CONVERT);
325 sblock_ok =
326 INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
327 INT_GET(block->bb_level, ARCH_CONVERT) == level &&
328 INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
329 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
330 (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK ||
331 INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) &&
332 block->bb_leftsib &&
333 (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK ||
334 INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) &&
335 block->bb_rightsib;
336 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
337 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
338 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
339 if (bp)
340 xfs_buftrace("SBTREE ERROR", bp);
341 XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
342 cur->bc_mp);
343 return XFS_ERROR(EFSCORRUPTED);
344 }
345 return 0;
346}
347
348/*
349 * Checking routine: check that (short) pointer is ok.
350 */
351int /* error (0 or EFSCORRUPTED) */
352xfs_btree_check_sptr(
353 xfs_btree_cur_t *cur, /* btree cursor */
354 xfs_agblock_t ptr, /* btree block disk address */
355 int level) /* btree block level */
356{
357 xfs_buf_t *agbp; /* buffer for ag. freespace struct */
358 xfs_agf_t *agf; /* ag. freespace structure */
359
360 agbp = cur->bc_private.a.agbp;
361 agf = XFS_BUF_TO_AGF(agbp);
362 XFS_WANT_CORRUPTED_RETURN(
363 level > 0 &&
364 ptr != NULLAGBLOCK && ptr != 0 &&
365 ptr < INT_GET(agf->agf_length, ARCH_CONVERT));
366 return 0;
367}
368
369/*
370 * Delete the btree cursor.
371 */
372void
373xfs_btree_del_cursor(
374 xfs_btree_cur_t *cur, /* btree cursor */
375 int error) /* del because of error */
376{
377 int i; /* btree level */
378
379 /*
380 * Clear the buffer pointers, and release the buffers.
381 * If we're doing this in the face of an error, we
382 * need to make sure to inspect all of the entries
383 * in the bc_bufs array for buffers to be unlocked.
384 * This is because some of the btree code works from
385 * level n down to 0, and if we get an error along
386 * the way we won't have initialized all the entries
387 * down to 0.
388 */
389 for (i = 0; i < cur->bc_nlevels; i++) {
390 if (cur->bc_bufs[i])
391 xfs_btree_setbuf(cur, i, NULL);
392 else if (!error)
393 break;
394 }
395 /*
396 * Can't free a bmap cursor without having dealt with the
397 * allocated indirect blocks' accounting.
398 */
399 ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
400 cur->bc_private.b.allocated == 0);
401 /*
402 * Free the cursor.
403 */
404 kmem_zone_free(xfs_btree_cur_zone, cur);
405}
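/*
 * Illustrative sketch (editor's example, not part of this file): the
 * usual teardown pattern.  Passing XFS_BTREE_ERROR forces every
 * bc_bufs[] slot to be inspected and released, per the comment above;
 * do_btree_op() is a hypothetical stand-in for any cursor operation.
 */
static int
example_teardown(
	xfs_btree_cur_t		*cur)	/* btree cursor */
{
	int			error;

	error = do_btree_op(cur);	/* hypothetical operation */
	xfs_btree_del_cursor(cur,
		error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
	return error;
}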
406
407/*
408 * Duplicate the btree cursor.
409 * Allocate a new one, copy the record, re-get the buffers.
410 */
411int /* error */
412xfs_btree_dup_cursor(
413 xfs_btree_cur_t *cur, /* input cursor */
414 xfs_btree_cur_t **ncur) /* output cursor */
415{
416 xfs_buf_t *bp; /* btree block's buffer pointer */
417 int error; /* error return value */
418 int i; /* level number of btree block */
419 xfs_mount_t *mp; /* mount structure for filesystem */
420 xfs_btree_cur_t *new; /* new cursor value */
421 xfs_trans_t *tp; /* transaction pointer, can be NULL */
422
423 tp = cur->bc_tp;
424 mp = cur->bc_mp;
425 /*
426 * Allocate a new cursor like the old one.
427 */
428 new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
429 cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
430 cur->bc_private.b.whichfork);
431 /*
432 * Copy the record currently in the cursor.
433 */
434 new->bc_rec = cur->bc_rec;
435 /*
436 * For each level current, re-get the buffer and copy the ptr value.
437 */
438 for (i = 0; i < new->bc_nlevels; i++) {
439 new->bc_ptrs[i] = cur->bc_ptrs[i];
440 new->bc_ra[i] = cur->bc_ra[i];
441 if ((bp = cur->bc_bufs[i])) {
442 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
443 XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
444 xfs_btree_del_cursor(new, error);
445 *ncur = NULL;
446 return error;
447 }
448 new->bc_bufs[i] = bp;
449 ASSERT(bp);
450 ASSERT(!XFS_BUF_GETERROR(bp));
451 } else
452 new->bc_bufs[i] = NULL;
453 }
454 /*
455 * For bmap btrees, copy the firstblock, flist, and flags values,
456 * since init cursor doesn't get them.
457 */
458 if (new->bc_btnum == XFS_BTNUM_BMAP) {
459 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
460 new->bc_private.b.flist = cur->bc_private.b.flist;
461 new->bc_private.b.flags = cur->bc_private.b.flags;
462 }
463 *ncur = new;
464 return 0;
465}
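/*
 * Illustrative sketch (editor's example, not part of this file):
 * because xfs_btree_dup_cursor() sets *ncur to NULL on failure,
 * callers only clean up the duplicate on the success path.
 */
static int
example_dup(
	xfs_btree_cur_t		*cur)	/* input cursor */
{
	xfs_btree_cur_t		*ncur;	/* duplicate cursor */
	int			error;

	if ((error = xfs_btree_dup_cursor(cur, &ncur)))
		return error;		/* ncur is NULL, nothing to free */
	/* ... probe with ncur without disturbing cur ... */
	xfs_btree_del_cursor(ncur, XFS_BTREE_NOERROR);
	return 0;
}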
466
467/*
468 * Change the cursor to point to the first record in the current block
469 * at the given level. Other levels are unaffected.
470 */
471int /* success=1, failure=0 */
472xfs_btree_firstrec(
473 xfs_btree_cur_t *cur, /* btree cursor */
474 int level) /* level to change */
475{
476 xfs_btree_block_t *block; /* generic btree block pointer */
477 xfs_buf_t *bp; /* buffer containing block */
478
479 /*
480 * Get the block pointer for this level.
481 */
482 block = xfs_btree_get_block(cur, level, &bp);
483 xfs_btree_check_block(cur, block, level, bp);
484 /*
485 * It's empty, there is no such record.
486 */
487 if (!block->bb_h.bb_numrecs)
488 return 0;
489 /*
490 * Set the ptr value to 1, that's the first record/key.
491 */
492 cur->bc_ptrs[level] = 1;
493 return 1;
494}
495
496/*
497 * Retrieve the block pointer from the cursor at the given level.
498 * This may be a bmap btree root or from a buffer.
499 */
500xfs_btree_block_t * /* generic btree block pointer */
501xfs_btree_get_block(
502 xfs_btree_cur_t *cur, /* btree cursor */
503 int level, /* level in btree */
504 xfs_buf_t **bpp) /* buffer containing the block */
505{
506 xfs_btree_block_t *block; /* return value */
507 xfs_buf_t *bp; /* return buffer */
508 xfs_ifork_t *ifp; /* inode fork pointer */
509 int whichfork; /* data or attr fork */
510
511 if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
512 whichfork = cur->bc_private.b.whichfork;
513 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
514 block = (xfs_btree_block_t *)ifp->if_broot;
515 bp = NULL;
516 } else {
517 bp = cur->bc_bufs[level];
518 block = XFS_BUF_TO_BLOCK(bp);
519 }
520 ASSERT(block != NULL);
521 *bpp = bp;
522 return block;
523}
524
525/*
526 * Get a buffer for the block, return it with no data read.
527 * Long-form addressing.
528 */
529xfs_buf_t * /* buffer for fsbno */
530xfs_btree_get_bufl(
531 xfs_mount_t *mp, /* file system mount point */
532 xfs_trans_t *tp, /* transaction pointer */
533 xfs_fsblock_t fsbno, /* file system block number */
534 uint lock) /* lock flags for get_buf */
535{
536 xfs_buf_t *bp; /* buffer pointer (return value) */
537 xfs_daddr_t d; /* real disk block address */
538
539 ASSERT(fsbno != NULLFSBLOCK);
540 d = XFS_FSB_TO_DADDR(mp, fsbno);
541 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
542 ASSERT(bp);
543 ASSERT(!XFS_BUF_GETERROR(bp));
544 return bp;
545}
546
547/*
548 * Get a buffer for the block, return it with no data read.
549 * Short-form addressing.
550 */
551xfs_buf_t * /* buffer for agno/agbno */
552xfs_btree_get_bufs(
553 xfs_mount_t *mp, /* file system mount point */
554 xfs_trans_t *tp, /* transaction pointer */
555 xfs_agnumber_t agno, /* allocation group number */
556 xfs_agblock_t agbno, /* allocation group block number */
557 uint lock) /* lock flags for get_buf */
558{
559 xfs_buf_t *bp; /* buffer pointer (return value) */
560 xfs_daddr_t d; /* real disk block address */
561
562 ASSERT(agno != NULLAGNUMBER);
563 ASSERT(agbno != NULLAGBLOCK);
564 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
565 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
566 ASSERT(bp);
567 ASSERT(!XFS_BUF_GETERROR(bp));
568 return bp;
569}
570
571/*
572 * Allocate a new btree cursor.
573 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
574 */
575xfs_btree_cur_t * /* new btree cursor */
576xfs_btree_init_cursor(
577 xfs_mount_t *mp, /* file system mount point */
578 xfs_trans_t *tp, /* transaction pointer */
579 xfs_buf_t *agbp, /* (A only) buffer for agf structure */
580 /* (I only) buffer for agi structure */
581 xfs_agnumber_t agno, /* (AI only) allocation group number */
582 xfs_btnum_t btnum, /* btree identifier */
583 xfs_inode_t *ip, /* (B only) inode owning the btree */
584 int whichfork) /* (B only) data or attr fork */
585{
586 xfs_agf_t *agf; /* (A) allocation group freespace */
587 xfs_agi_t *agi; /* (I) allocation group inodespace */
588 xfs_btree_cur_t *cur; /* return value */
589 xfs_ifork_t *ifp; /* (B) inode fork pointer */
590 int nlevels=0; /* number of levels in the btree */
591
592 ASSERT(xfs_btree_cur_zone != NULL);
593 /*
594 * Allocate a new cursor.
595 */
596 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
597 /*
598 * Deduce the number of btree levels from the arguments.
599 */
600 switch (btnum) {
601 case XFS_BTNUM_BNO:
602 case XFS_BTNUM_CNT:
603 agf = XFS_BUF_TO_AGF(agbp);
604 nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT);
605 break;
606 case XFS_BTNUM_BMAP:
607 ifp = XFS_IFORK_PTR(ip, whichfork);
608 nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
609 break;
610 case XFS_BTNUM_INO:
611 agi = XFS_BUF_TO_AGI(agbp);
612 nlevels = INT_GET(agi->agi_level, ARCH_CONVERT);
613 break;
614 default:
615 ASSERT(0);
616 }
617 /*
618 * Fill in the common fields.
619 */
620 cur->bc_tp = tp;
621 cur->bc_mp = mp;
622 cur->bc_nlevels = nlevels;
623 cur->bc_btnum = btnum;
624 cur->bc_blocklog = mp->m_sb.sb_blocklog;
625 /*
626 * Fill in private fields.
627 */
628 switch (btnum) {
629 case XFS_BTNUM_BNO:
630 case XFS_BTNUM_CNT:
631 /*
632 * Allocation btree fields.
633 */
634 cur->bc_private.a.agbp = agbp;
635 cur->bc_private.a.agno = agno;
636 break;
637 case XFS_BTNUM_BMAP:
638 /*
639 * Bmap btree fields.
640 */
641 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
642 cur->bc_private.b.ip = ip;
643 cur->bc_private.b.firstblock = NULLFSBLOCK;
644 cur->bc_private.b.flist = NULL;
645 cur->bc_private.b.allocated = 0;
646 cur->bc_private.b.flags = 0;
647 cur->bc_private.b.whichfork = whichfork;
648 break;
649 case XFS_BTNUM_INO:
650 /*
651 * Inode allocation btree fields.
652 */
653 cur->bc_private.i.agbp = agbp;
654 cur->bc_private.i.agno = agno;
655 break;
656 default:
657 ASSERT(0);
658 }
659 return cur;
660}
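/*
 * Illustrative sketch (editor's example, not part of this file):
 * creating a by-block-number freespace btree cursor.  The (B only)
 * arguments are passed as NULL/0 since they are unused for the
 * allocation btrees, mirroring the A/B/I argument groups above.
 */
static xfs_btree_cur_t *
example_bnobt_cursor(
	xfs_mount_t		*mp,	/* file system mount point */
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_buf_t		*agbp,	/* buffer for agf structure */
	xfs_agnumber_t		agno)	/* allocation group number */
{
	return xfs_btree_init_cursor(mp, tp, agbp, agno,
			XFS_BTNUM_BNO, NULL, 0);
}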
661
662/*
663 * Check for the cursor referring to the last block at the given level.
664 */
665int /* 1=is last block, 0=not last block */
666xfs_btree_islastblock(
667 xfs_btree_cur_t *cur, /* btree cursor */
668 int level) /* level to check */
669{
670 xfs_btree_block_t *block; /* generic btree block pointer */
671 xfs_buf_t *bp; /* buffer containing block */
672
673 block = xfs_btree_get_block(cur, level, &bp);
674 xfs_btree_check_block(cur, block, level, bp);
675 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
676 return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO;
677 else
678 return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK;
679}
680
681/*
682 * Change the cursor to point to the last record in the current block
683 * at the given level. Other levels are unaffected.
684 */
685int /* success=1, failure=0 */
686xfs_btree_lastrec(
687 xfs_btree_cur_t *cur, /* btree cursor */
688 int level) /* level to change */
689{
690 xfs_btree_block_t *block; /* generic btree block pointer */
691 xfs_buf_t *bp; /* buffer containing block */
692
693 /*
694 * Get the block pointer for this level.
695 */
696 block = xfs_btree_get_block(cur, level, &bp);
697 xfs_btree_check_block(cur, block, level, bp);
698 /*
699 * It's empty, there is no such record.
700 */
701 if (!block->bb_h.bb_numrecs)
702 return 0;
703 /*
704 * Set the ptr value to numrecs, that's the last record/key.
705 */
706 cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT);
707 return 1;
708}
709
710/*
711 * Compute first and last byte offsets for the fields given.
712 * Interprets the offsets table, which contains struct field offsets.
713 */
714void
715xfs_btree_offsets(
716 __int64_t fields, /* bitmask of fields */
717 const short *offsets, /* table of field offsets */
718 int nbits, /* number of bits to inspect */
719 int *first, /* output: first byte offset */
720 int *last) /* output: last byte offset */
721{
722 int i; /* current bit number */
723 __int64_t imask; /* mask for current bit number */
724
725 ASSERT(fields != 0);
726 /*
727 * Find the lowest bit, so the first byte offset.
728 */
729 for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
730 if (imask & fields) {
731 *first = offsets[i];
732 break;
733 }
734 }
735 /*
736 * Find the highest bit, so the last byte offset.
737 */
738 for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
739 if (imask & fields) {
740 *last = offsets[i + 1] - 1;
741 break;
742 }
743 }
744}
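/*
 * Editor's worked example (not part of this file), assuming the
 * short-form header layout: 4-byte bb_magic at 0, 2-byte bb_level at
 * 4, 2-byte bb_numrecs at 6, 4-byte siblings at 8 and 12, and a
 * terminating sizeof entry of 16 in the offsets table.  Logging the
 * fields XFS_BB_NUMRECS|XFS_BB_LEFTSIB (bits 2 and 3) then yields:
 *
 *	lowest set bit 2:  *first = offsets[2]     = 6
 *	highest set bit 3: *last  = offsets[4] - 1 = 11
 *
 * so bytes 6..11 of the block header are logged as a single range.
 */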
745
746/*
747 * Get a buffer for the block, return it read in.
748 * Long-form addressing.
749 */
750int /* error */
751xfs_btree_read_bufl(
752 xfs_mount_t *mp, /* file system mount point */
753 xfs_trans_t *tp, /* transaction pointer */
754 xfs_fsblock_t fsbno, /* file system block number */
755 uint lock, /* lock flags for read_buf */
756 xfs_buf_t **bpp, /* buffer for fsbno */
757 int refval) /* ref count value for buffer */
758{
759 xfs_buf_t *bp; /* return value */
760 xfs_daddr_t d; /* real disk block address */
761 int error;
762
763 ASSERT(fsbno != NULLFSBLOCK);
764 d = XFS_FSB_TO_DADDR(mp, fsbno);
765 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
766 mp->m_bsize, lock, &bp))) {
767 return error;
768 }
769 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
770 if (bp != NULL) {
771 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
772 }
773 *bpp = bp;
774 return 0;
775}
776
777/*
778 * Get a buffer for the block, return it read in.
779 * Short-form addressing.
780 */
781int /* error */
782xfs_btree_read_bufs(
783 xfs_mount_t *mp, /* file system mount point */
784 xfs_trans_t *tp, /* transaction pointer */
785 xfs_agnumber_t agno, /* allocation group number */
786 xfs_agblock_t agbno, /* allocation group block number */
787 uint lock, /* lock flags for read_buf */
788 xfs_buf_t **bpp, /* buffer for agno/agbno */
789 int refval) /* ref count value for buffer */
790{
791 xfs_buf_t *bp; /* return value */
792 xfs_daddr_t d; /* real disk block address */
793 int error;
794
795 ASSERT(agno != NULLAGNUMBER);
796 ASSERT(agbno != NULLAGBLOCK);
797 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
798 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
799 mp->m_bsize, lock, &bp))) {
800 return error;
801 }
802 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
803 if (bp != NULL) {
804 switch (refval) {
805 case XFS_ALLOC_BTREE_REF:
806 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
807 break;
808 case XFS_INO_BTREE_REF:
809 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
810 break;
811 }
812 }
813 *bpp = bp;
814 return 0;
815}
816
817/*
818 * Read-ahead the block, don't wait for it, don't return a buffer.
819 * Long-form addressing.
820 */
821/* ARGSUSED */
822void
823xfs_btree_reada_bufl(
824 xfs_mount_t *mp, /* file system mount point */
825 xfs_fsblock_t fsbno, /* file system block number */
826 xfs_extlen_t count) /* count of filesystem blocks */
827{
828 xfs_daddr_t d;
829
830 ASSERT(fsbno != NULLFSBLOCK);
831 d = XFS_FSB_TO_DADDR(mp, fsbno);
832 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
833}
834
835/*
836 * Read-ahead the block, don't wait for it, don't return a buffer.
837 * Short-form addressing.
838 */
839/* ARGSUSED */
840void
841xfs_btree_reada_bufs(
842 xfs_mount_t *mp, /* file system mount point */
843 xfs_agnumber_t agno, /* allocation group number */
844 xfs_agblock_t agbno, /* allocation group block number */
845 xfs_extlen_t count) /* count of filesystem blocks */
846{
847 xfs_daddr_t d;
848
849 ASSERT(agno != NULLAGNUMBER);
850 ASSERT(agbno != NULLAGBLOCK);
851 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
852 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
853}
854
855/*
856 * Read-ahead btree blocks, at the given level.
857 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
858 */
859int
860xfs_btree_readahead_core(
861 xfs_btree_cur_t *cur, /* btree cursor */
862 int lev, /* level in btree */
863 int lr) /* left/right bits */
864{
865 xfs_alloc_block_t *a;
866 xfs_bmbt_block_t *b;
867 xfs_inobt_block_t *i;
868 int rval = 0;
869
870 ASSERT(cur->bc_bufs[lev] != NULL);
871 cur->bc_ra[lev] |= lr;
872 switch (cur->bc_btnum) {
873 case XFS_BTNUM_BNO:
874 case XFS_BTNUM_CNT:
875 a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
876 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
877 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
878 INT_GET(a->bb_leftsib, ARCH_CONVERT), 1);
879 rval++;
880 }
881 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
882 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
883 INT_GET(a->bb_rightsib, ARCH_CONVERT), 1);
884 rval++;
885 }
886 break;
887 case XFS_BTNUM_BMAP:
888 b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
889 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) {
890 xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1);
891 rval++;
892 }
893 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
894 xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1);
895 rval++;
896 }
897 break;
898 case XFS_BTNUM_INO:
899 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
900 if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
901 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
902 INT_GET(i->bb_leftsib, ARCH_CONVERT), 1);
903 rval++;
904 }
905 if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
906 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
907 INT_GET(i->bb_rightsib, ARCH_CONVERT), 1);
908 rval++;
909 }
910 break;
911 default:
912 ASSERT(0);
913 }
914 return rval;
915}
916
917/*
918 * Set the buffer for level "lev" in the cursor to bp, releasing
919 * any previous buffer.
920 */
921void
922xfs_btree_setbuf(
923 xfs_btree_cur_t *cur, /* btree cursor */
924 int lev, /* level in btree */
925 xfs_buf_t *bp) /* new buffer to set */
926{
927 xfs_btree_block_t *b; /* btree block */
928 xfs_buf_t *obp; /* old buffer pointer */
929
930 obp = cur->bc_bufs[lev];
931 if (obp)
932 xfs_trans_brelse(cur->bc_tp, obp);
933 cur->bc_bufs[lev] = bp;
934 cur->bc_ra[lev] = 0;
935 if (!bp)
936 return;
937 b = XFS_BUF_TO_BLOCK(bp);
938 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
939 if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO)
940 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
941 if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO)
942 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
943 } else {
944 if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK)
945 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
946 if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK)
947 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
948 }
949}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
new file mode 100644
index 000000000000..93872bba41f5
--- /dev/null
+++ b/fs/xfs/xfs_btree.h
@@ -0,0 +1,592 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BTREE_H__
33#define __XFS_BTREE_H__
34
35struct xfs_buf;
36struct xfs_bmap_free;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40
41/*
42 * This nonsense is to make -wlint happy.
43 */
44#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
45#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
46#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
47
48#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
49#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
50#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
51#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
52
53/*
54 * Short form header: space allocation btrees.
55 */
56typedef struct xfs_btree_sblock
57{
58 __uint32_t bb_magic; /* magic number for block type */
59 __uint16_t bb_level; /* 0 is a leaf */
60 __uint16_t bb_numrecs; /* current # of data records */
61 xfs_agblock_t bb_leftsib; /* left sibling block or NULLAGBLOCK */
62 xfs_agblock_t bb_rightsib; /* right sibling block or NULLAGBLOCK */
63} xfs_btree_sblock_t;
64
65/*
66 * Long form header: bmap btrees.
67 */
68typedef struct xfs_btree_lblock
69{
70 __uint32_t bb_magic; /* magic number for block type */
71 __uint16_t bb_level; /* 0 is a leaf */
72 __uint16_t bb_numrecs; /* current # of data records */
73 xfs_dfsbno_t bb_leftsib; /* left sibling block or NULLDFSBNO */
74 xfs_dfsbno_t bb_rightsib; /* right sibling block or NULLDFSBNO */
75} xfs_btree_lblock_t;
76
77/*
78 * Combined header and structure, used by common code.
79 */
80typedef struct xfs_btree_hdr
81{
82 __uint32_t bb_magic; /* magic number for block type */
83 __uint16_t bb_level; /* 0 is a leaf */
84 __uint16_t bb_numrecs; /* current # of data records */
85} xfs_btree_hdr_t;
86
87typedef struct xfs_btree_block
88{
89 xfs_btree_hdr_t bb_h; /* header */
90 union {
91 struct {
92 xfs_agblock_t bb_leftsib;
93 xfs_agblock_t bb_rightsib;
94 } s; /* short form pointers */
95 struct {
96 xfs_dfsbno_t bb_leftsib;
97 xfs_dfsbno_t bb_rightsib;
98 } l; /* long form pointers */
99 } bb_u; /* rest */
100} xfs_btree_block_t;
101
102/*
103 * For logging record fields.
104 */
105#define XFS_BB_MAGIC 0x01
106#define XFS_BB_LEVEL 0x02
107#define XFS_BB_NUMRECS 0x04
108#define XFS_BB_LEFTSIB 0x08
109#define XFS_BB_RIGHTSIB 0x10
110#define XFS_BB_NUM_BITS 5
111#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
112
113/*
114 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
115 */
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS)
117int xfs_btree_long_ptrs(xfs_btnum_t btnum);
118#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
119#else
120#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
121#endif
122
123/*
124 * Magic numbers for btree blocks.
125 */
126extern const __uint32_t xfs_magics[];
127
128/*
129 * Maximum and minimum records in a btree block.
130 * Given block size, type prefix, and leaf flag (0 or 1).
131 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
132 * compiler warnings.
133 */
134#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \
135 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
136 (((lf) * (uint)sizeof(t ## _rec_t)) + \
137 ((1 - (lf)) * \
138 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
139#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \
140 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
141
142/*
143 * Record, key, and pointer address calculation macros.
144 * Given block size, type prefix, block pointer, and index of requested entry
145 * (first entry numbered 1).
146 */
147#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \
148 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
149 ((i) - 1) * sizeof(t ## _rec_t)))
150#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \
151 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
152 ((i) - 1) * sizeof(t ## _key_t)))
153#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \
154 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
155 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
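/*
 * Editor's worked example (not part of this header), assuming a
 * 4096-byte block and typical allocation btree sizes (16-byte
 * short-form block header, 8-byte records and keys, 4-byte ptrs):
 *
 *	leaf:	  XFS_BTREE_BLOCK_MAXRECS(4096, xfs_alloc, 1)
 *		    = (4096 - 16) / 8       = 510 records
 *	non-leaf: XFS_BTREE_BLOCK_MAXRECS(4096, xfs_alloc, 0)
 *		    = (4096 - 16) / (8 + 4) = 340 key/ptr pairs
 *
 * XFS_BTREE_PTR_ADDR reflects the non-leaf layout, with all mxr keys
 * packed ahead of the pointers:
 *
 *	header | key[1..mxr] | ptr[1..mxr]	(entries start at 1)
 */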
156
157#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
158
159/*
160 * Btree cursor structure.
161 * This collects all information needed by the btree code in one place.
162 */
163typedef struct xfs_btree_cur
164{
165 struct xfs_trans *bc_tp; /* transaction we're in, if any */
166 struct xfs_mount *bc_mp; /* file system mount struct */
167 union {
168 xfs_alloc_rec_t a;
169 xfs_bmbt_irec_t b;
170 xfs_inobt_rec_t i;
171 } bc_rec; /* current insert/search record value */
172 struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */
173 int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */
174 __uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */
175#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */
176#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */
177 __uint8_t bc_nlevels; /* number of levels in the tree */
178 __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
179 xfs_btnum_t bc_btnum; /* identifies which btree type */
180 union {
181 struct { /* needed for BNO, CNT */
182 struct xfs_buf *agbp; /* agf buffer pointer */
183 xfs_agnumber_t agno; /* ag number */
184 } a;
185 struct { /* needed for BMAP */
186 struct xfs_inode *ip; /* pointer to our inode */
187 struct xfs_bmap_free *flist; /* list to free after */
188 xfs_fsblock_t firstblock; /* 1st blk allocated */
189 int allocated; /* count of alloced */
190 short forksize; /* fork's inode space */
191 char whichfork; /* data or attr fork */
192 char flags; /* flags */
193#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
194 } b;
195 struct { /* needed for INO */
196 struct xfs_buf *agbp; /* agi buffer pointer */
197 xfs_agnumber_t agno; /* ag number */
198 } i;
199 } bc_private; /* per-btree type data */
200} xfs_btree_cur_t;
201
202#define XFS_BTREE_NOERROR 0
203#define XFS_BTREE_ERROR 1
204
205/*
206 * Convert from buffer to btree block header.
207 */
208#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK)
209xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp);
210#define XFS_BUF_TO_BLOCK(bp) xfs_buf_to_block(bp)
211#else
212#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)(XFS_BUF_PTR(bp)))
213#endif
214#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK)
215xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp);
216#define XFS_BUF_TO_LBLOCK(bp) xfs_buf_to_lblock(bp)
217#else
218#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp)))
219#endif
220#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK)
221xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp);
222#define XFS_BUF_TO_SBLOCK(bp) xfs_buf_to_sblock(bp)
223#else
224#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp)))
225#endif
226
227#ifdef __KERNEL__
228
229#ifdef DEBUG
230/*
231 * Debug routine: check that block header is ok.
232 */
233void
234xfs_btree_check_block(
235 xfs_btree_cur_t *cur, /* btree cursor */
236 xfs_btree_block_t *block, /* generic btree block pointer */
237 int level, /* level of the btree block */
238 struct xfs_buf *bp); /* buffer containing block, if any */
239
240/*
241 * Debug routine: check that keys are in the right order.
242 */
243void
244xfs_btree_check_key(
245 xfs_btnum_t btnum, /* btree identifier */
246 void *ak1, /* pointer to left (lower) key */
247 void *ak2); /* pointer to right (higher) key */
248
249/*
250 * Debug routine: check that records are in the right order.
251 */
252void
253xfs_btree_check_rec(
254 xfs_btnum_t btnum, /* btree identifier */
255 void *ar1, /* pointer to left (lower) record */
256 void *ar2); /* pointer to right (higher) record */
257#else
258#define xfs_btree_check_block(a,b,c,d)
259#define xfs_btree_check_key(a,b,c)
260#define xfs_btree_check_rec(a,b,c)
261#endif /* DEBUG */
262
263/*
264 * Checking routine: check that long form block header is ok.
265 */
266int /* error (0 or EFSCORRUPTED) */
267xfs_btree_check_lblock(
268 xfs_btree_cur_t *cur, /* btree cursor */
269 xfs_btree_lblock_t *block, /* btree long form block pointer */
270 int level, /* level of the btree block */
271 struct xfs_buf *bp); /* buffer containing block, if any */
272
273/*
274 * Checking routine: check that (long) pointer is ok.
275 */
276int /* error (0 or EFSCORRUPTED) */
277xfs_btree_check_lptr(
278 xfs_btree_cur_t *cur, /* btree cursor */
279 xfs_dfsbno_t ptr, /* btree block disk address */
280 int level); /* btree block level */
281
282/*
283 * Checking routine: check that short form block header is ok.
284 */
285int /* error (0 or EFSCORRUPTED) */
286xfs_btree_check_sblock(
287 xfs_btree_cur_t *cur, /* btree cursor */
288 xfs_btree_sblock_t *block, /* btree short form block pointer */
289 int level, /* level of the btree block */
290 struct xfs_buf *bp); /* buffer containing block */
291
292/*
293 * Checking routine: check that (short) pointer is ok.
294 */
295int /* error (0 or EFSCORRUPTED) */
296xfs_btree_check_sptr(
297 xfs_btree_cur_t *cur, /* btree cursor */
298 xfs_agblock_t ptr, /* btree block disk address */
299 int level); /* btree block level */
300
301/*
302 * Delete the btree cursor.
303 */
304void
305xfs_btree_del_cursor(
306 xfs_btree_cur_t *cur, /* btree cursor */
307 int error); /* del because of error */
308
309/*
310 * Duplicate the btree cursor.
311 * Allocate a new one, copy the record, re-get the buffers.
312 */
313int /* error */
314xfs_btree_dup_cursor(
315 xfs_btree_cur_t *cur, /* input cursor */
316 xfs_btree_cur_t **ncur);/* output cursor */
317
318/*
319 * Change the cursor to point to the first record in the current block
320 * at the given level. Other levels are unaffected.
321 */
322int /* success=1, failure=0 */
323xfs_btree_firstrec(
324 xfs_btree_cur_t *cur, /* btree cursor */
325 int level); /* level to change */
326
327/*
328 * Retrieve the block pointer from the cursor at the given level.
329 * This may be a bmap btree root or from a buffer.
330 */
331xfs_btree_block_t * /* generic btree block pointer */
332xfs_btree_get_block(
333 xfs_btree_cur_t *cur, /* btree cursor */
334 int level, /* level in btree */
335 struct xfs_buf **bpp); /* buffer containing the block */
336
337/*
338 * Get a buffer for the block, return it with no data read.
339 * Long-form addressing.
340 */
341struct xfs_buf * /* buffer for fsbno */
342xfs_btree_get_bufl(
343 struct xfs_mount *mp, /* file system mount point */
344 struct xfs_trans *tp, /* transaction pointer */
345 xfs_fsblock_t fsbno, /* file system block number */
346 uint lock); /* lock flags for get_buf */
347
348/*
349 * Get a buffer for the block, return it with no data read.
350 * Short-form addressing.
351 */
352struct xfs_buf * /* buffer for agno/agbno */
353xfs_btree_get_bufs(
354 struct xfs_mount *mp, /* file system mount point */
355 struct xfs_trans *tp, /* transaction pointer */
356 xfs_agnumber_t agno, /* allocation group number */
357 xfs_agblock_t agbno, /* allocation group block number */
358 uint lock); /* lock flags for get_buf */
359
360/*
361 * Allocate a new btree cursor.
362 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
363 */
364xfs_btree_cur_t * /* new btree cursor */
365xfs_btree_init_cursor(
366 struct xfs_mount *mp, /* file system mount point */
367 struct xfs_trans *tp, /* transaction pointer */
368 struct xfs_buf *agbp, /* (A only) agf buffer; (I only) agi buffer */
369 xfs_agnumber_t agno, /* (AI only) allocation group number */
370 xfs_btnum_t btnum, /* btree identifier */
371 struct xfs_inode *ip, /* (B only) inode owning the btree */
372 int whichfork); /* (B only) data/attr fork */
373
374/*
375 * Check for the cursor referring to the last block at the given level.
376 */
377int /* 1=is last block, 0=not last block */
378xfs_btree_islastblock(
379 xfs_btree_cur_t *cur, /* btree cursor */
380 int level); /* level to check */
381
382/*
383 * Change the cursor to point to the last record in the current block
384 * at the given level. Other levels are unaffected.
385 */
386int /* success=1, failure=0 */
387xfs_btree_lastrec(
388 xfs_btree_cur_t *cur, /* btree cursor */
389 int level); /* level to change */
390
391/*
392 * Compute first and last byte offsets for the fields given.
393 * Interprets the offsets table, which contains struct field offsets.
394 */
395void
396xfs_btree_offsets(
397 __int64_t fields, /* bitmask of fields */
398 const short *offsets,/* table of field offsets */
399 int nbits, /* number of bits to inspect */
400 int *first, /* output: first byte offset */
401 int *last); /* output: last byte offset */
402
403/*
404 * Get a buffer for the block, return it read in.
405 * Long-form addressing.
406 */
407int /* error */
408xfs_btree_read_bufl(
409 struct xfs_mount *mp, /* file system mount point */
410 struct xfs_trans *tp, /* transaction pointer */
411 xfs_fsblock_t fsbno, /* file system block number */
412 uint lock, /* lock flags for read_buf */
413 struct xfs_buf **bpp, /* buffer for fsbno */
414 int refval);/* ref count value for buffer */
415
416/*
417 * Get a buffer for the block, return it read in.
418 * Short-form addressing.
419 */
420int /* error */
421xfs_btree_read_bufs(
422 struct xfs_mount *mp, /* file system mount point */
423 struct xfs_trans *tp, /* transaction pointer */
424 xfs_agnumber_t agno, /* allocation group number */
425 xfs_agblock_t agbno, /* allocation group block number */
426 uint lock, /* lock flags for read_buf */
427 struct xfs_buf **bpp, /* buffer for agno/agbno */
428 int refval);/* ref count value for buffer */
429
430/*
431 * Read-ahead the block, don't wait for it, don't return a buffer.
432 * Long-form addressing.
433 */
434void
435xfs_btree_reada_bufl(
436 struct xfs_mount *mp, /* file system mount point */
437 xfs_fsblock_t fsbno, /* file system block number */
438 xfs_extlen_t count); /* count of filesystem blocks */
439
440/*
441 * Read-ahead the block, don't wait for it, don't return a buffer.
442 * Short-form addressing.
443 */
444void
445xfs_btree_reada_bufs(
446 struct xfs_mount *mp, /* file system mount point */
447 xfs_agnumber_t agno, /* allocation group number */
448 xfs_agblock_t agbno, /* allocation group block number */
449 xfs_extlen_t count); /* count of filesystem blocks */
450
451/*
452 * Read-ahead btree blocks, at the given level.
453 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
454 */
455int /* readahead block count */
456xfs_btree_readahead_core(
457 xfs_btree_cur_t *cur, /* btree cursor */
458 int lev, /* level in btree */
459 int lr); /* left/right bits */
460
461static inline int /* readahead block count */
462xfs_btree_readahead(
463 xfs_btree_cur_t *cur, /* btree cursor */
464 int lev, /* level in btree */
465 int lr) /* left/right bits */
466{
467 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
468 return 0;
469
470 return xfs_btree_readahead_core(cur, lev, lr);
471}
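/*
 * Illustrative sketch (editor's example, not part of this header):
 * cursor movement code typically requests read-ahead of both siblings
 * before walking sideways; once bc_ra[lev] has the bits set, the
 * inline wrapper turns repeat calls into cheap no-ops.
 */
static inline void
example_readahead_both(
	xfs_btree_cur_t	*cur,	/* btree cursor */
	int		lev)	/* level in btree */
{
	xfs_btree_readahead(cur, lev,
		XFS_BTCUR_LEFTRA | XFS_BTCUR_RIGHTRA);
}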
472
473
474/*
475 * Set the buffer for level "lev" in the cursor to bp, releasing
476 * any previous buffer.
477 */
478void
479xfs_btree_setbuf(
480 xfs_btree_cur_t *cur, /* btree cursor */
481 int lev, /* level in btree */
482 struct xfs_buf *bp); /* new buffer to set */
483
484#endif /* __KERNEL__ */
485
486
487/*
488 * Min and max functions for extlen, agblock, fileoff, and filblks types.
489 */
490#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN)
491xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b);
492#define XFS_EXTLEN_MIN(a,b) xfs_extlen_min(a,b)
493#else
494#define XFS_EXTLEN_MIN(a,b) \
495 ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \
496 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
497#endif
498#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX)
499xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b);
500#define XFS_EXTLEN_MAX(a,b) xfs_extlen_max(a,b)
501#else
502#define XFS_EXTLEN_MAX(a,b) \
503 ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \
504 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
505#endif
506
507#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN)
508xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b);
509#define XFS_AGBLOCK_MIN(a,b) xfs_agblock_min(a,b)
510#else
511#define XFS_AGBLOCK_MIN(a,b) \
512 ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \
513 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
514#endif
515#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX)
516xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b);
517#define XFS_AGBLOCK_MAX(a,b) xfs_agblock_max(a,b)
518#else
519#define XFS_AGBLOCK_MAX(a,b) \
520 ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
521 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
522#endif
523
524#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN)
525xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b);
526#define XFS_FILEOFF_MIN(a,b) xfs_fileoff_min(a,b)
527#else
528#define XFS_FILEOFF_MIN(a,b) \
529 ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
530 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
531#endif
532#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX)
533xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b);
534#define XFS_FILEOFF_MAX(a,b) xfs_fileoff_max(a,b)
535#else
536#define XFS_FILEOFF_MAX(a,b) \
537 ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
538 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
539#endif
540
541#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN)
542xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b);
543#define XFS_FILBLKS_MIN(a,b) xfs_filblks_min(a,b)
544#else
545#define XFS_FILBLKS_MIN(a,b) \
546 ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
547 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
548#endif
549#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX)
550xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b);
551#define XFS_FILBLKS_MAX(a,b) xfs_filblks_max(a,b)
552#else
553#define XFS_FILBLKS_MAX(a,b) \
554 ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
555 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
556#endif
557#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK)
558int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb);
559#define XFS_FSB_SANITY_CHECK(mp,fsb) xfs_fsb_sanity_check(mp,fsb)
560#else
561#define XFS_FSB_SANITY_CHECK(mp,fsb) \
562 (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
563 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
564#endif
565
566/*
567 * Macros to set EFSCORRUPTED & return/branch.
568 */
569#define XFS_WANT_CORRUPTED_GOTO(x,l) \
570 { \
571 int fs_is_ok = (x); \
572 ASSERT(fs_is_ok); \
573 if (unlikely(!fs_is_ok)) { \
574 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
575 XFS_ERRLEVEL_LOW, NULL); \
576 error = XFS_ERROR(EFSCORRUPTED); \
577 goto l; \
578 } \
579 }
580
581#define XFS_WANT_CORRUPTED_RETURN(x) \
582 { \
583 int fs_is_ok = (x); \
584 ASSERT(fs_is_ok); \
585 if (unlikely(!fs_is_ok)) { \
586 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
587 XFS_ERRLEVEL_LOW, NULL); \
588 return XFS_ERROR(EFSCORRUPTED); \
589 } \
590 }
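/*
 * Editor's note (not part of this header): xfs_btree_check_lptr() and
 * xfs_btree_check_sptr() in xfs_btree.c are canonical users of
 * XFS_WANT_CORRUPTED_RETURN -- the predicate is ASSERTed in DEBUG
 * builds and turned into an EFSCORRUPTED return otherwise.  The GOTO
 * form expects an `error' variable and a label in the caller, e.g.:
 *
 *	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 */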
591
592#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
new file mode 100644
index 000000000000..9ab0039f07df
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1221 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_buf_log_item.
35 * It contains the item operations used to manipulate the buf log
36 * items as well as utility routines used by the buffer specific
37 * transaction routines.
38 */
39
40#include "xfs.h"
41
42#include "xfs_macros.h"
43#include "xfs_types.h"
44#include "xfs_inum.h"
45#include "xfs_log.h"
46#include "xfs_trans.h"
47#include "xfs_buf_item.h"
48#include "xfs_sb.h"
49#include "xfs_dir.h"
50#include "xfs_dmapi.h"
51#include "xfs_mount.h"
52#include "xfs_trans_priv.h"
53#include "xfs_rw.h"
54#include "xfs_bit.h"
55#include "xfs_error.h"
56
57
58kmem_zone_t *xfs_buf_item_zone;
59
60#ifdef XFS_TRANS_DEBUG
61/*
62 * This function uses an alternate strategy for tracking the bytes
63 * that the user requests to be logged. This can then be used
64 * in conjunction with the bli_orig array in the buf log item to
65 * catch bugs in our callers' code.
66 *
67 * We also double check the bits set in xfs_buf_item_log using a
68 * simple algorithm to check that every byte is accounted for.
69 */
70STATIC void
71xfs_buf_item_log_debug(
72 xfs_buf_log_item_t *bip,
73 uint first,
74 uint last)
75{
76 uint x;
77 uint byte;
78 uint nbytes;
79 uint chunk_num;
80 uint word_num;
81 uint bit_num;
82 uint bit_set;
83 uint *wordp;
84
85 ASSERT(bip->bli_logged != NULL);
86 byte = first;
87 nbytes = last - first + 1;
88 bfset(bip->bli_logged, first, nbytes);
89 for (x = 0; x < nbytes; x++) {
90 chunk_num = byte >> XFS_BLI_SHIFT;
91 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
92 bit_num = chunk_num & (NBWORD - 1);
93 wordp = &(bip->bli_format.blf_data_map[word_num]);
94 bit_set = *wordp & (1 << bit_num);
95 ASSERT(bit_set);
96 byte++;
97 }
98}
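/*
 * Editor's worked example (not part of this file), assuming the usual
 * 128-byte chunks (XFS_BLI_SHIFT == 7) and 32-bit map words
 * (BIT_TO_WORD_SHIFT == 5, NBWORD == 32): byte 5000 falls in chunk
 * 5000 >> 7 == 39, which is tracked by bit 39 & 31 == 7 of word
 * 39 >> 5 == 1 in blf_data_map[].
 */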
99
100/*
101 * This function is called when we flush something into a buffer without
102 * logging it. This happens for things like inodes which are logged
103 * separately from the buffer.
104 */
105void
106xfs_buf_item_flush_log_debug(
107 xfs_buf_t *bp,
108 uint first,
109 uint last)
110{
111 xfs_buf_log_item_t *bip;
112 uint nbytes;
113
114 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
115 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
116 return;
117 }
118
119 ASSERT(bip->bli_logged != NULL);
120 nbytes = last - first + 1;
121 bfset(bip->bli_logged, first, nbytes);
122}
123
124/*
125 * This function is called to verify that our callers have logged
126 * all the bytes that they changed.
127 *
128 * It does this by comparing the original copy of the buffer stored in
129 * the buf log item's bli_orig array to the current copy of the buffer
130 * and ensuring that all bytes which miscompare are set in the bli_logged
131 * array of the buf log item.
132 */
133STATIC void
134xfs_buf_item_log_check(
135 xfs_buf_log_item_t *bip)
136{
137 char *orig;
138 char *buffer;
139 int x;
140 xfs_buf_t *bp;
141
142 ASSERT(bip->bli_orig != NULL);
143 ASSERT(bip->bli_logged != NULL);
144
145 bp = bip->bli_buf;
146 ASSERT(XFS_BUF_COUNT(bp) > 0);
147 ASSERT(XFS_BUF_PTR(bp) != NULL);
148 orig = bip->bli_orig;
149 buffer = XFS_BUF_PTR(bp);
150 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
151 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
152 cmn_err(CE_PANIC,
153 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
154 bip, bp, orig, x);
155 }
156}
157#else
158#define xfs_buf_item_log_debug(x,y,z)
159#define xfs_buf_item_log_check(x)
160#endif
161
162STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
163STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
164
165/*
166 * This returns the number of log iovecs needed to log the
167 * given buf log item.
168 *
169 * It calculates this as 1 iovec for the buf log format structure
170 * and 1 for each stretch of non-contiguous chunks to be logged.
171 * Contiguous chunks are logged in a single iovec.
172 *
173 * If the XFS_BLI_STALE flag has been set, then log nothing.
174 */
175uint
176xfs_buf_item_size(
177 xfs_buf_log_item_t *bip)
178{
179 uint nvecs;
180 int next_bit;
181 int last_bit;
182 xfs_buf_t *bp;
183
184 ASSERT(atomic_read(&bip->bli_refcount) > 0);
185 if (bip->bli_flags & XFS_BLI_STALE) {
186 /*
187 * The buffer is stale, so all we need to log
188 * is the buf log format structure with the
189 * cancel flag in it.
190 */
191 xfs_buf_item_trace("SIZE STALE", bip);
192 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
193 return 1;
194 }
195
196 bp = bip->bli_buf;
197 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
198 nvecs = 1;
199 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
200 bip->bli_format.blf_map_size, 0);
201 ASSERT(last_bit != -1);
202 nvecs++;
203 while (last_bit != -1) {
204 /*
205 * This takes the bit number to start looking from and
206 * returns the next set bit from there. It returns -1
207 * if there are no more bits set or the start bit is
208 * beyond the end of the bitmap.
209 */
210 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
211 bip->bli_format.blf_map_size,
212 last_bit + 1);
213 /*
214 * If we run out of bits, leave the loop,
215 * else if we find a new set of bits bump the number of vecs,
216 * else keep scanning the current set of bits.
217 */
218 if (next_bit == -1) {
219 last_bit = -1;
220 } else if (next_bit != last_bit + 1) {
221 last_bit = next_bit;
222 nvecs++;
223 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
224 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
225 XFS_BLI_CHUNK)) {
226 last_bit = next_bit;
227 nvecs++;
228 } else {
229 last_bit++;
230 }
231 }
232
233 xfs_buf_item_trace("SIZE NORM", bip);
234 return nvecs;
235}
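/*
 * Editor's worked example (not part of this file): with a data map of
 * 0x73 (binary 1110011, i.e. chunks 0-1 and 4-6 dirty), the scan
 * above counts 1 iovec for the format structure plus 1 per contiguous
 * run, giving nvecs == 3 -- provided each run is also contiguous in
 * the buffer's virtual mapping, which the xfs_buf_offset() comparison
 * checks for discontiguously mapped pages.
 */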
236
237/*
238 * This is called to fill in the vector of log iovecs for the
239 * given log buf item. It fills the first entry with a buf log
240 * format structure, and the rest point to contiguous chunks
241 * within the buffer.
242 */
243void
244xfs_buf_item_format(
245 xfs_buf_log_item_t *bip,
246 xfs_log_iovec_t *log_vector)
247{
248 uint base_size;
249 uint nvecs;
250 xfs_log_iovec_t *vecp;
251 xfs_buf_t *bp;
252 int first_bit;
253 int last_bit;
254 int next_bit;
255 uint nbits;
256 uint buffer_offset;
257
258 ASSERT(atomic_read(&bip->bli_refcount) > 0);
259 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
260 (bip->bli_flags & XFS_BLI_STALE));
261 bp = bip->bli_buf;
262 ASSERT(XFS_BUF_BP_ISMAPPED(bp));
263 vecp = log_vector;
264
265 /*
266 * The size of the base structure is the size of the
267 * declared structure plus the space for the extra words
268 * of the bitmap. We subtract one from the map size, because
269 * the first element of the bitmap is accounted for in the
270 * size of the base structure.
271 */
272 base_size =
273 (uint)(sizeof(xfs_buf_log_format_t) +
274 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
275 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
276 vecp->i_len = base_size;
277 vecp++;
278 nvecs = 1;
279
280 if (bip->bli_flags & XFS_BLI_STALE) {
281 /*
282 * The buffer is stale, so all we need to log
283 * is the buf log format structure with the
284 * cancel flag in it.
285 */
286 xfs_buf_item_trace("FORMAT STALE", bip);
287 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
288 bip->bli_format.blf_size = nvecs;
289 return;
290 }
291
292 /*
293 * Fill in an iovec for each set of contiguous chunks.
294 */
295 first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
296 bip->bli_format.blf_map_size, 0);
297 ASSERT(first_bit != -1);
298 last_bit = first_bit;
299 nbits = 1;
300 for (;;) {
301 /*
302 * This takes the bit number to start looking from and
303 * returns the next set bit from there. It returns -1
304 * if there are no more bits set or the start bit is
305 * beyond the end of the bitmap.
306 */
307 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
308 bip->bli_format.blf_map_size,
309 (uint)last_bit + 1);
310 /*
311 * If we run out of bits fill in the last iovec and get
312 * out of the loop.
313 * Else if we start a new set of bits then fill in the
314 * iovec for the series we were looking at and start
315 * counting the bits in the new one.
316 * Else we're still in the same set of bits so just
317 * keep counting and scanning.
318 */
319 if (next_bit == -1) {
320 buffer_offset = first_bit * XFS_BLI_CHUNK;
321 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
322 vecp->i_len = nbits * XFS_BLI_CHUNK;
323 nvecs++;
324 break;
325 } else if (next_bit != last_bit + 1) {
326 buffer_offset = first_bit * XFS_BLI_CHUNK;
327 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
328 vecp->i_len = nbits * XFS_BLI_CHUNK;
329 nvecs++;
330 vecp++;
331 first_bit = next_bit;
332 last_bit = next_bit;
333 nbits = 1;
334 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
335 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
336 XFS_BLI_CHUNK)) {
337 buffer_offset = first_bit * XFS_BLI_CHUNK;
338 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
339 vecp->i_len = nbits * XFS_BLI_CHUNK;
340/* You would think we need to bump nvecs here too, but we do not:
341 * this number is used by recovery, and it gets confused by the
342 * boundary split here, so we intentionally skip the following.
343 * nvecs++;
344 */
345 vecp++;
346 first_bit = next_bit;
347 last_bit = next_bit;
348 nbits = 1;
349 } else {
350 last_bit++;
351 nbits++;
352 }
353 }
354 bip->bli_format.blf_size = nvecs;
355
356 /*
357 * Check to make sure everything is consistent.
358 */
359 xfs_buf_item_trace("FORMAT NORM", bip);
360 xfs_buf_item_log_check(bip);
361}
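/*
 * Worked example (hypothetical map size): for blf_map_size == 2,
 * the base_size computed above is sizeof(xfs_buf_log_format_t) +
 * (2 - 1) * sizeof(uint), because blf_data_map[1] in the declared
 * structure already accounts for the first bitmap word.
 */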
362
363/*
364 * This is called to pin the buffer associated with the buf log
365 * item in memory so it cannot be written out. Simply call bpin()
366 * on the buffer to do this.
367 */
368void
369xfs_buf_item_pin(
370 xfs_buf_log_item_t *bip)
371{
372 xfs_buf_t *bp;
373
374 bp = bip->bli_buf;
375 ASSERT(XFS_BUF_ISBUSY(bp));
376 ASSERT(atomic_read(&bip->bli_refcount) > 0);
377 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
378 (bip->bli_flags & XFS_BLI_STALE));
379 xfs_buf_item_trace("PIN", bip);
380 xfs_buftrace("XFS_PIN", bp);
381 xfs_bpin(bp);
382}
383
384
385/*
386 * This is called to unpin the buffer associated with the buf log
387 * item which was previously pinned with a call to xfs_buf_item_pin().
388 * Just call bunpin() on the buffer to do this.
389 *
390 * Also drop the reference to the buf item for the current transaction.
391 * If the XFS_BLI_STALE flag is set and we are the last reference,
392 * then free up the buf log item and unlock the buffer.
393 */
394void
395xfs_buf_item_unpin(
396 xfs_buf_log_item_t *bip,
397 int stale)
398{
399 xfs_mount_t *mp;
400 xfs_buf_t *bp;
401 int freed;
402 SPLDECL(s);
403
404 bp = bip->bli_buf;
405 ASSERT(bp != NULL);
406 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
407 ASSERT(atomic_read(&bip->bli_refcount) > 0);
408 xfs_buf_item_trace("UNPIN", bip);
409 xfs_buftrace("XFS_UNPIN", bp);
410
411 freed = atomic_dec_and_test(&bip->bli_refcount);
412 mp = bip->bli_item.li_mountp;
413 xfs_bunpin(bp);
414 if (freed && stale) {
415 ASSERT(bip->bli_flags & XFS_BLI_STALE);
416 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
417 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
418 ASSERT(XFS_BUF_ISSTALE(bp));
419 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
420 xfs_buf_item_trace("UNPIN STALE", bip);
421 xfs_buftrace("XFS_UNPIN STALE", bp);
422 /*
423 * If we get called here because of an IO error, we may
424 * or may not have the item on the AIL. xfs_trans_delete_ail()
425 * will take care of that situation.
426 * xfs_trans_delete_ail() drops the AIL lock.
427 */
428 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
429 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
430 XFS_BUF_SET_FSPRIVATE(bp, NULL);
431 XFS_BUF_CLR_IODONE_FUNC(bp);
432 } else {
433 AIL_LOCK(mp,s);
434 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
435 xfs_buf_item_relse(bp);
436 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
437 }
438 xfs_buf_relse(bp);
439 }
440}
441
442/*
443 * This is called from uncommit in the forced-shutdown path.
444 * We need to check whether the reference count on the log item
445 * is going to drop to zero. If so, unpin will free the log item,
446 * so we need to free the item's descriptor (which points to the
447 * item) in the transaction.
448 */
449void
450xfs_buf_item_unpin_remove(
451 xfs_buf_log_item_t *bip,
452 xfs_trans_t *tp)
453{
454 xfs_buf_t *bp;
455 xfs_log_item_desc_t *lidp;
456 int stale = 0;
457
458 bp = bip->bli_buf;
459 /*
460 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
461 */
462 if ((atomic_read(&bip->bli_refcount) == 1) &&
463 (bip->bli_flags & XFS_BLI_STALE)) {
464 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
465 xfs_buf_item_trace("UNPIN REMOVE", bip);
466 xfs_buftrace("XFS_UNPIN_REMOVE", bp);
467 /*
468		 * Yes -- clear the transaction descriptor's in-use flag
469		 * and free the chunk if required. We can safely
470		 * do some work here and then call buf_item_unpin
471		 * to do the rest, because when the condition above
472		 * holds we hold the buffer locked, so no one else
473		 * will be able to bump up the refcount.
474 */
475 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
476 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
477 xfs_trans_free_item(tp, lidp);
478 /*
479 * Since the transaction no longer refers to the buffer,
480 * the buffer should no longer refer to the transaction.
481 */
482 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
483 }
484
485 xfs_buf_item_unpin(bip, stale);
486
487 return;
488}
489
490/*
491 * This is called to attempt to lock the buffer associated with this
492 * buf log item. Don't sleep on the buffer lock. If we can't get
493 * the lock right away, return 0. If we can get the lock, pull the
494 * buffer from the free list, mark it busy, and return 1.
495 */
496uint
497xfs_buf_item_trylock(
498 xfs_buf_log_item_t *bip)
499{
500 xfs_buf_t *bp;
501
502 bp = bip->bli_buf;
503
504 if (XFS_BUF_ISPINNED(bp)) {
505 return XFS_ITEM_PINNED;
506 }
507
508 if (!XFS_BUF_CPSEMA(bp)) {
509 return XFS_ITEM_LOCKED;
510 }
511
512 /*
513 * Remove the buffer from the free list. Only do this
514 * if it's on the free list. Private buffers like the
515 * superblock buffer are not.
516 */
517 XFS_BUF_HOLD(bp);
518
519 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
520 xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
521 return XFS_ITEM_SUCCESS;
522}
523
524/*
525 * Release the buffer associated with the buf log item.
526 * If there is no dirty logged data associated with the
527 * buffer recorded in the buf log item, then free the
528 * buf log item and remove the reference to it in the
529 * buffer.
530 *
531 * This call ignores the recursion count. It is only called
532 * when the buffer should REALLY be unlocked, regardless
533 * of the recursion count.
534 *
535 * If the XFS_BLI_HOLD flag is set in the buf log item, then
536 * free the log item if necessary but do not unlock the buffer.
537 * This is for support of xfs_trans_bhold(). Make sure the
538 * XFS_BLI_HOLD field is cleared if we don't free the item.
539 */
540void
541xfs_buf_item_unlock(
542 xfs_buf_log_item_t *bip)
543{
544 int aborted;
545 xfs_buf_t *bp;
546 uint hold;
547
548 bp = bip->bli_buf;
549 xfs_buftrace("XFS_UNLOCK", bp);
550
551 /*
552 * Clear the buffer's association with this transaction.
553 */
554 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
555
556 /*
557 * If this is a transaction abort, don't return early.
558 * Instead, allow the brelse to happen.
559 * Normally it would be done for stale (cancelled) buffers
560 * at unpin time, but we'll never go through the pin/unpin
561 * cycle if we abort inside commit.
562 */
563 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
564
565 /*
566 * If the buf item is marked stale, then don't do anything.
567 * We'll unlock the buffer and free the buf item when the
568 * buffer is unpinned for the last time.
569 */
570 if (bip->bli_flags & XFS_BLI_STALE) {
571 bip->bli_flags &= ~XFS_BLI_LOGGED;
572 xfs_buf_item_trace("UNLOCK STALE", bip);
573 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
574 if (!aborted)
575 return;
576 }
577
578 /*
579 * Drop the transaction's reference to the log item if
580 * it was not logged as part of the transaction. Otherwise
581 * we'll drop the reference in xfs_buf_item_unpin() when
582 * the transaction is really through with the buffer.
583 */
584 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
585 atomic_dec(&bip->bli_refcount);
586 } else {
587 /*
588 * Clear the logged flag since this is per
589 * transaction state.
590 */
591 bip->bli_flags &= ~XFS_BLI_LOGGED;
592 }
593
594 /*
595 * Before possibly freeing the buf item, determine if we should
596 * release the buffer at the end of this routine.
597 */
598 hold = bip->bli_flags & XFS_BLI_HOLD;
599 xfs_buf_item_trace("UNLOCK", bip);
600
601 /*
602 * If the buf item isn't tracking any data, free it.
603 * Otherwise, if XFS_BLI_HOLD is set clear it.
604 */
605 if (xfs_count_bits(bip->bli_format.blf_data_map,
606 bip->bli_format.blf_map_size, 0) == 0) {
607 xfs_buf_item_relse(bp);
608 } else if (hold) {
609 bip->bli_flags &= ~XFS_BLI_HOLD;
610 }
611
612 /*
613 * Release the buffer if XFS_BLI_HOLD was not set.
614 */
615 if (!hold) {
616 xfs_buf_relse(bp);
617 }
618}
619
620/*
621 * This is called to find out where the oldest active copy of the
622 * buf log item in the on disk log resides now that the last log
623 * write of it completed at the given lsn.
624 * We always re-log all the dirty data in a buffer, so usually the
625 * latest copy in the on disk log is the only one that matters. For
626 * those cases we simply return the given lsn.
627 *
628 * The one exception to this is for buffers full of newly allocated
629 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
630 * flag set, indicating that only the di_next_unlinked fields from the
631 * inodes in the buffers will be replayed during recovery. If the
632 * original newly allocated inode images have not yet been flushed
633 * when the buffer is so relogged, then we need to make sure that we
634 * keep the old images in the 'active' portion of the log. We do this
635 * by returning the original lsn of that transaction here rather than
636 * the current one.
637 */
638xfs_lsn_t
639xfs_buf_item_committed(
640 xfs_buf_log_item_t *bip,
641 xfs_lsn_t lsn)
642{
643 xfs_buf_item_trace("COMMITTED", bip);
644 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
645 (bip->bli_item.li_lsn != 0)) {
646 return bip->bli_item.li_lsn;
647 }
648 return (lsn);
649}
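/*
 * Illustrative example (hypothetical LSN values): if an inode
 * allocation buffer was first committed at lsn 100 (li_lsn == 100)
 * and is relogged at lsn 200, this returns 100 so that the original
 * inode images remain in the active portion of the log.
 */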
650
651/*
652 * This is called when the transaction holding the buffer is aborted.
653 * Just behave as if the transaction had been cancelled. If we're shutting down
654 * and have aborted this transaction, we'll trap this buffer when it tries to
655 * get written out.
656 */
657void
658xfs_buf_item_abort(
659 xfs_buf_log_item_t *bip)
660{
661 xfs_buf_t *bp;
662
663 bp = bip->bli_buf;
664 xfs_buftrace("XFS_ABORT", bp);
665 XFS_BUF_SUPER_STALE(bp);
666 xfs_buf_item_unlock(bip);
667 return;
668}
669
670/*
671 * This is called to asynchronously write the buffer associated with this
672 * buf log item out to disk. The buffer will already have been locked by
673 * a successful call to xfs_buf_item_trylock(). If the buffer still has
674 * B_DELWRI set, then get it going out to disk with a call to bawrite().
675 * If not, then just release the buffer.
676 */
677void
678xfs_buf_item_push(
679 xfs_buf_log_item_t *bip)
680{
681 xfs_buf_t *bp;
682
683 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
684 xfs_buf_item_trace("PUSH", bip);
685
686 bp = bip->bli_buf;
687
688 if (XFS_BUF_ISDELAYWRITE(bp)) {
689 xfs_bawrite(bip->bli_item.li_mountp, bp);
690 } else {
691 xfs_buf_relse(bp);
692 }
693}
694
695/* ARGSUSED */
696void
697xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
698{
699}
700
701/*
702 * This is the ops vector shared by all buf log items.
703 */
704struct xfs_item_ops xfs_buf_item_ops = {
705 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
706 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
707 xfs_buf_item_format,
708 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
709 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
710 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
711 xfs_buf_item_unpin_remove,
712 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
713 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
714 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
715 xfs_buf_item_committed,
716 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
717 .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
718 .iop_pushbuf = NULL,
719 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
720 xfs_buf_item_committing
721};
722
723
724/*
725 * Allocate a new buf log item to go with the given buffer.
726 * Set the buffer's b_fsprivate field to point to the new
727 * buf log item. If there are other items attached to the
728 * buffer (see xfs_buf_attach_iodone() below), then put the
729 * buf log item at the front.
730 */
731void
732xfs_buf_item_init(
733 xfs_buf_t *bp,
734 xfs_mount_t *mp)
735{
736 xfs_log_item_t *lip;
737 xfs_buf_log_item_t *bip;
738 int chunks;
739 int map_size;
740
741 /*
742 * Check to see if there is already a buf log item for
743 * this buffer. If there is, it is guaranteed to be
744 * the first. If we do already have one, there is
745 * nothing to do here so return.
746 */
747 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
748 XFS_BUF_SET_FSPRIVATE3(bp, mp);
749 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
750 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
751 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
752 if (lip->li_type == XFS_LI_BUF) {
753 return;
754 }
755 }
756
757 /*
758 * chunks is the number of XFS_BLI_CHUNK size pieces
759 * the buffer can be divided into. Make sure not to
760 * truncate any pieces. map_size is the size of the
761 * bitmap needed to describe the chunks of the buffer.
762 */
763 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
764 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
765
766 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
767 KM_SLEEP);
768 bip->bli_item.li_type = XFS_LI_BUF;
769 bip->bli_item.li_ops = &xfs_buf_item_ops;
770 bip->bli_item.li_mountp = mp;
771 bip->bli_buf = bp;
772 bip->bli_format.blf_type = XFS_LI_BUF;
773 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
774 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
775 bip->bli_format.blf_map_size = map_size;
776#ifdef XFS_BLI_TRACE
777 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
778#endif
779
780#ifdef XFS_TRANS_DEBUG
781 /*
782 * Allocate the arrays for tracking what needs to be logged
783 * and what our callers request to be logged. bli_orig
784 * holds a copy of the original, clean buffer for comparison
785 * against, and bli_logged keeps a 1 bit flag per byte in
786 * the buffer to indicate which bytes the callers have asked
787 * to have logged.
788 */
789 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
790 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
791 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
792#endif
793
794 /*
795 * Put the buf item into the list of items attached to the
796 * buffer at the front.
797 */
798 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
799 bip->bli_item.li_bio_list =
800 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
801 }
802 XFS_BUF_SET_FSPRIVATE(bp, bip);
803}
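/*
 * Worked example (hypothetical 4096 byte buffer): chunks =
 * (4096 + 127) >> 7 = 32 and map_size = (32 + NBWORD) >> 5 = 2
 * words; the "+ NBWORD" rounds up, so at most one extra bitmap
 * word is allocated.
 */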
804
805
806/*
807 * Mark bytes first through last inclusive as dirty in the buf
808 * item's bitmap.
809 */
810void
811xfs_buf_item_log(
812 xfs_buf_log_item_t *bip,
813 uint first,
814 uint last)
815{
816 uint first_bit;
817 uint last_bit;
818 uint bits_to_set;
819 uint bits_set;
820 uint word_num;
821 uint *wordp;
822 uint bit;
823 uint end_bit;
824 uint mask;
825
826 /*
827 * Mark the item as having some dirty data for
828 * quick reference in xfs_buf_item_dirty.
829 */
830 bip->bli_flags |= XFS_BLI_DIRTY;
831
832 /*
833 * Convert byte offsets to bit numbers.
834 */
835 first_bit = first >> XFS_BLI_SHIFT;
836 last_bit = last >> XFS_BLI_SHIFT;
837
838 /*
839 * Calculate the total number of bits to be set.
840 */
841 bits_to_set = last_bit - first_bit + 1;
842
843 /*
844 * Get a pointer to the first word in the bitmap
845 * to set a bit in.
846 */
847 word_num = first_bit >> BIT_TO_WORD_SHIFT;
848 wordp = &(bip->bli_format.blf_data_map[word_num]);
849
850 /*
851 * Calculate the starting bit in the first word.
852 */
853 bit = first_bit & (uint)(NBWORD - 1);
854
855 /*
856 * First set any bits in the first word of our range.
857 * If it starts at bit 0 of the word, it will be
858 * set below rather than here. That is what the variable
859 * bit tells us. The variable bits_set tracks the number
860 * of bits that have been set so far. End_bit is the number
861 * of the last bit to be set in this word plus one.
862 */
863 if (bit) {
864 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
865 mask = ((1 << (end_bit - bit)) - 1) << bit;
866 *wordp |= mask;
867 wordp++;
868 bits_set = end_bit - bit;
869 } else {
870 bits_set = 0;
871 }
872
873 /*
874 * Now set bits a whole word at a time that are between
875 * first_bit and last_bit.
876 */
877 while ((bits_to_set - bits_set) >= NBWORD) {
878 *wordp |= 0xffffffff;
879 bits_set += NBWORD;
880 wordp++;
881 }
882
883 /*
884 * Finally, set any bits left to be set in one last partial word.
885 */
886 end_bit = bits_to_set - bits_set;
887 if (end_bit) {
888 mask = (1 << end_bit) - 1;
889 *wordp |= mask;
890 }
891
892 xfs_buf_item_log_debug(bip, first, last);
893}
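/*
 * Worked example (hypothetical byte range): logging bytes 640-1023
 * gives first_bit = 640 >> 7 = 5, last_bit = 1023 >> 7 = 7 and
 * bits_to_set = 3; then word_num = 0, bit = 5, end_bit = 8, and the
 * partial-word mask is ((1 << 3) - 1) << 5 = 0xe0, i.e. bits 5-7 of
 * the first bitmap word.
 */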
894
895
896/*
897 * Return 1 if the buffer has some data that has been logged (at any
898 * point, not just the current transaction) and 0 if not.
899 */
900uint
901xfs_buf_item_dirty(
902 xfs_buf_log_item_t *bip)
903{
904 return (bip->bli_flags & XFS_BLI_DIRTY);
905}
906
907/*
908 * This is called when the buf log item is no longer needed. It should
909 * free the buf log item associated with the given buffer and clear
910 * the buffer's pointer to the buf log item. If there are no more
911 * items in the list, clear the b_iodone field of the buffer (see
912 * xfs_buf_attach_iodone() below).
913 */
914void
915xfs_buf_item_relse(
916 xfs_buf_t *bp)
917{
918 xfs_buf_log_item_t *bip;
919
920 xfs_buftrace("XFS_RELSE", bp);
921 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
922 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
923 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
924 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
925 ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
926 XFS_BUF_CLR_IODONE_FUNC(bp);
927 }
928
929#ifdef XFS_TRANS_DEBUG
930 kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
931 bip->bli_orig = NULL;
932 kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
933 bip->bli_logged = NULL;
934#endif /* XFS_TRANS_DEBUG */
935
936#ifdef XFS_BLI_TRACE
937 ktrace_free(bip->bli_trace);
938#endif
939 kmem_zone_free(xfs_buf_item_zone, bip);
940}
941
942
943/*
944 * Add the given log item with its callback to the list of callbacks
945 * to be called when the buffer's I/O completes. If it is not set
946 * already, set the buffer's b_iodone() routine to be
947 * xfs_buf_iodone_callbacks() and link the log item into the list of
948 * items rooted at b_fsprivate. Items are always added as the second
949 * entry in the list if there is a first, because the buf item code
950 * assumes that the buf log item is first.
951 */
952void
953xfs_buf_attach_iodone(
954 xfs_buf_t *bp,
955 void (*cb)(xfs_buf_t *, xfs_log_item_t *),
956 xfs_log_item_t *lip)
957{
958 xfs_log_item_t *head_lip;
959
960 ASSERT(XFS_BUF_ISBUSY(bp));
961 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
962
963 lip->li_cb = cb;
964 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
965 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
966 lip->li_bio_list = head_lip->li_bio_list;
967 head_lip->li_bio_list = lip;
968 } else {
969 XFS_BUF_SET_FSPRIVATE(bp, lip);
970 }
971
972 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
973 (XFS_BUF_IODONE_FUNC(bp) == NULL));
974 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
975}
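/*
 * Illustrative list layout (hypothetical second item): attaching,
 * say, an inode log item to a buffer that already carries a buf log
 * item leaves b_fsprivate -> buf item -> inode item -> older items,
 * with b_iodone set to xfs_buf_iodone_callbacks for the whole chain.
 */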
976
977STATIC void
978xfs_buf_do_callbacks(
979 xfs_buf_t *bp,
980 xfs_log_item_t *lip)
981{
982 xfs_log_item_t *nlip;
983
984 while (lip != NULL) {
985 nlip = lip->li_bio_list;
986 ASSERT(lip->li_cb != NULL);
987 /*
988 * Clear the next pointer so we don't have any
989 * confusion if the item is added to another buf.
990 * Don't touch the log item after calling its
991 * callback, because it could have freed itself.
992 */
993 lip->li_bio_list = NULL;
994 lip->li_cb(bp, lip);
995 lip = nlip;
996 }
997}
998
999/*
1000 * This is the iodone() function for buffers which have had callbacks
1001 * attached to them by xfs_buf_attach_iodone(). It should remove each
1002 * log item from the buffer's list and call the callback of each in turn.
1003 * When done, the buffer's fsprivate field is set to NULL and the buffer
1004 * is unlocked with a call to iodone().
1005 */
1006void
1007xfs_buf_iodone_callbacks(
1008 xfs_buf_t *bp)
1009{
1010 xfs_log_item_t *lip;
1011 static ulong lasttime;
1012 static xfs_buftarg_t *lasttarg;
1013 xfs_mount_t *mp;
1014
1015 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
1016 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1017
1018 if (XFS_BUF_GETERROR(bp) != 0) {
1019 /*
1020 * If we've already decided to shutdown the filesystem
1021 * because of IO errors, there's no point in giving this
1022 * a retry.
1023 */
1024 mp = lip->li_mountp;
1025 if (XFS_FORCED_SHUTDOWN(mp)) {
1026 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1027 XFS_BUF_SUPER_STALE(bp);
1028 xfs_buftrace("BUF_IODONE_CB", bp);
1029 xfs_buf_do_callbacks(bp, lip);
1030 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1031 XFS_BUF_CLR_IODONE_FUNC(bp);
1032
1033 /*
1034			 * XFS_SHUT flag gets set when we go through the
1035 * entire buffer cache and deliberately start
1036 * throwing away delayed write buffers.
1037 * Since there's no biowait done on those,
1038 * we should just brelse them.
1039 */
1040 if (XFS_BUF_ISSHUT(bp)) {
1041 XFS_BUF_UNSHUT(bp);
1042 xfs_buf_relse(bp);
1043 } else {
1044 xfs_biodone(bp);
1045 }
1046
1047 return;
1048 }
1049
1050 if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1051 (time_after(jiffies, (lasttime + 5*HZ)))) {
1052 lasttime = jiffies;
1053 prdev("XFS write error in file system meta-data "
1054 "block 0x%llx in %s",
1055 XFS_BUF_TARGET(bp),
1056 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1057 }
1058 lasttarg = XFS_BUF_TARGET(bp);
1059
1060 if (XFS_BUF_ISASYNC(bp)) {
1061 /*
1062		 * If the write was asynchronous then no one will be
1063		 * looking for the error.  Clear the error state
1064		 * and write the buffer out again as a delayed write.
1065 *
1066 * XXXsup This is OK, so long as we catch these
1067 * before we start the umount; we don't want these
1068 * DELWRI metadata bufs to be hanging around.
1069 */
1070 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1071
1072 if (!(XFS_BUF_ISSTALE(bp))) {
1073 XFS_BUF_DELAYWRITE(bp);
1074 XFS_BUF_DONE(bp);
1075 XFS_BUF_SET_START(bp);
1076 }
1077 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1078 xfs_buftrace("BUF_IODONE ASYNC", bp);
1079 xfs_buf_relse(bp);
1080 } else {
1081 /*
1082 * If the write of the buffer was not asynchronous,
1083 * then we want to make sure to return the error
1084 * to the caller of bwrite(). Because of this we
1085 * cannot clear the B_ERROR state at this point.
1086 * Instead we install a callback function that
1087 * will be called when the buffer is released, and
1088 * that routine will clear the error state and
1089 * set the buffer to be written out again after
1090 * some delay.
1091 */
1092			/* We actually overwrite the existing b_relse
1093			   function at times, but we're going to be
1094			   shutting down anyway. */
1095 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1096 XFS_BUF_DONE(bp);
1097 XFS_BUF_V_IODONESEMA(bp);
1098 }
1099 return;
1100 }
1101#ifdef XFSERRORDEBUG
1102 xfs_buftrace("XFS BUFCB NOERR", bp);
1103#endif
1104 xfs_buf_do_callbacks(bp, lip);
1105 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1106 XFS_BUF_CLR_IODONE_FUNC(bp);
1107 xfs_biodone(bp);
1108}
1109
1110/*
1111 * This is a callback routine attached to a buffer which gets an error
1112 * when being written out synchronously.
1113 */
1114STATIC void
1115xfs_buf_error_relse(
1116 xfs_buf_t *bp)
1117{
1118 xfs_log_item_t *lip;
1119 xfs_mount_t *mp;
1120
1121 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1122 mp = (xfs_mount_t *)lip->li_mountp;
1123 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1124
1125 XFS_BUF_STALE(bp);
1126 XFS_BUF_DONE(bp);
1127 XFS_BUF_UNDELAYWRITE(bp);
1128 XFS_BUF_ERROR(bp,0);
1129 xfs_buftrace("BUF_ERROR_RELSE", bp);
1130 if (! XFS_FORCED_SHUTDOWN(mp))
1131 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1132 /*
1133 * We have to unpin the pinned buffers so do the
1134 * callbacks.
1135 */
1136 xfs_buf_do_callbacks(bp, lip);
1137 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1138 XFS_BUF_CLR_IODONE_FUNC(bp);
1139 XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1140 xfs_buf_relse(bp);
1141}
1142
1143
1144/*
1145 * This is the iodone() function for buffers which have been
1146 * logged. It is called when they are eventually flushed out.
1147 * It should remove the buf item from the AIL, and free the buf item.
1148 * It is called by xfs_buf_iodone_callbacks() above which will take
1149 * care of cleaning up the buffer itself.
1150 */
1151/* ARGSUSED */
1152void
1153xfs_buf_iodone(
1154 xfs_buf_t *bp,
1155 xfs_buf_log_item_t *bip)
1156{
1157 struct xfs_mount *mp;
1158 SPLDECL(s);
1159
1160 ASSERT(bip->bli_buf == bp);
1161
1162 mp = bip->bli_item.li_mountp;
1163
1164 /*
1165 * If we are forcibly shutting down, this may well be
1166 * off the AIL already. That's because we simulate the
1167 * log-committed callbacks to unpin these buffers. Or we may never
1168	 * have put this item on the AIL because the transaction was
1169 * aborted forcibly. xfs_trans_delete_ail() takes care of these.
1170 *
1171 * Either way, AIL is useless if we're forcing a shutdown.
1172 */
1173 AIL_LOCK(mp,s);
1174 /*
1175 * xfs_trans_delete_ail() drops the AIL lock.
1176 */
1177 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
1178
1179#ifdef XFS_TRANS_DEBUG
1180 kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
1181 bip->bli_orig = NULL;
1182 kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
1183 bip->bli_logged = NULL;
1184#endif /* XFS_TRANS_DEBUG */
1185
1186#ifdef XFS_BLI_TRACE
1187 ktrace_free(bip->bli_trace);
1188#endif
1189 kmem_zone_free(xfs_buf_item_zone, bip);
1190}
1191
1192#if defined(XFS_BLI_TRACE)
1193void
1194xfs_buf_item_trace(
1195 char *id,
1196 xfs_buf_log_item_t *bip)
1197{
1198 xfs_buf_t *bp;
1199 ASSERT(bip->bli_trace != NULL);
1200
1201 bp = bip->bli_buf;
1202 ktrace_enter(bip->bli_trace,
1203 (void *)id,
1204 (void *)bip->bli_buf,
1205 (void *)((unsigned long)bip->bli_flags),
1206 (void *)((unsigned long)bip->bli_recur),
1207 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1208 (void *)((unsigned long)
1209 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1210 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1211 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1212 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1213 XFS_BUF_FSPRIVATE(bp, void *),
1214 XFS_BUF_FSPRIVATE2(bp, void *),
1215 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1216 (void *)XFS_BUF_IODONE_FUNC(bp),
1217 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1218 (void *)bip->bli_item.li_desc,
1219 (void *)((unsigned long)bip->bli_item.li_flags));
1220}
1221#endif /* XFS_BLI_TRACE */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000000..5f1b0c9308f6
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_BUF_ITEM_H__
33#define __XFS_BUF_ITEM_H__
34
35/*
36 * This is the structure used to lay out a buf log item in the
37 * log. The data map describes which 128 byte chunks of the buffer
38 * have been logged. This structure works only for buffers that
39 * reside within the first TB of the filesystem. These buffers are
40 * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF.
41 */
42typedef struct xfs_buf_log_format_v1 {
43 unsigned short blf_type; /* buf log item type indicator */
44 unsigned short blf_size; /* size of this item */
45 __int32_t blf_blkno; /* starting blkno of this buf */
46 ushort blf_flags; /* misc state */
47 ushort blf_len; /* number of blocks in this buf */
48 unsigned int blf_map_size; /* size of data bitmap in words */
49 unsigned int blf_data_map[1];/* variable size bitmap of */
50 /* regions of buffer in this item */
51} xfs_buf_log_format_v1_t;
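/*
 * Illustrative arithmetic: blf_blkno above is a signed 32 bit basic
 * block number, and 2^31 basic blocks of 512 bytes is exactly 1TB,
 * which is why this version of the structure only covers buffers in
 * the first TB of the filesystem.
 */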
52
53/*
54 * This is a form of the above structure with a 64 bit blkno field.
55 * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything.
56 */
57typedef struct xfs_buf_log_format_t {
58 unsigned short blf_type; /* buf log item type indicator */
59 unsigned short blf_size; /* size of this item */
60 ushort blf_flags; /* misc state */
61 ushort blf_len; /* number of blocks in this buf */
62 __int64_t blf_blkno; /* starting blkno of this buf */
63 unsigned int blf_map_size; /* size of data bitmap in words */
64 unsigned int blf_data_map[1];/* variable size bitmap of */
65 /* regions of buffer in this item */
66} xfs_buf_log_format_t;
67
68/*
69 * This flag indicates that the buffer contains on disk inodes
70 * and requires special recovery handling.
71 */
72#define XFS_BLI_INODE_BUF 0x1
73/*
74 * This flag indicates that the buffer should not be replayed
75 * during recovery because its blocks are being freed.
76 */
77#define XFS_BLI_CANCEL 0x2
78/*
79 * This flag indicates that the buffer contains on disk
80 * user or group dquots and may require special recovery handling.
81 */
82#define XFS_BLI_UDQUOT_BUF 0x4
83/* #define XFS_BLI_PDQUOT_BUF 0x8 */
84#define XFS_BLI_GDQUOT_BUF 0x10
85
86#define XFS_BLI_CHUNK 128
87#define XFS_BLI_SHIFT 7
88#define BIT_TO_WORD_SHIFT 5
89#define NBWORD (NBBY * sizeof(unsigned int))
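/*
 * Illustrative values (assuming 32 bit ints): NBWORD is 8 * 4 = 32,
 * matching BIT_TO_WORD_SHIFT == 5 (2^5 == 32), just as
 * XFS_BLI_SHIFT == 7 matches XFS_BLI_CHUNK == 128 (2^7 == 128).
 */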
90
91/*
92 * buf log item flags
93 */
94#define XFS_BLI_HOLD 0x01
95#define XFS_BLI_DIRTY 0x02
96#define XFS_BLI_STALE 0x04
97#define XFS_BLI_LOGGED 0x08
98#define XFS_BLI_INODE_ALLOC_BUF 0x10
99#define XFS_BLI_STALE_INODE 0x20
100
101
102#ifdef __KERNEL__
103
104struct xfs_buf;
105struct ktrace;
106struct xfs_mount;
107struct xfs_buf_log_item;
108
109#if defined(XFS_BLI_TRACE)
110#define XFS_BLI_TRACE_SIZE 32
111
112void xfs_buf_item_trace(char *, struct xfs_buf_log_item *);
113#else
114#define xfs_buf_item_trace(id, bip)
115#endif
116
117/*
118 * This is the in core log item structure used to track information
119 * needed to log buffers. It tracks the buffer's lock recursion
120 * count, and which 128 byte chunks of the buffer are dirty.
121 */
122typedef struct xfs_buf_log_item {
123 xfs_log_item_t bli_item; /* common item structure */
124 struct xfs_buf *bli_buf; /* real buffer pointer */
125 unsigned int bli_flags; /* misc flags */
126 unsigned int bli_recur; /* lock recursion count */
127 atomic_t bli_refcount; /* cnt of tp refs */
128#ifdef XFS_BLI_TRACE
129 struct ktrace *bli_trace; /* event trace buf */
130#endif
131#ifdef XFS_TRANS_DEBUG
132 char *bli_orig; /* original buffer copy */
133 char *bli_logged; /* bytes logged (bitmap) */
134#endif
135 xfs_buf_log_format_t bli_format; /* in-log header */
136} xfs_buf_log_item_t;
137
138/*
139 * This structure is used during recovery to record the buf log
140 * items which have been canceled and should not be replayed.
141 */
142typedef struct xfs_buf_cancel {
143 xfs_daddr_t bc_blkno;
144 uint bc_len;
145 int bc_refcount;
146 struct xfs_buf_cancel *bc_next;
147} xfs_buf_cancel_t;
148
149void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
150void xfs_buf_item_relse(struct xfs_buf *);
151void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
152uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
153void xfs_buf_attach_iodone(struct xfs_buf *,
154 void(*)(struct xfs_buf *, xfs_log_item_t *),
155 xfs_log_item_t *);
156void xfs_buf_iodone_callbacks(struct xfs_buf *);
157void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
158
159#ifdef XFS_TRANS_DEBUG
160void
161xfs_buf_item_flush_log_debug(
162 struct xfs_buf *bp,
163 uint first,
164 uint last);
165#else
166#define xfs_buf_item_flush_log_debug(bp, first, last)
167#endif
168
169#endif /* __KERNEL__ */
170
171#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
new file mode 100644
index 000000000000..2deac7303758
--- /dev/null
+++ b/fs/xfs/xfs_cap.h
@@ -0,0 +1,84 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CAP_H__
33#define __XFS_CAP_H__
34
35/*
36 * Capabilities
37 */
38typedef __uint64_t xfs_cap_value_t;
39
40typedef struct xfs_cap_set {
41 xfs_cap_value_t cap_effective; /* use in capability checks */
42 xfs_cap_value_t cap_permitted; /* combined with file attrs */
43 xfs_cap_value_t cap_inheritable;/* pass through exec */
44} xfs_cap_set_t;
45
46/* On-disk XFS extended attribute names */
47#define SGI_CAP_FILE "SGI_CAP_FILE"
48#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1)
49#define SGI_CAP_LINUX "SGI_CAP_LINUX"
50#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1)
51
52/*
53 * For Linux, we take the bitfields directly from capability.h
54 * and no longer attempt to keep this attribute on-disk compatible
55 * with IRIX. Since this attribute is only set on executables,
56 * it just doesn't make much sense to try. We do use a differently
57 * named attribute though, to avoid confusion.
58 */
59
60#ifdef __KERNEL__
61
62#ifdef CONFIG_FS_POSIX_CAP
63
64#include <linux/posix_cap_xattr.h>
65
66struct vnode;
67
68extern int xfs_cap_vhascap(struct vnode *);
69extern int xfs_cap_vset(struct vnode *, void *, size_t);
70extern int xfs_cap_vget(struct vnode *, void *, size_t);
71extern int xfs_cap_vremove(struct vnode *vp);
72
73#define _CAP_EXISTS xfs_cap_vhascap
74
75#else
76#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP)
77#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP)
78#define xfs_cap_vremove(v) (-EOPNOTSUPP)
79#define _CAP_EXISTS (NULL)
80#endif
81
82#endif /* __KERNEL__ */
83
84#endif /* __XFS_CAP_H__ */
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
new file mode 100644
index 000000000000..b3215ffe0be8
--- /dev/null
+++ b/fs/xfs/xfs_clnt.h
@@ -0,0 +1,110 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_CLNT_H__
33#define __XFS_CLNT_H__
34
35/*
36 * XFS arguments structure, constructed from the arguments we
37 * are passed via the mount system call.
38 *
39 * NOTE: The mount system call is handled differently between
40 * Linux and IRIX. In IRIX we worked with a binary data
41 * structure coming in across the syscall interface from user
42 * space (the mount userspace knows about each filesystem type
43 * and the set of valid options for it, and converts the user's
44 * argument string into a binary structure _before_ making the
45 * system call), and the ABI issues that this implies.
46 *
47 * In Linux, we are passed a comma separated set of options;
48 * i.e. a NULL-terminated string of characters. Userspace mount
49 * code does not have any knowledge of mount options expected by
50 * each filesystem type and so each filesystem parses its mount
51 * options in kernel space.
52 *
53 * For the Linux port, we kept this structure pretty much intact
54 * and use it internally (because the existing code groks it).
55 */
56struct xfs_mount_args {
57 int flags; /* flags -> see XFSMNT_... macros below */
58 int logbufs; /* Number of log buffers, -1 to default */
59 int logbufsize; /* Size of log buffers, -1 to default */
60 char fsname[MAXNAMELEN+1]; /* data device name */
61 char rtname[MAXNAMELEN+1]; /* realtime device filename */
62 char logname[MAXNAMELEN+1]; /* journal device filename */
63 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
64 int sunit; /* stripe unit (BBs) */
65 int swidth; /* stripe width (BBs), multiple of sunit */
66 uchar_t iosizelog; /* log2 of the preferred I/O size */
67 int ihashsize; /* inode hash table size (buckets) */
68};
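/*
 * Illustrative example (hypothetical devices and option names): a
 * mount such as
 *	mount -t xfs -o logbufs=8,sunit=512,swidth=4096 /dev/sdb1 /mnt
 * would be parsed in kernel space into logbufs = 8, sunit = 512 and
 * swidth = 4096, with fsname "/dev/sdb1" and mtpt "/mnt".
 */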
69
70/*
71 * XFS mount option flags
72 */
73#define XFSMNT_CHKLOG 0x00000001 /* check log */
74#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
75 * compatible */
76#define XFSMNT_INO64 0x00000004 /* move inode numbers up
77 * past 2^32 */
78#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
79#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
80#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
81 * enforcement */
82#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
83 * enforcement */
84#define XFSMNT_NOATIME 0x00000100 /* don't modify access
85 * times on reads */
86#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
87 * stripe boundaries*/
88#define XFSMNT_RETERR 0x00000400 /* return error to user */
89#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
90 * read-only mount */
91#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
92#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
93#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
94 /* (osyncisdsync is now default) */
95#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
96 * bits of address space */
97#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
98#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
99 * enforcement */
100#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
101#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
102#define XFSMNT_NOLOGFLUSH 0x04000000 /* Don't flush for log blocks */
103#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */
104#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
105 * allocation */
106#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */
107#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
108 * symlink,mkdir,rmdir,mknod */
109
110#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
new file mode 100644
index 000000000000..d7fe28866764
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.c
@@ -0,0 +1,2648 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_alloc.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_attr.h"
60#include "xfs_attr_leaf.h"
61#include "xfs_dir_leaf.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_error.h"
67#include "xfs_bit.h"
68
69/*
70 * xfs_da_btree.c
71 *
72 * Routines to implement directories as Btrees of hashed names.
73 */
74
75/*========================================================================
76 * Function prototypes for the kernel.
77 *========================================================================*/
78
79/*
80 * Routines used for growing the Btree.
81 */
82STATIC int xfs_da_root_split(xfs_da_state_t *state,
83 xfs_da_state_blk_t *existing_root,
84 xfs_da_state_blk_t *new_child);
85STATIC int xfs_da_node_split(xfs_da_state_t *state,
86 xfs_da_state_blk_t *existing_blk,
87 xfs_da_state_blk_t *split_blk,
88 xfs_da_state_blk_t *blk_to_add,
89 int treelevel,
90 int *result);
91STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
92 xfs_da_state_blk_t *node_blk_1,
93 xfs_da_state_blk_t *node_blk_2);
94STATIC void xfs_da_node_add(xfs_da_state_t *state,
95 xfs_da_state_blk_t *old_node_blk,
96 xfs_da_state_blk_t *new_node_blk);
97
98/*
99 * Routines used for shrinking the Btree.
100 */
101STATIC int xfs_da_root_join(xfs_da_state_t *state,
102 xfs_da_state_blk_t *root_blk);
103STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
104STATIC void xfs_da_node_remove(xfs_da_state_t *state,
105 xfs_da_state_blk_t *drop_blk);
106STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
107 xfs_da_state_blk_t *src_node_blk,
108 xfs_da_state_blk_t *dst_node_blk);
109
110/*
111 * Utility routines.
112 */
113STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
114STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
115STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
116
117
118/*========================================================================
119 * Routines used for growing the Btree.
120 *========================================================================*/
121
122/*
123 * Create the initial contents of an intermediate node.
124 */
125int
126xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
127 xfs_dabuf_t **bpp, int whichfork)
128{
129 xfs_da_intnode_t *node;
130 xfs_dabuf_t *bp;
131 int error;
132 xfs_trans_t *tp;
133
134 tp = args->trans;
135 error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
136 if (error)
137 return(error);
138 ASSERT(bp != NULL);
139 node = bp->data;
140 node->hdr.info.forw = 0;
141 node->hdr.info.back = 0;
142 INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
143 node->hdr.info.pad = 0;
144 node->hdr.count = 0;
145 INT_SET(node->hdr.level, ARCH_CONVERT, level);
146
147 xfs_da_log_buf(tp, bp,
148 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
149
150 *bpp = bp;
151 return(0);
152}
153
154/*
155 * Split a leaf node, rebalance, then possibly split
156 * intermediate nodes, rebalance, etc.
157 */
158int /* error */
159xfs_da_split(xfs_da_state_t *state)
160{
161 xfs_da_state_blk_t *oldblk, *newblk, *addblk;
162 xfs_da_intnode_t *node;
163 xfs_dabuf_t *bp;
164 int max, action, error, i;
165
166 /*
167 * Walk back up the tree splitting/inserting/adjusting as necessary.
168 * If we need to insert and there isn't room, split the node, then
169 * decide which fragment to insert the new block from below into.
170 * Note that we may split the root this way, but we need more fixup.
171 */
172 max = state->path.active - 1;
173 ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
174 ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
175 state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
176
177 addblk = &state->path.blk[max]; /* initial dummy value */
178 for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
179 oldblk = &state->path.blk[i];
180 newblk = &state->altpath.blk[i];
181
182 /*
183 * If a leaf node then
184 * Allocate a new leaf node, then rebalance across them.
185 * else if an intermediate node then
186		 * We split the layer below; must we split this node too?
187 */
188 switch (oldblk->magic) {
189 case XFS_ATTR_LEAF_MAGIC:
190#ifndef __KERNEL__
191 return(ENOTTY);
192#else
193 error = xfs_attr_leaf_split(state, oldblk, newblk);
194 if ((error != 0) && (error != ENOSPC)) {
195 return(error); /* GROT: attr is inconsistent */
196 }
197 if (!error) {
198 addblk = newblk;
199 break;
200 }
201 /*
202 * Entry wouldn't fit, split the leaf again.
203 */
204 state->extravalid = 1;
205 if (state->inleaf) {
206 state->extraafter = 0; /* before newblk */
207 error = xfs_attr_leaf_split(state, oldblk,
208 &state->extrablk);
209 } else {
210 state->extraafter = 1; /* after newblk */
211 error = xfs_attr_leaf_split(state, newblk,
212 &state->extrablk);
213 }
214 if (error)
215 return(error); /* GROT: attr inconsistent */
216 addblk = newblk;
217 break;
218#endif
219 case XFS_DIR_LEAF_MAGIC:
220 ASSERT(XFS_DIR_IS_V1(state->mp));
221 error = xfs_dir_leaf_split(state, oldblk, newblk);
222 if ((error != 0) && (error != ENOSPC)) {
223 return(error); /* GROT: dir is inconsistent */
224 }
225 if (!error) {
226 addblk = newblk;
227 break;
228 }
229 /*
230 * Entry wouldn't fit, split the leaf again.
231 */
232 state->extravalid = 1;
233 if (state->inleaf) {
234 state->extraafter = 0; /* before newblk */
235 error = xfs_dir_leaf_split(state, oldblk,
236 &state->extrablk);
237 if (error)
238 return(error); /* GROT: dir incon. */
239 addblk = newblk;
240 } else {
241 state->extraafter = 1; /* after newblk */
242 error = xfs_dir_leaf_split(state, newblk,
243 &state->extrablk);
244 if (error)
245 return(error); /* GROT: dir incon. */
246 addblk = newblk;
247 }
248 break;
249 case XFS_DIR2_LEAFN_MAGIC:
250 ASSERT(XFS_DIR_IS_V2(state->mp));
251 error = xfs_dir2_leafn_split(state, oldblk, newblk);
252 if (error)
253 return error;
254 addblk = newblk;
255 break;
256 case XFS_DA_NODE_MAGIC:
257 error = xfs_da_node_split(state, oldblk, newblk, addblk,
258 max - i, &action);
259 xfs_da_buf_done(addblk->bp);
260 addblk->bp = NULL;
261 if (error)
262 return(error); /* GROT: dir is inconsistent */
263 /*
264			 * Record the newly split block for the next time through?
265 */
266 if (action)
267 addblk = newblk;
268 else
269 addblk = NULL;
270 break;
271 }
272
273 /*
274 * Update the btree to show the new hashval for this child.
275 */
276 xfs_da_fixhashpath(state, &state->path);
277 /*
278 * If we won't need this block again, it's getting dropped
279 * from the active path by the loop control, so we need
280 * to mark it done now.
281 */
282 if (i > 0 || !addblk)
283 xfs_da_buf_done(oldblk->bp);
284 }
285 if (!addblk)
286 return(0);
287
288 /*
289 * Split the root node.
290 */
291 ASSERT(state->path.active == 0);
292 oldblk = &state->path.blk[0];
293 error = xfs_da_root_split(state, oldblk, addblk);
294 if (error) {
295 xfs_da_buf_done(oldblk->bp);
296 xfs_da_buf_done(addblk->bp);
297 addblk->bp = NULL;
298 return(error); /* GROT: dir is inconsistent */
299 }
300
301 /*
302 * Update pointers to the node which used to be block 0 and
303 * just got bumped because of the addition of a new root node.
304 * There might be three blocks involved if a double split occurred,
305 * and the original block 0 could be at any position in the list.
306 */
307
308 node = oldblk->bp->data;
309 if (node->hdr.info.forw) {
310 if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
311 bp = addblk->bp;
312 } else {
313 ASSERT(state->extravalid);
314 bp = state->extrablk.bp;
315 }
316 node = bp->data;
317 INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
318 xfs_da_log_buf(state->args->trans, bp,
319 XFS_DA_LOGRANGE(node, &node->hdr.info,
320 sizeof(node->hdr.info)));
321 }
322 node = oldblk->bp->data;
323 if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {
324 if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
325 bp = addblk->bp;
326 } else {
327 ASSERT(state->extravalid);
328 bp = state->extrablk.bp;
329 }
330 node = bp->data;
331 INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
332 xfs_da_log_buf(state->args->trans, bp,
333 XFS_DA_LOGRANGE(node, &node->hdr.info,
334 sizeof(node->hdr.info)));
335 }
336 xfs_da_buf_done(oldblk->bp);
337 xfs_da_buf_done(addblk->bp);
338 addblk->bp = NULL;
339 return(0);
340}
341
342/*
343 * Split the root. We have to create a new root and point to the two
344 * parts (the split old root) that we just created. Copy block zero to
345 * the EOF, extending the inode in process.
346 */
347STATIC int /* error */
348xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
349 xfs_da_state_blk_t *blk2)
350{
351 xfs_da_intnode_t *node, *oldroot;
352 xfs_da_args_t *args;
353 xfs_dablk_t blkno;
354 xfs_dabuf_t *bp;
355 int error, size;
356 xfs_inode_t *dp;
357 xfs_trans_t *tp;
358 xfs_mount_t *mp;
359 xfs_dir2_leaf_t *leaf;
360
361 /*
362 * Copy the existing (incorrect) block from the root node position
363 * to a free space somewhere.
364 */
365 args = state->args;
366 ASSERT(args != NULL);
367 error = xfs_da_grow_inode(args, &blkno);
368 if (error)
369 return(error);
370 dp = args->dp;
371 tp = args->trans;
372 mp = state->mp;
373 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
374 if (error)
375 return(error);
376 ASSERT(bp != NULL);
377 node = bp->data;
378 oldroot = blk1->bp->data;
379 if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
380 size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] -
381 (char *)oldroot);
382 } else {
383 ASSERT(XFS_DIR_IS_V2(mp));
384 ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
385 leaf = (xfs_dir2_leaf_t *)oldroot;
386 size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] -
387 (char *)leaf);
388 }
389 memcpy(node, oldroot, size);
390 xfs_da_log_buf(tp, bp, 0, size - 1);
391 xfs_da_buf_done(blk1->bp);
392 blk1->bp = bp;
393 blk1->blkno = blkno;
394
395 /*
396 * Set up the new root node.
397 */
398 error = xfs_da_node_create(args,
399 args->whichfork == XFS_DATA_FORK &&
400 XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
401 INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork);
402 if (error)
403 return(error);
404 node = bp->data;
405 INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval);
406 INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno);
407 INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval);
408 INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno);
409 INT_SET(node->hdr.count, ARCH_CONVERT, 2);
410
411#ifdef DEBUG
412 if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
413 ASSERT(blk1->blkno >= mp->m_dirleafblk &&
414 blk1->blkno < mp->m_dirfreeblk);
415 ASSERT(blk2->blkno >= mp->m_dirleafblk &&
416 blk2->blkno < mp->m_dirfreeblk);
417 }
418#endif
419
420 /* Header is already logged by xfs_da_node_create */
421 xfs_da_log_buf(tp, bp,
422 XFS_DA_LOGRANGE(node, node->btree,
423 sizeof(xfs_da_node_entry_t) * 2));
424 xfs_da_buf_done(bp);
425
426 return(0);
427}
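/*
 * Illustrative sketch (not from the original source): the shape of a
 * root split.  The old root's contents move to a newly allocated block
 * so the root itself stays at block 0 (or m_dirleafblk for v2 dirs):
 *
 *        before                    after
 *        ------                    -----
 *        [root: blk 0]             [new root: blk 0]
 *         /       \                 /            \
 *       ...       ...         [old root copy]   [blk2]
 *                               /      \          ...
 *                             ...      ...
 *
 * The new root holds exactly two entries: the hashval/blkno pairs for
 * the copied old root (blk1) and its split sibling (blk2).
 */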
428
429/*
430 * Split the node, rebalance, then add the new entry.
431 */
432STATIC int /* error */
433xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
434 xfs_da_state_blk_t *newblk,
435 xfs_da_state_blk_t *addblk,
436 int treelevel, int *result)
437{
438 xfs_da_intnode_t *node;
439 xfs_dablk_t blkno;
440 int newcount, error;
441 int useextra;
442
443 node = oldblk->bp->data;
444 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
445
446 /*
447 * With V2 the extra block is data or freespace, so it is not inserted here.
448 */
449 useextra = state->extravalid && XFS_DIR_IS_V1(state->mp);
450 newcount = 1 + useextra;
451 /*
452 * Do we have to split the node?
453 */
454 if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > state->node_ents) {
455 /*
456 * Allocate a new node, add to the doubly linked chain of
457 * nodes, then move some of our excess entries into it.
458 */
459 error = xfs_da_grow_inode(state->args, &blkno);
460 if (error)
461 return(error); /* GROT: dir is inconsistent */
462
463 error = xfs_da_node_create(state->args, blkno, treelevel,
464 &newblk->bp, state->args->whichfork);
465 if (error)
466 return(error); /* GROT: dir is inconsistent */
467 newblk->blkno = blkno;
468 newblk->magic = XFS_DA_NODE_MAGIC;
469 xfs_da_node_rebalance(state, oldblk, newblk);
470 error = xfs_da_blk_link(state, oldblk, newblk);
471 if (error)
472 return(error);
473 *result = 1;
474 } else {
475 *result = 0;
476 }
477
478 /*
479 * Insert the new entry(s) into the correct block
480 * (updating last hashval in the process).
481 *
482 * xfs_da_node_add() inserts BEFORE the given index,
483 * and as a result of using node_lookup_int() we always
484 * point to a valid entry (not after one), but a split
485 * operation always results in a new block whose hashvals
486 * FOLLOW the current block.
487 *
488 * If we had a double-split op below us, then add the extra block too.
489 */
490 node = oldblk->bp->data;
491 if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) {
492 oldblk->index++;
493 xfs_da_node_add(state, oldblk, addblk);
494 if (useextra) {
495 if (state->extraafter)
496 oldblk->index++;
497 xfs_da_node_add(state, oldblk, &state->extrablk);
498 state->extravalid = 0;
499 }
500 } else {
501 newblk->index++;
502 xfs_da_node_add(state, newblk, addblk);
503 if (useextra) {
504 if (state->extraafter)
505 newblk->index++;
506 xfs_da_node_add(state, newblk, &state->extrablk);
507 state->extravalid = 0;
508 }
509 }
510
511 return(0);
512}
513
514/*
515 * Balance the btree elements between two intermediate nodes,
516 * usually one full and one empty.
517 *
518 * NOTE: if blk2 is empty, then it will get the upper half of blk1.
519 */
520STATIC void
521xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
522 xfs_da_state_blk_t *blk2)
523{
524 xfs_da_intnode_t *node1, *node2, *tmpnode;
525 xfs_da_node_entry_t *btree_s, *btree_d;
526 int count, tmp;
527 xfs_trans_t *tp;
528
529 node1 = blk1->bp->data;
530 node2 = blk2->bp->data;
531 /*
532 * Figure out how many entries need to move, and in which direction.
533 * Swap the nodes around if that makes it simpler.
534 */
535 if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
536 ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
537 (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
538 INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
539 tmpnode = node1;
540 node1 = node2;
541 node2 = tmpnode;
542 }
543 ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
544 ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
545 count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2;
546 if (count == 0)
547 return;
548 tp = state->args->trans;
549 /*
550 * Two cases: high-to-low and low-to-high.
551 */
552 if (count > 0) {
553 /*
554 * Move elements in node2 up to make a hole.
555 */
556 if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) {
557 tmp *= (uint)sizeof(xfs_da_node_entry_t);
558 btree_s = &node2->btree[0];
559 btree_d = &node2->btree[count];
560 memmove(btree_d, btree_s, tmp);
561 }
562
563 /*
564 * Move the req'd B-tree elements from high in node1 to
565 * low in node2.
566 */
567 INT_MOD(node2->hdr.count, ARCH_CONVERT, count);
568 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
569 btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count];
570 btree_d = &node2->btree[0];
571 memcpy(btree_d, btree_s, tmp);
572 INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count));
573
574 } else {
575 /*
576 * Move the req'd B-tree elements from low in node2 to
577 * high in node1.
578 */
579 count = -count;
580 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
581 btree_s = &node2->btree[0];
582 btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)];
583 memcpy(btree_d, btree_s, tmp);
584 INT_MOD(node1->hdr.count, ARCH_CONVERT, count);
585 xfs_da_log_buf(tp, blk1->bp,
586 XFS_DA_LOGRANGE(node1, btree_d, tmp));
587
588 /*
589 * Move elements in node2 down to fill the hole.
590 */
591 tmp = INT_GET(node2->hdr.count, ARCH_CONVERT) - count;
592 tmp *= (uint)sizeof(xfs_da_node_entry_t);
593 btree_s = &node2->btree[count];
594 btree_d = &node2->btree[0];
595 memmove(btree_d, btree_s, tmp);
596 INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count));
597 }
598
599 /*
600 * Log header of node 1 and all current bits of node 2.
601 */
602 xfs_da_log_buf(tp, blk1->bp,
603 XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
604 xfs_da_log_buf(tp, blk2->bp,
605 XFS_DA_LOGRANGE(node2, &node2->hdr,
606 sizeof(node2->hdr) +
607 sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT)));
608
609 /*
610 * Record the last hashval from each block for upward propagation.
611 * (note: don't use the swapped node pointers)
612 */
613 node1 = blk1->bp->data;
614 node2 = blk2->bp->data;
615 blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
616 blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
617
618 /*
619 * Adjust the expected index for insertion.
620 */
621 if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) {
622 blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT);
623 blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1; /* make it invalid */
624 }
625}
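/*
 * Worked example (illustrative, not in the original source): if node1
 * holds 10 entries and node2 holds 2, then count = (10 - 2) / 2 = 4,
 * so the 4 highest-hashval entries of node1 are copied to the front of
 * node2, leaving 6 and 6.  With an empty node2, count = 10 / 2 = 5 and
 * node2 receives the upper half of node1, matching the NOTE above.
 */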
626
627/*
628 * Add a new entry to an intermediate node.
629 */
630STATIC void
631xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
632 xfs_da_state_blk_t *newblk)
633{
634 xfs_da_intnode_t *node;
635 xfs_da_node_entry_t *btree;
636 int tmp;
637 xfs_mount_t *mp;
638
639 node = oldblk->bp->data;
640 mp = state->mp;
641 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
642 ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)));
643 ASSERT(newblk->blkno != 0);
644 if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
645 ASSERT(newblk->blkno >= mp->m_dirleafblk &&
646 newblk->blkno < mp->m_dirfreeblk);
647
648 /*
649 * We may need to make some room before we insert the new node.
650 */
651 tmp = 0;
652 btree = &node->btree[ oldblk->index ];
653 if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) {
654 tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree);
655 memmove(btree + 1, btree, tmp);
656 }
657 INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval);
658 INT_SET(btree->before, ARCH_CONVERT, newblk->blkno);
659 xfs_da_log_buf(state->args->trans, oldblk->bp,
660 XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
661 INT_MOD(node->hdr.count, ARCH_CONVERT, +1);
662 xfs_da_log_buf(state->args->trans, oldblk->bp,
663 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
664
665 /*
666 * Copy the last hash value from the oldblk to propagate upwards.
667 */
668 oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
669}
670
671/*========================================================================
672 * Routines used for shrinking the Btree.
673 *========================================================================*/
674
675/*
676 * Deallocate an empty leaf node, remove it from its parent,
677 * possibly deallocating that block, etc...
678 */
679int
680xfs_da_join(xfs_da_state_t *state)
681{
682 xfs_da_state_blk_t *drop_blk, *save_blk;
683 int action, error;
684
685 action = 0;
686 drop_blk = &state->path.blk[ state->path.active-1 ];
687 save_blk = &state->altpath.blk[ state->path.active-1 ];
688 ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
689 ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
690 drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
691
692 /*
693 * Walk back up the tree joining/deallocating as necessary.
694 * When we stop dropping blocks, break out.
695 */
696 for ( ; state->path.active >= 2; drop_blk--, save_blk--,
697 state->path.active--) {
698 /*
699 * See if we can combine the block with a neighbor.
700 * (action == 0) => no options, just leave
701 * (action == 1) => coalesce, then unlink
702 * (action == 2) => block empty, unlink it
703 */
704 switch (drop_blk->magic) {
705 case XFS_ATTR_LEAF_MAGIC:
706#ifndef __KERNEL__
707 error = ENOTTY;
708#else
709 error = xfs_attr_leaf_toosmall(state, &action);
710#endif
711 if (error)
712 return(error);
713 if (action == 0)
714 return(0);
715#ifdef __KERNEL__
716 xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
717#endif
718 break;
719 case XFS_DIR_LEAF_MAGIC:
720 ASSERT(XFS_DIR_IS_V1(state->mp));
721 error = xfs_dir_leaf_toosmall(state, &action);
722 if (error)
723 return(error);
724 if (action == 0)
725 return(0);
726 xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
727 break;
728 case XFS_DIR2_LEAFN_MAGIC:
729 ASSERT(XFS_DIR_IS_V2(state->mp));
730 error = xfs_dir2_leafn_toosmall(state, &action);
731 if (error)
732 return error;
733 if (action == 0)
734 return 0;
735 xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
736 break;
737 case XFS_DA_NODE_MAGIC:
738 /*
739 * Remove the offending node, fixup hashvals,
740 * check for a toosmall neighbor.
741 */
742 xfs_da_node_remove(state, drop_blk);
743 xfs_da_fixhashpath(state, &state->path);
744 error = xfs_da_node_toosmall(state, &action);
745 if (error)
746 return(error);
747 if (action == 0)
748 return 0;
749 xfs_da_node_unbalance(state, drop_blk, save_blk);
750 break;
751 }
752 xfs_da_fixhashpath(state, &state->altpath);
753 error = xfs_da_blk_unlink(state, drop_blk, save_blk);
754 xfs_da_state_kill_altpath(state);
755 if (error)
756 return(error);
757 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
758 drop_blk->bp);
759 drop_blk->bp = NULL;
760 if (error)
761 return(error);
762 }
763 /*
764 * We joined all the way to the top. If it turns out that
765 * we only have one entry in the root, make the child block
766 * the new root.
767 */
768 xfs_da_node_remove(state, drop_blk);
769 xfs_da_fixhashpath(state, &state->path);
770 error = xfs_da_root_join(state, &state->path.blk[0]);
771 return(error);
772}
773
774/*
775 * We have only one entry in the root. Copy the only remaining child of
776 * the old root to block 0 as the new root node.
777 */
778STATIC int
779xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
780{
781 xfs_da_intnode_t *oldroot;
782 /* REFERENCED */
783 xfs_da_blkinfo_t *blkinfo;
784 xfs_da_args_t *args;
785 xfs_dablk_t child;
786 xfs_dabuf_t *bp;
787 int error;
788
789 args = state->args;
790 ASSERT(args != NULL);
791 ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
792 oldroot = root_blk->bp->data;
793 ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
794 ASSERT(!oldroot->hdr.info.forw);
795 ASSERT(!oldroot->hdr.info.back);
796
797 /*
798 * If the root has more than one child, then don't do anything.
799 */
800 if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1)
801 return(0);
802
803 /*
804 * Read in the (only) child block, then copy those bytes into
805 * the root block's buffer and free the original child block.
806 */
807 child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT);
808 ASSERT(child != 0);
809 error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
810 args->whichfork);
811 if (error)
812 return(error);
813 ASSERT(bp != NULL);
814 blkinfo = bp->data;
815 if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) {
816 ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
817 INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
818 } else {
819 ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
820 }
821 ASSERT(!blkinfo->forw);
822 ASSERT(!blkinfo->back);
823 memcpy(root_blk->bp->data, bp->data, state->blocksize);
824 xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
825 error = xfs_da_shrink_inode(args, child, bp);
826 return(error);
827}
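/*
 * Illustrative note (not from the original source): this is the
 * inverse of xfs_da_root_split() above.  Once the root is down to a
 * single "before" pointer, the child's bytes are copied wholesale over
 * block 0 and the child block is handed back via xfs_da_shrink_inode(),
 * shrinking the tree by one level.
 */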
828
829/*
830 * Check a node block and its neighbors to see if the block should be
831 * collapsed into one or the other neighbor. Always keep the block
832 * with the smaller block number.
833 * If the current block is over 50% full, don't try to join it; *action is 0.
834 * If the block is empty, fill in the state structure and set *action to 2.
835 * If it can be collapsed, fill in the state structure and set *action to 1.
836 * If nothing can be done, set *action to 0.
837 */
838STATIC int
839xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
840{
841 xfs_da_intnode_t *node;
842 xfs_da_state_blk_t *blk;
843 xfs_da_blkinfo_t *info;
844 int count, forward, error, retval, i;
845 xfs_dablk_t blkno;
846 xfs_dabuf_t *bp;
847
848 /*
849 * Check for the degenerate case of the block being over 50% full.
850 * If so, it's not worth even looking to see if we might be able
851 * to coalesce with a sibling.
852 */
853 blk = &state->path.blk[ state->path.active-1 ];
854 info = blk->bp->data;
855 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
856 node = (xfs_da_intnode_t *)info;
857 count = INT_GET(node->hdr.count, ARCH_CONVERT);
858 if (count > (state->node_ents >> 1)) {
859		*action = 0;	/* blk over 50% full, don't try to join */
860		return(0);
861 }
862
863 /*
864 * Check for the degenerate case of the block being empty.
865 * If the block is empty, we'll simply delete it, no need to
866 * coalesce it with a sibling block.  We choose (arbitrarily)
867 * to merge with the forward block unless it is NULL.
868 */
869 if (count == 0) {
870 /*
871 * Make altpath point to the block we want to keep and
872 * path point to the block we want to drop (this one).
873 */
874 forward = info->forw;
875 memcpy(&state->altpath, &state->path, sizeof(state->path));
876 error = xfs_da_path_shift(state, &state->altpath, forward,
877 0, &retval);
878 if (error)
879 return(error);
880 if (retval) {
881 *action = 0;
882 } else {
883 *action = 2;
884 }
885 return(0);
886 }
887
888 /*
889 * Examine each sibling block to see if we can coalesce with
890 * at least 25% free space to spare. We need to figure out
891 * whether to merge with the forward or the backward block.
892 * We prefer coalescing with the lower numbered sibling so as
893 * to shrink a directory over time.
894 */
895 /* start with smaller blk num */
896 forward = (INT_GET(info->forw, ARCH_CONVERT)
897 < INT_GET(info->back, ARCH_CONVERT));
898 for (i = 0; i < 2; forward = !forward, i++) {
899 if (forward)
900 blkno = INT_GET(info->forw, ARCH_CONVERT);
901 else
902 blkno = INT_GET(info->back, ARCH_CONVERT);
903 if (blkno == 0)
904 continue;
905 error = xfs_da_read_buf(state->args->trans, state->args->dp,
906 blkno, -1, &bp, state->args->whichfork);
907 if (error)
908 return(error);
909 ASSERT(bp != NULL);
910
911 node = (xfs_da_intnode_t *)info;
912 count = state->node_ents;
913 count -= state->node_ents >> 2;
914 count -= INT_GET(node->hdr.count, ARCH_CONVERT);
915 node = bp->data;
916 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
917 count -= INT_GET(node->hdr.count, ARCH_CONVERT);
918 xfs_da_brelse(state->args->trans, bp);
919 if (count >= 0)
920 break; /* fits with at least 25% to spare */
921 }
922 if (i >= 2) {
923 *action = 0;
924 return(0);
925 }
926
927 /*
928 * Make altpath point to the block we want to keep (the lower
929 * numbered block) and path point to the block we want to drop.
930 */
931 memcpy(&state->altpath, &state->path, sizeof(state->path));
932 if (blkno < blk->blkno) {
933 error = xfs_da_path_shift(state, &state->altpath, forward,
934 0, &retval);
935 if (error) {
936 return(error);
937 }
938 if (retval) {
939 *action = 0;
940 return(0);
941 }
942 } else {
943 error = xfs_da_path_shift(state, &state->path, forward,
944 0, &retval);
945 if (error) {
946 return(error);
947 }
948 if (retval) {
949 *action = 0;
950 return(0);
951 }
952 }
953 *action = 1;
954 return(0);
955}
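/*
 * Worked example of the 25%-slack test above (illustrative, not in the
 * original source): with state->node_ents == 64, the merge budget is
 * 64 - (64 >> 2) = 48 entries.  A block holding 20 entries can
 * coalesce with a sibling holding up to 28, since 20 + 28 <= 48 leaves
 * at least a quarter of the combined block free; a sibling with 29 or
 * more makes count go negative and the merge is skipped.
 */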
956
957/*
958 * Walk back up the tree adjusting hash values as necessary;
959 * when we stop making changes, return.
960 */
961void
962xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
963{
964 xfs_da_state_blk_t *blk;
965 xfs_da_intnode_t *node;
966 xfs_da_node_entry_t *btree;
967 xfs_dahash_t lasthash=0;
968 int level, count;
969
970 level = path->active-1;
971 blk = &path->blk[ level ];
972 switch (blk->magic) {
973#ifdef __KERNEL__
974 case XFS_ATTR_LEAF_MAGIC:
975 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
976 if (count == 0)
977 return;
978 break;
979#endif
980 case XFS_DIR_LEAF_MAGIC:
981 ASSERT(XFS_DIR_IS_V1(state->mp));
982 lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
983 if (count == 0)
984 return;
985 break;
986 case XFS_DIR2_LEAFN_MAGIC:
987 ASSERT(XFS_DIR_IS_V2(state->mp));
988 lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
989 if (count == 0)
990 return;
991 break;
992 case XFS_DA_NODE_MAGIC:
993 lasthash = xfs_da_node_lasthash(blk->bp, &count);
994 if (count == 0)
995 return;
996 break;
997 }
998 for (blk--, level--; level >= 0; blk--, level--) {
999 node = blk->bp->data;
1000 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1001 btree = &node->btree[ blk->index ];
1002 if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash)
1003 break;
1004 blk->hashval = lasthash;
1005 INT_SET(btree->hashval, ARCH_CONVERT, lasthash);
1006 xfs_da_log_buf(state->args->trans, blk->bp,
1007 XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
1008
1009 lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1010 }
1011}
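/*
 * Illustrative sketch (not from the original source): hashval
 * propagation stops as soon as a parent entry already carries the
 * right value.  E.g. if a leaf's last hashval drops from 0x90 to 0x80,
 * the parent entry pointing at it is rewritten to 0x80; if that entry
 * was not the last one in the parent block, the grandparent still sees
 * the parent's unchanged last hashval and the loop breaks early.
 */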
1012
1013/*
1014 * Remove an entry from an intermediate node.
1015 */
1016STATIC void
1017xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
1018{
1019 xfs_da_intnode_t *node;
1020 xfs_da_node_entry_t *btree;
1021 int tmp;
1022
1023 node = drop_blk->bp->data;
1024 ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT));
1025 ASSERT(drop_blk->index >= 0);
1026
1027 /*
1028 * Copy over the offending entry, or just zero it out.
1029 */
1030 btree = &node->btree[drop_blk->index];
1031 if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
1032 tmp = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1;
1033 tmp *= (uint)sizeof(xfs_da_node_entry_t);
1034 memmove(btree, btree + 1, tmp);
1035 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1036 XFS_DA_LOGRANGE(node, btree, tmp));
1037 btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ];
1038 }
1039 memset((char *)btree, 0, sizeof(xfs_da_node_entry_t));
1040 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1041 XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
1042 INT_MOD(node->hdr.count, ARCH_CONVERT, -1);
1043 xfs_da_log_buf(state->args->trans, drop_blk->bp,
1044 XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
1045
1046 /*
1047 * Copy the last hash value from the block to propagate upwards.
1048 */
1049 btree--;
1050 drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT);
1051}
1052
1053/*
1054 * Unbalance the btree elements between two intermediate nodes:
1055 * move all of the elements from one node into the other.
1056 */
1057STATIC void
1058xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1059 xfs_da_state_blk_t *save_blk)
1060{
1061 xfs_da_intnode_t *drop_node, *save_node;
1062 xfs_da_node_entry_t *btree;
1063 int tmp;
1064 xfs_trans_t *tp;
1065
1066 drop_node = drop_blk->bp->data;
1067 save_node = save_blk->bp->data;
1068 ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1069 ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1070 tp = state->args->trans;
1071
1072 /*
1073 * If the dying block has lower hashvals, then move all the
1074 * elements in the remaining block up to make a hole.
1075 */
1076 if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) ||
1077 (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1078 INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))
1079 {
1080 btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ];
1081 tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
1082 memmove(btree, &save_node->btree[0], tmp);
1083 btree = &save_node->btree[0];
1084 xfs_da_log_buf(tp, save_blk->bp,
1085 XFS_DA_LOGRANGE(save_node, btree,
1086 (INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) *
1087 sizeof(xfs_da_node_entry_t)));
1088 } else {
1089 btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ];
1090 xfs_da_log_buf(tp, save_blk->bp,
1091 XFS_DA_LOGRANGE(save_node, btree,
1092 INT_GET(drop_node->hdr.count, ARCH_CONVERT) *
1093 sizeof(xfs_da_node_entry_t)));
1094 }
1095
1096 /*
1097 * Move all the B-tree elements from drop_blk to save_blk.
1098 */
1099 tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
1100 memcpy(btree, &drop_node->btree[0], tmp);
1101 INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT));
1102
1103 xfs_da_log_buf(tp, save_blk->bp,
1104 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1105 sizeof(save_node->hdr)));
1106
1107 /*
1108 * Save the last hashval in the remaining block for upward propagation.
1109 */
1110 save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1111}
1112
1113/*========================================================================
1114 * Routines used for finding things in the Btree.
1115 *========================================================================*/
1116
1117/*
1118 * Walk down the Btree looking for a particular filename, filling
1119 * in the state structure as we go.
1120 *
1121 * We will set the state structure to point to each of the elements
1122 * in each of the nodes where either the hashval is or should be.
1123 *
1124 * We support duplicate hashvals, so for each entry in the current
1125 * node that could contain the desired hashval, descend. This is a
1126 * pruned depth-first tree search.
1127 */
1128int /* error */
1129xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
1130{
1131 xfs_da_state_blk_t *blk;
1132 xfs_da_blkinfo_t *curr;
1133 xfs_da_intnode_t *node;
1134 xfs_da_node_entry_t *btree;
1135 xfs_dablk_t blkno;
1136 int probe, span, max, error, retval;
1137 xfs_dahash_t hashval;
1138 xfs_da_args_t *args;
1139
1140 args = state->args;
1141
1142 /*
1143 * Descend thru the B-tree searching each level for the right
1144 * node to use, until the right hashval is found.
1145 */
1146 if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
1147 blkno = state->mp->m_dirleafblk;
1148 else
1149 blkno = 0;
1150 for (blk = &state->path.blk[0], state->path.active = 1;
1151 state->path.active <= XFS_DA_NODE_MAXDEPTH;
1152 blk++, state->path.active++) {
1153 /*
1154 * Read the next node down in the tree.
1155 */
1156 blk->blkno = blkno;
1157 error = xfs_da_read_buf(args->trans, args->dp, blkno,
1158 -1, &blk->bp, args->whichfork);
1159 if (error) {
1160 blk->blkno = 0;
1161 state->path.active--;
1162 return(error);
1163 }
1164 curr = blk->bp->data;
1165 ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
1166 INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1167 INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1168
1169 /*
1170 * Search an intermediate node for a match.
1171 */
1172 blk->magic = INT_GET(curr->magic, ARCH_CONVERT);
1173 if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
1174 node = blk->bp->data;
1175 blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1176
1177 /*
1178 * Binary search. (note: small blocks will skip loop)
1179 */
1180 max = INT_GET(node->hdr.count, ARCH_CONVERT);
1181 probe = span = max / 2;
1182 hashval = args->hashval;
1183 for (btree = &node->btree[probe]; span > 4;
1184 btree = &node->btree[probe]) {
1185 span /= 2;
1186 if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)
1187 probe += span;
1188 else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval)
1189 probe -= span;
1190 else
1191 break;
1192 }
1193 ASSERT((probe >= 0) && (probe < max));
1194 ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval));
1195
1196 /*
1197		 * Since we may have duplicate hashvals, find the first
1198 * matching hashval in the node.
1199 */
1200 while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) {
1201 btree--;
1202 probe--;
1203 }
1204 while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) {
1205 btree++;
1206 probe++;
1207 }
1208
1209 /*
1210 * Pick the right block to descend on.
1211 */
1212 if (probe == max) {
1213 blk->index = max-1;
1214 blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT);
1215 } else {
1216 blk->index = probe;
1217 blkno = INT_GET(btree->before, ARCH_CONVERT);
1218 }
1219 }
1220#ifdef __KERNEL__
1221 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
1222 blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1223 break;
1224 }
1225#endif
1226 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
1227 blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
1228 break;
1229 }
1230 else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
1231 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
1232 break;
1233 }
1234 }
1235
1236 /*
1237 * A leaf block that ends in the hashval that we are interested in
1238 * (final hashval == search hashval) means that the next block may
1239	 * contain more entries with the same hashval; shift upward to the
1240 * next leaf and keep searching.
1241 */
1242 for (;;) {
1243 if (blk->magic == XFS_DIR_LEAF_MAGIC) {
1244 ASSERT(XFS_DIR_IS_V1(state->mp));
1245 retval = xfs_dir_leaf_lookup_int(blk->bp, args,
1246 &blk->index);
1247 } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1248 ASSERT(XFS_DIR_IS_V2(state->mp));
1249 retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1250 &blk->index, state);
1251 }
1252#ifdef __KERNEL__
1253 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1254 retval = xfs_attr_leaf_lookup_int(blk->bp, args);
1255 blk->index = args->index;
1256 args->blkno = blk->blkno;
1257 }
1258#endif
1259 if (((retval == ENOENT) || (retval == ENOATTR)) &&
1260 (blk->hashval == args->hashval)) {
1261 error = xfs_da_path_shift(state, &state->path, 1, 1,
1262 &retval);
1263 if (error)
1264 return(error);
1265 if (retval == 0) {
1266 continue;
1267 }
1268#ifdef __KERNEL__
1269 else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1270 /* path_shift() gives ENOENT */
1271 retval = XFS_ERROR(ENOATTR);
1272 }
1273#endif
1274 }
1275 break;
1276 }
1277 *result = retval;
1278 return(0);
1279}
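#if 0	/* illustrative only -- a sketch, not part of the original file */
/*
 * Minimal userspace model (assumption: a plain sorted uint array
 * standing in for node->btree[].hashval) of the probe loop above:
 * binary-search until the span is <= 4, then walk linearly to the
 * FIRST entry with a hashval >= the search value, so runs of duplicate
 * hashvals are never skipped.
 */
static int first_at_or_above(const unsigned int *hashes, int max,
			     unsigned int want)
{
	int probe = max / 2;		/* start in the middle */
	int span = max / 2;

	while (span > 4) {
		span /= 2;
		if (hashes[probe] < want)
			probe += span;
		else if (hashes[probe] > want)
			probe -= span;
		else
			break;
	}
	/* back up over duplicates, then forward past smaller values */
	while (probe > 0 && hashes[probe] >= want)
		probe--;
	while (probe < max && hashes[probe] < want)
		probe++;
	return probe;			/* == max if every entry < want */
}
#endif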
1280
1281/*========================================================================
1282 * Utility routines.
1283 *========================================================================*/
1284
1285/*
1286 * Link a new block into a doubly linked list of blocks (of whatever type).
1287 */
1288int /* error */
1289xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
1290 xfs_da_state_blk_t *new_blk)
1291{
1292 xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
1293 xfs_da_args_t *args;
1294 int before=0, error;
1295 xfs_dabuf_t *bp;
1296
1297 /*
1298 * Set up environment.
1299 */
1300 args = state->args;
1301 ASSERT(args != NULL);
1302 old_info = old_blk->bp->data;
1303 new_info = new_blk->bp->data;
1304 ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1305 old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1306 old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1307 ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT));
1308 ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT));
1309 ASSERT(old_blk->magic == new_blk->magic);
1310
1311 switch (old_blk->magic) {
1312#ifdef __KERNEL__
1313 case XFS_ATTR_LEAF_MAGIC:
1314 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1315 break;
1316#endif
1317 case XFS_DIR_LEAF_MAGIC:
1318 ASSERT(XFS_DIR_IS_V1(state->mp));
1319 before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
1320 break;
1321 case XFS_DIR2_LEAFN_MAGIC:
1322 ASSERT(XFS_DIR_IS_V2(state->mp));
1323 before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
1324 break;
1325 case XFS_DA_NODE_MAGIC:
1326 before = xfs_da_node_order(old_blk->bp, new_blk->bp);
1327 break;
1328 }
1329
1330 /*
1331 * Link blocks in appropriate order.
1332 */
1333 if (before) {
1334 /*
1335 * Link new block in before existing block.
1336 */
1337 INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno);
1338 new_info->back = old_info->back; /* INT_: direct copy */
1339 if (INT_GET(old_info->back, ARCH_CONVERT)) {
1340 error = xfs_da_read_buf(args->trans, args->dp,
1341 INT_GET(old_info->back,
1342 ARCH_CONVERT), -1, &bp,
1343 args->whichfork);
1344 if (error)
1345 return(error);
1346 ASSERT(bp != NULL);
1347 tmp_info = bp->data;
1348 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT));
1349 ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno);
1350 INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno);
1351 xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1352 xfs_da_buf_done(bp);
1353 }
1354 INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno);
1355 } else {
1356 /*
1357 * Link new block in after existing block.
1358 */
1359 new_info->forw = old_info->forw; /* INT_: direct copy */
1360 INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno);
1361 if (INT_GET(old_info->forw, ARCH_CONVERT)) {
1362 error = xfs_da_read_buf(args->trans, args->dp,
1363 INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp,
1364 args->whichfork);
1365 if (error)
1366 return(error);
1367 ASSERT(bp != NULL);
1368 tmp_info = bp->data;
1369 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
1370 == INT_GET(old_info->magic, ARCH_CONVERT));
1371 ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
1372 == old_blk->blkno);
1373 INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno);
1374 xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1375 xfs_da_buf_done(bp);
1376 }
1377 INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno);
1378 }
1379
1380 xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1381 xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1382 return(0);
1383}
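/*
 * Illustrative sketch (not from the original source): linking new
 * block N before existing block O, where B is O's old back sibling:
 *
 *	before:   B <-> O
 *	after:    B <-> N <-> O
 *
 * Three blocks are touched: B's forw pointer, N's forw/back pointers,
 * and O's back pointer.  The "after" case is the mirror image using
 * O's forward sibling.
 */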
1384
1385/*
1386 * Compare two intermediate nodes for "order".
1387 */
1388STATIC int
1389xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
1390{
1391 xfs_da_intnode_t *node1, *node2;
1392
1393 node1 = node1_bp->data;
1394 node2 = node2_bp->data;
1395 ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) &&
1396 (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC));
1397 if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
1398 ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) <
1399 INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
1400 (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1401 INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
1402 return(1);
1403 }
1404 return(0);
1405}
1406
1407/*
1408 * Pick up the last hash value from an intermediate node.
1409 */
1410STATIC uint
1411xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
1412{
1413 xfs_da_intnode_t *node;
1414
1415 node = bp->data;
1416 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1417 if (count)
1418 *count = INT_GET(node->hdr.count, ARCH_CONVERT);
1419 if (!node->hdr.count)
1420 return(0);
1421 return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
1422}
1423
1424/*
1425 * Unlink a block from a doubly linked list of blocks.
1426 */
1427int /* error */
1428xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1429 xfs_da_state_blk_t *save_blk)
1430{
1431 xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
1432 xfs_da_args_t *args;
1433 xfs_dabuf_t *bp;
1434 int error;
1435
1436 /*
1437 * Set up environment.
1438 */
1439 args = state->args;
1440 ASSERT(args != NULL);
1441 save_info = save_blk->bp->data;
1442 drop_info = drop_blk->bp->data;
1443 ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1444 save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1445 save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1446 ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
1447 ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
1448 ASSERT(save_blk->magic == drop_blk->magic);
1449 ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
1450 (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
1451 ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
1452 (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
1453
1454 /*
1455 * Unlink the leaf block from the doubly linked chain of leaves.
1456 */
1457 if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) {
1458 save_info->back = drop_info->back; /* INT_: direct copy */
1459 if (INT_GET(drop_info->back, ARCH_CONVERT)) {
1460 error = xfs_da_read_buf(args->trans, args->dp,
1461 INT_GET(drop_info->back,
1462 ARCH_CONVERT), -1, &bp,
1463 args->whichfork);
1464 if (error)
1465 return(error);
1466 ASSERT(bp != NULL);
1467 tmp_info = bp->data;
1468 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
1469 ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno);
1470 INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno);
1471 xfs_da_log_buf(args->trans, bp, 0,
1472 sizeof(*tmp_info) - 1);
1473 xfs_da_buf_done(bp);
1474 }
1475 } else {
1476 save_info->forw = drop_info->forw; /* INT_: direct copy */
1477 if (INT_GET(drop_info->forw, ARCH_CONVERT)) {
1478 error = xfs_da_read_buf(args->trans, args->dp,
1479 INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp,
1480 args->whichfork);
1481 if (error)
1482 return(error);
1483 ASSERT(bp != NULL);
1484 tmp_info = bp->data;
1485 ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
1486 == INT_GET(save_info->magic, ARCH_CONVERT));
1487 ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
1488 == drop_blk->blkno);
1489 INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno);
1490 xfs_da_log_buf(args->trans, bp, 0,
1491 sizeof(*tmp_info) - 1);
1492 xfs_da_buf_done(bp);
1493 }
1494 }
1495
1496 xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1497 return(0);
1498}
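/*
 * Illustrative sketch (not from the original source): dropping D from
 * the chain B <-> D <-> S leaves B <-> S.  Only the surviving sibling
 * (save_blk) and the far neighbor's link are rewritten; the dying
 * block's own pointers are simply abandoned with the block.
 */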
1499
1500/*
1501 * Move a path "forward" or "!forward" one block at the current level.
1502 *
1503 * This routine will adjust a "path" to point to the next block
1504 * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1505 * Btree, including updating pointers to the intermediate nodes between
1506 * the new bottom and the root.
1507 */
1508int /* error */
1509xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
1510 int forward, int release, int *result)
1511{
1512 xfs_da_state_blk_t *blk;
1513 xfs_da_blkinfo_t *info;
1514 xfs_da_intnode_t *node;
1515 xfs_da_args_t *args;
1516 xfs_dablk_t blkno=0;
1517 int level, error;
1518
1519 /*
1520 * Roll up the Btree looking for the first block where our
1521 * current index is not at the edge of the block. Note that
1522 * we skip the bottom layer because we want the sibling block.
1523 */
1524 args = state->args;
1525 ASSERT(args != NULL);
1526 ASSERT(path != NULL);
1527 ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1528 level = (path->active-1) - 1; /* skip bottom layer in path */
1529 for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1530 ASSERT(blk->bp != NULL);
1531 node = blk->bp->data;
1532 ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1533 if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
1534 blk->index++;
1535 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1536 break;
1537 } else if (!forward && (blk->index > 0)) {
1538 blk->index--;
1539 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1540 break;
1541 }
1542 }
1543 if (level < 0) {
1544 *result = XFS_ERROR(ENOENT); /* we're out of our tree */
1545 ASSERT(args->oknoent);
1546 return(0);
1547 }
1548
1549 /*
1550 * Roll down the edge of the subtree until we reach the
1551 * same depth we were at originally.
1552 */
1553 for (blk++, level++; level < path->active; blk++, level++) {
1554 /*
1555 * Release the old block.
1556 * (if it's dirty, trans won't actually let go)
1557 */
1558 if (release)
1559 xfs_da_brelse(args->trans, blk->bp);
1560
1561 /*
1562 * Read the next child block.
1563 */
1564 blk->blkno = blkno;
1565 error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
1566 &blk->bp, args->whichfork);
1567 if (error)
1568 return(error);
1569 ASSERT(blk->bp != NULL);
1570 info = blk->bp->data;
1571 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
1572 INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
1573 INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
1574 blk->magic = INT_GET(info->magic, ARCH_CONVERT);
1575 if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
1576 node = (xfs_da_intnode_t *)info;
1577 blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1578 if (forward)
1579 blk->index = 0;
1580 else
1581 blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
1582 blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
1583 } else {
1584 ASSERT(level == path->active-1);
1585 blk->index = 0;
1586 switch(blk->magic) {
1587#ifdef __KERNEL__
1588 case XFS_ATTR_LEAF_MAGIC:
1589 blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
1590 NULL);
1591 break;
1592#endif
1593 case XFS_DIR_LEAF_MAGIC:
1594 ASSERT(XFS_DIR_IS_V1(state->mp));
1595 blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
1596 NULL);
1597 break;
1598 case XFS_DIR2_LEAFN_MAGIC:
1599 ASSERT(XFS_DIR_IS_V2(state->mp));
1600 blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
1601 NULL);
1602 break;
1603 default:
1604 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
1605 blk->magic ==
1606 XFS_DIRX_LEAF_MAGIC(state->mp));
1607 break;
1608 }
1609 }
1610 }
1611 *result = 0;
1612 return(0);
1613}
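/*
 * Illustrative example (not from the original source): shifting
 * "forward" from the last entry of a leaf.  The first loop walks up
 * until it finds a node whose index is not already at its last entry,
 * bumps that index, then the second loop rolls back down the new
 * subtree taking the 0th entry at each level, ending at the leaf with
 * the next-higher hashvals.
 */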
1614
1615
1616/*========================================================================
1617 * Utility routines.
1618 *========================================================================*/
1619
1620/*
1621 * Implement a simple hash on a character string.
1622 * Rotate the hash value by 7 bits, then XOR each character in.
1623 * This is implemented with some source-level loop unrolling.
1624 */
1625xfs_dahash_t
1626xfs_da_hashname(uchar_t *name, int namelen)
1627{
1628 xfs_dahash_t hash;
1629
1630#ifdef SLOWVERSION
1631 /*
1632 * This is the old one-byte-at-a-time version.
1633 */
1634 for (hash = 0; namelen > 0; namelen--)
1635 hash = *name++ ^ rol32(hash, 7);
1636
1637 return(hash);
1638#else
1639 /*
1640 * Do four characters at a time as long as we can.
1641 */
1642 for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1643 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1644 (name[3] << 0) ^ rol32(hash, 7 * 4);
1645
1646 /*
1647 * Now do the rest of the characters.
1648 */
1649 switch (namelen) {
1650 case 3:
1651 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
1652 rol32(hash, 7 * 3);
1653 case 2:
1654 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
1655 case 1:
1656 return (name[0] << 0) ^ rol32(hash, 7 * 1);
1657 case 0:
1658 return hash;
1659 }
1660 /* NOTREACHED */
1661#endif
1662 return 0; /* keep gcc happy */
1663}
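/*
 * Worked example (illustrative, not in the original source): hashing
 * the 3-byte name "foo".  The unrolled path takes the namelen == 3
 * case directly:
 *
 *	('f' << 14) ^ ('o' << 7) ^ ('o' << 0) ^ rol32(0, 21)
 *	= 0x198000  ^ 0x3780    ^ 0x6f       ^ 0
 *	= 0x19b7ef
 *
 * The SLOWVERSION loop computes the same value one byte at a time:
 * 0 -> 0x66 -> 0x336f -> 0x19b7ef.
 */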
1664
1665/*
1666 * Add a block to the btree, mapping it at the first unused offset
1667 * in the file.  Return the new block number to the caller.
1668 */
1669int
1670xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
1671{
1672 xfs_fileoff_t bno, b;
1673 xfs_bmbt_irec_t map;
1674 xfs_bmbt_irec_t *mapp;
1675 xfs_inode_t *dp;
1676 int nmap, error, w, count, c, got, i, mapi;
1677 xfs_fsize_t size;
1678 xfs_trans_t *tp;
1679 xfs_mount_t *mp;
1680
1681 dp = args->dp;
1682 mp = dp->i_mount;
1683 w = args->whichfork;
1684 tp = args->trans;
1685 /*
1686 * For new directories adjust the file offset and block count.
1687 */
1688 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
1689 bno = mp->m_dirleafblk;
1690 count = mp->m_dirblkfsbs;
1691 } else {
1692 bno = 0;
1693 count = 1;
1694 }
1695 /*
1696 * Find a spot in the file space to put the new block.
1697 */
1698 if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
1699 return error;
1700 }
1701 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
1702 ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
1703 /*
1704 * Try mapping it in one filesystem block.
1705 */
1706 nmap = 1;
1707 ASSERT(args->firstblock != NULL);
1708 if ((error = xfs_bmapi(tp, dp, bno, count,
1709 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
1710 XFS_BMAPI_CONTIG,
1711 args->firstblock, args->total, &map, &nmap,
1712 args->flist))) {
1713 return error;
1714 }
1715 ASSERT(nmap <= 1);
1716 if (nmap == 1) {
1717 mapp = &map;
1718 mapi = 1;
1719 }
1720 /*
1721 * If we didn't get it and the block might work if fragmented,
1722 * try without the CONTIG flag. Loop until we get it all.
1723 */
1724 else if (nmap == 0 && count > 1) {
1725 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
1726 for (b = bno, mapi = 0; b < bno + count; ) {
1727 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
1728 c = (int)(bno + count - b);
1729 if ((error = xfs_bmapi(tp, dp, b, c,
1730 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
1731 XFS_BMAPI_METADATA,
1732 args->firstblock, args->total,
1733 &mapp[mapi], &nmap, args->flist))) {
1734 kmem_free(mapp, sizeof(*mapp) * count);
1735 return error;
1736 }
1737 if (nmap < 1)
1738 break;
1739 mapi += nmap;
1740 b = mapp[mapi - 1].br_startoff +
1741 mapp[mapi - 1].br_blockcount;
1742 }
1743 } else {
1744 mapi = 0;
1745 mapp = NULL;
1746 }
1747 /*
1748 * Count the blocks we got, make sure it matches the total.
1749 */
1750 for (i = 0, got = 0; i < mapi; i++)
1751 got += mapp[i].br_blockcount;
1752 if (got != count || mapp[0].br_startoff != bno ||
1753 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
1754 bno + count) {
1755 if (mapp != &map)
1756 kmem_free(mapp, sizeof(*mapp) * count);
1757 return XFS_ERROR(ENOSPC);
1758 }
1759 if (mapp != &map)
1760 kmem_free(mapp, sizeof(*mapp) * count);
1761 *new_blkno = (xfs_dablk_t)bno;
1762 /*
1763 * For version 1 directories, adjust the file size if it changed.
1764 */
1765 if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
1766 ASSERT(mapi == 1);
1767 if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
1768 return error;
1769 size = XFS_FSB_TO_B(mp, bno);
1770 if (size != dp->i_d.di_size) {
1771 dp->i_d.di_size = size;
1772 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1773 }
1774 }
1775 return 0;
1776}
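/*
 * Illustrative note (not from the original source): for a v2
 * directory with m_dirblkfsbs == 4, the function first asks xfs_bmapi
 * for one contiguous 4-fsb extent (XFS_BMAPI_CONTIG).  If that fails
 * with nmap == 0, it retries piecewise without CONTIG -- e.g. extents
 * of 2, 1 and 1 fsbs -- and the final coverage check only demands
 * that the pieces tile [bno, bno + 4) exactly, not that they are
 * physically contiguous.
 */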
1777
1778/*
1779 * Ick. We need to always be able to remove a btree block, even
1780 * if there's no space reservation because the filesystem is full.
1781 * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
1782 * It swaps the target block with the last block in the file. The
1783 * last block in the file can always be removed, since unmapping it
1784 * cannot cause a bmap btree split.
1785 */
1786STATIC int
1787xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
1788 xfs_dabuf_t **dead_bufp)
1789{
1790 xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
1791 xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
1792 xfs_fileoff_t lastoff;
1793 xfs_inode_t *ip;
1794 xfs_trans_t *tp;
1795 xfs_mount_t *mp;
1796 int error, w, entno, level, dead_level;
1797 xfs_da_blkinfo_t *dead_info, *sib_info;
1798 xfs_da_intnode_t *par_node, *dead_node;
1799 xfs_dir_leafblock_t *dead_leaf;
1800 xfs_dir2_leaf_t *dead_leaf2;
1801 xfs_dahash_t dead_hash;
1802
1803 dead_buf = *dead_bufp;
1804 dead_blkno = *dead_blknop;
1805 tp = args->trans;
1806 ip = args->dp;
1807 w = args->whichfork;
1808 ASSERT(w == XFS_DATA_FORK);
1809 mp = ip->i_mount;
1810 if (XFS_DIR_IS_V2(mp)) {
1811 lastoff = mp->m_dirfreeblk;
1812 error = xfs_bmap_last_before(tp, ip, &lastoff, w);
1813 } else
1814 error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
1815 if (error)
1816 return error;
1817 if (unlikely(lastoff == 0)) {
1818 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
1819 mp);
1820 return XFS_ERROR(EFSCORRUPTED);
1821 }
1822 /*
1823 * Read the last block in the btree space.
1824 */
1825 last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
1826 if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
1827 return error;
1828 /*
1829 * Copy the last block into the dead buffer and log it.
1830 */
1831 memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize);
1832 xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
1833 dead_info = dead_buf->data;
1834 /*
1835 * Get values from the moved block.
1836 */
1837 if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
1838 ASSERT(XFS_DIR_IS_V1(mp));
1839 dead_leaf = (xfs_dir_leafblock_t *)dead_info;
1840 dead_level = 0;
1841 dead_hash =
1842 INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1843 } else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
1844 ASSERT(XFS_DIR_IS_V2(mp));
1845 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
1846 dead_level = 0;
1847 dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1848 } else {
1849 ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
1850 dead_node = (xfs_da_intnode_t *)dead_info;
1851 dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
1852 dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1853 }
1854 sib_buf = par_buf = NULL;
1855 /*
1856 * If the moved block has a left sibling, fix up the pointers.
1857 */
1858 if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
1859 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1860 goto done;
1861 sib_info = sib_buf->data;
1862 if (unlikely(
1863 INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
1864 INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT))) {
1865 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
1866 XFS_ERRLEVEL_LOW, mp);
1867 error = XFS_ERROR(EFSCORRUPTED);
1868 goto done;
1869 }
1870 INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
1871 xfs_da_log_buf(tp, sib_buf,
1872 XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
1873 sizeof(sib_info->forw)));
1874 xfs_da_buf_done(sib_buf);
1875 sib_buf = NULL;
1876 }
1877 /*
1878 * If the moved block has a right sibling, fix up the pointers.
1879 */
1880 if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
1881 if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
1882 goto done;
1883 sib_info = sib_buf->data;
1884 if (unlikely(
1885 INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
1886 || INT_GET(sib_info->magic, ARCH_CONVERT)
1887 != INT_GET(dead_info->magic, ARCH_CONVERT))) {
1888 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
1889 XFS_ERRLEVEL_LOW, mp);
1890 error = XFS_ERROR(EFSCORRUPTED);
1891 goto done;
1892 }
1893 INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
1894 xfs_da_log_buf(tp, sib_buf,
1895 XFS_DA_LOGRANGE(sib_info, &sib_info->back,
1896 sizeof(sib_info->back)));
1897 xfs_da_buf_done(sib_buf);
1898 sib_buf = NULL;
1899 }
1900 par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
1901 level = -1;
1902 /*
1903 * Walk down the tree looking for the parent of the moved block.
1904 */
1905 for (;;) {
1906 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1907 goto done;
1908 par_node = par_buf->data;
1909 if (unlikely(
1910 INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
1911 (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1))) {
1912 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
1913 XFS_ERRLEVEL_LOW, mp);
1914 error = XFS_ERROR(EFSCORRUPTED);
1915 goto done;
1916 }
1917 level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
1918 for (entno = 0;
1919 entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
1920 INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
1921 entno++)
1922 continue;
1923 if (unlikely(entno == INT_GET(par_node->hdr.count, ARCH_CONVERT))) {
1924 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
1925 XFS_ERRLEVEL_LOW, mp);
1926 error = XFS_ERROR(EFSCORRUPTED);
1927 goto done;
1928 }
1929 par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
1930 if (level == dead_level + 1)
1931 break;
1932 xfs_da_brelse(tp, par_buf);
1933 par_buf = NULL;
1934 }
1935 /*
1936 * We're in the right parent block.
1937 * Look for the right entry.
1938 */
1939 for (;;) {
1940 for (;
1941 entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
1942 INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
1943 entno++)
1944 continue;
1945 if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
1946 break;
1947 par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
1948 xfs_da_brelse(tp, par_buf);
1949 par_buf = NULL;
1950 if (unlikely(par_blkno == 0)) {
1951 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
1952 XFS_ERRLEVEL_LOW, mp);
1953 error = XFS_ERROR(EFSCORRUPTED);
1954 goto done;
1955 }
1956 if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
1957 goto done;
1958 par_node = par_buf->data;
1959 if (unlikely(
1960 INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
1961 INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)) {
1962 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
1963 XFS_ERRLEVEL_LOW, mp);
1964 error = XFS_ERROR(EFSCORRUPTED);
1965 goto done;
1966 }
1967 entno = 0;
1968 }
1969 /*
1970 * Update the parent entry pointing to the moved block.
1971 */
1972 INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
1973 xfs_da_log_buf(tp, par_buf,
1974 XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
1975 sizeof(par_node->btree[entno].before)));
1976 xfs_da_buf_done(par_buf);
1977 xfs_da_buf_done(dead_buf);
1978 *dead_blknop = last_blkno;
1979 *dead_bufp = last_buf;
1980 return 0;
1981done:
1982 if (par_buf)
1983 xfs_da_brelse(tp, par_buf);
1984 if (sib_buf)
1985 xfs_da_brelse(tp, sib_buf);
1986 xfs_da_brelse(tp, last_buf);
1987 return error;
1988}
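/*
 * Illustrative sketch (not from the original source) of the swap trick
 * above, for a directory whose btree space ends at block L and where
 * block D must die:
 *
 *	1. copy L's contents over D and log all of D;
 *	2. repoint L's left/right siblings at D;
 *	3. walk down from the root to the parent entry whose "before"
 *	   pointer is L, and repoint it at D;
 *	4. hand L back to the caller as the block to bunmapi, which now
 *	   sits at the end of the file and can always be unmapped.
 */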
1989
1990/*
1991 * Remove a btree block from a directory or attribute.
1992 */
1993int
1994xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
1995 xfs_dabuf_t *dead_buf)
1996{
1997 xfs_inode_t *dp;
1998 int done, error, w, count;
1999 xfs_fileoff_t bno;
2000 xfs_fsize_t size;
2001 xfs_trans_t *tp;
2002 xfs_mount_t *mp;
2003
2004 dp = args->dp;
2005 w = args->whichfork;
2006 tp = args->trans;
2007 mp = dp->i_mount;
2008 if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
2009 count = mp->m_dirblkfsbs;
2010 else
2011 count = 1;
2012 for (;;) {
2013 /*
2014 * Remove extents. If we get ENOSPC for a dir we have to move
2015 * the last block to the place we want to kill.
2016 */
2017 if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
2018 XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
2019 0, args->firstblock, args->flist,
2020 &done)) == ENOSPC) {
2021 if (w != XFS_DATA_FORK)
2022 goto done;
2023 if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
2024 &dead_buf)))
2025 goto done;
2026 } else if (error)
2027 goto done;
2028 else
2029 break;
2030 }
2031 ASSERT(done);
2032 xfs_da_binval(tp, dead_buf);
2033 /*
2034 * Adjust the directory size for version 1.
2035 */
2036 if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
2037 if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
2038 return error;
2039 size = XFS_FSB_TO_B(dp->i_mount, bno);
2040 if (size != dp->i_d.di_size) {
2041 dp->i_d.di_size = size;
2042 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2043 }
2044 }
2045 return 0;
2046done:
2047 xfs_da_binval(tp, dead_buf);
2048 return error;
2049}
2050
2051/*
2052 * See if the mapping(s) for this btree block are valid, i.e.
2053 * don't contain holes, are logically contiguous, and cover the whole range.
2054 */
2055STATIC int
2056xfs_da_map_covers_blocks(
2057 int nmap,
2058 xfs_bmbt_irec_t *mapp,
2059 xfs_dablk_t bno,
2060 int count)
2061{
2062 int i;
2063 xfs_fileoff_t off;
2064
2065 for (i = 0, off = bno; i < nmap; i++) {
2066 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2067 mapp[i].br_startblock == DELAYSTARTBLOCK) {
2068 return 0;
2069 }
2070 if (off != mapp[i].br_startoff) {
2071 return 0;
2072 }
2073 off += mapp[i].br_blockcount;
2074 }
2075 return off == bno + count;
2076}
2077
2078/*
2079 * Make a dabuf.
2080 * Used for get_buf (caller 0), read_buf (1), read_bufr (2), and reada_buf (3).
2081 */
2082STATIC int
2083xfs_da_do_buf(
2084 xfs_trans_t *trans,
2085 xfs_inode_t *dp,
2086 xfs_dablk_t bno,
2087 xfs_daddr_t *mappedbnop,
2088 xfs_dabuf_t **bpp,
2089 int whichfork,
2090 int caller,
2091 inst_t *ra)
2092{
2093 xfs_buf_t *bp = NULL;
2094 xfs_buf_t **bplist;
2095 int error=0;
2096 int i;
2097 xfs_bmbt_irec_t map;
2098 xfs_bmbt_irec_t *mapp;
2099 xfs_daddr_t mappedbno;
2100 xfs_mount_t *mp;
2101 int nbplist=0;
2102 int nfsb;
2103 int nmap;
2104 xfs_dabuf_t *rbp;
2105
2106 mp = dp->i_mount;
2107 if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
2108 nfsb = mp->m_dirblkfsbs;
2109 else
2110 nfsb = 1;
2111 mappedbno = *mappedbnop;
2112 /*
2113 * Caller doesn't have a mapping. -2 means don't complain
2114 * if we land in a hole.
2115 */
2116 if (mappedbno == -1 || mappedbno == -2) {
2117 /*
2118 * Optimize the one-block case.
2119 */
2120 if (nfsb == 1) {
2121 xfs_fsblock_t fsb;
2122
2123 if ((error =
2124 xfs_bmapi_single(trans, dp, whichfork, &fsb,
2125 (xfs_fileoff_t)bno))) {
2126 return error;
2127 }
2128 mapp = &map;
2129 if (fsb == NULLFSBLOCK) {
2130 nmap = 0;
2131 } else {
2132 map.br_startblock = fsb;
2133 map.br_startoff = (xfs_fileoff_t)bno;
2134 map.br_blockcount = 1;
2135 nmap = 1;
2136 }
2137 } else {
2138 mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
2139 nmap = nfsb;
2140 if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
2141 nfsb,
2142 XFS_BMAPI_METADATA |
2143 XFS_BMAPI_AFLAG(whichfork),
2144 NULL, 0, mapp, &nmap, NULL)))
2145 goto exit0;
2146 }
2147 } else {
2148 map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2149 map.br_startoff = (xfs_fileoff_t)bno;
2150 map.br_blockcount = nfsb;
2151 mapp = &map;
2152 nmap = 1;
2153 }
2154 if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
2155 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
2156 if (unlikely(error == EFSCORRUPTED)) {
2157 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2158 int i;
2159 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n",
2160 (long long)bno);
2161 cmn_err(CE_ALERT, "dir: inode %lld\n",
2162 (long long)dp->i_ino);
2163 for (i = 0; i < nmap; i++) {
2164 cmn_err(CE_ALERT,
2165 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n",
2166 i,
2167 (long long)mapp[i].br_startoff,
2168 (long long)mapp[i].br_startblock,
2169 (long long)mapp[i].br_blockcount,
2170 mapp[i].br_state);
2171 }
2172 }
2173 XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2174 XFS_ERRLEVEL_LOW, mp);
2175 }
2176 goto exit0;
2177 }
2178 if (caller != 3 && nmap > 1) {
2179 bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
2180 nbplist = 0;
2181 } else
2182 bplist = NULL;
2183 /*
2184 * Turn the mapping(s) into buffer(s).
2185 */
2186 for (i = 0; i < nmap; i++) {
2187 int nmapped;
2188
2189 mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
2190 if (i == 0)
2191 *mappedbnop = mappedbno;
2192 nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
2193 switch (caller) {
2194 case 0:
2195 bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
2196 mappedbno, nmapped, 0);
2197 error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
2198 break;
2199 case 1:
2200#ifndef __KERNEL__
2201 case 2:
2202#endif
2203 bp = NULL;
2204 error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
2205 mappedbno, nmapped, 0, &bp);
2206 break;
2207#ifdef __KERNEL__
2208 case 3:
2209 xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
2210 error = 0;
2211 bp = NULL;
2212 break;
2213#endif
2214 }
2215 if (error) {
2216 if (bp)
2217 xfs_trans_brelse(trans, bp);
2218 goto exit1;
2219 }
2220 if (!bp)
2221 continue;
2222 if (caller == 1) {
2223 if (whichfork == XFS_ATTR_FORK) {
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
2225 XFS_ATTR_BTREE_REF);
2226 } else {
2227 XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
2228 XFS_DIR_BTREE_REF);
2229 }
2230 }
2231 if (bplist) {
2232 bplist[nbplist++] = bp;
2233 }
2234 }
2235 /*
2236 * Build a dabuf structure.
2237 */
2238 if (bplist) {
2239 rbp = xfs_da_buf_make(nbplist, bplist, ra);
2240 } else if (bp)
2241 rbp = xfs_da_buf_make(1, &bp, ra);
2242 else
2243 rbp = NULL;
2244 /*
2245 * For read_buf, check the magic number.
2246 */
2247 if (caller == 1) {
2248 xfs_dir2_data_t *data;
2249 xfs_dir2_free_t *free;
2250 xfs_da_blkinfo_t *info;
2251 uint magic, magic1;
2252
2253 info = rbp->data;
2254 data = rbp->data;
2255 free = rbp->data;
2256 magic = INT_GET(info->magic, ARCH_CONVERT);
2257 magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
2258 if (unlikely(
2259 XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
2260 (magic != XFS_DIR_LEAF_MAGIC) &&
2261 (magic != XFS_ATTR_LEAF_MAGIC) &&
2262 (magic != XFS_DIR2_LEAF1_MAGIC) &&
2263 (magic != XFS_DIR2_LEAFN_MAGIC) &&
2264 (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
2265 (magic1 != XFS_DIR2_DATA_MAGIC) &&
2266 (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
2267 mp, XFS_ERRTAG_DA_READ_BUF,
2268 XFS_RANDOM_DA_READ_BUF))) {
2269 xfs_buftrace("DA READ ERROR", rbp->bps[0]);
2270 XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
2271 XFS_ERRLEVEL_LOW, mp, info);
2272 error = XFS_ERROR(EFSCORRUPTED);
2273 xfs_da_brelse(trans, rbp);
2274 nbplist = 0;
2275 goto exit1;
2276 }
2277 }
2278 if (bplist) {
2279 kmem_free(bplist, sizeof(*bplist) * nmap);
2280 }
2281 if (mapp != &map) {
2282 kmem_free(mapp, sizeof(*mapp) * nfsb);
2283 }
2284 if (bpp)
2285 *bpp = rbp;
2286 return 0;
2287exit1:
2288 if (bplist) {
2289 for (i = 0; i < nbplist; i++)
2290 xfs_trans_brelse(trans, bplist[i]);
2291 kmem_free(bplist, sizeof(*bplist) * nmap);
2292 }
2293exit0:
2294 if (mapp != &map)
2295 kmem_free(mapp, sizeof(*mapp) * nfsb);
2296 if (bpp)
2297 *bpp = NULL;
2298 return error;
2299}
2300
2301/*
2302 * Get a buffer for the dir/attr block.
2303 */
2304int
2305xfs_da_get_buf(
2306 xfs_trans_t *trans,
2307 xfs_inode_t *dp,
2308 xfs_dablk_t bno,
2309 xfs_daddr_t mappedbno,
2310 xfs_dabuf_t **bpp,
2311 int whichfork)
2312{
2313 return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
2314 (inst_t *)__return_address);
2315}
2316
2317/*
2318 * Get a buffer for the dir/attr block, fill in the contents.
2319 */
2320int
2321xfs_da_read_buf(
2322 xfs_trans_t *trans,
2323 xfs_inode_t *dp,
2324 xfs_dablk_t bno,
2325 xfs_daddr_t mappedbno,
2326 xfs_dabuf_t **bpp,
2327 int whichfork)
2328{
2329 return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
2330 (inst_t *)__return_address);
2331}
2332
2333/*
2334 * Readahead the dir/attr block.
2335 */
2336xfs_daddr_t
2337xfs_da_reada_buf(
2338 xfs_trans_t *trans,
2339 xfs_inode_t *dp,
2340 xfs_dablk_t bno,
2341 int whichfork)
2342{
2343 xfs_daddr_t rval;
2344
2345 rval = -1;
2346 if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
2347 (inst_t *)__return_address))
2348 return -1;
2349 else
2350 return rval;
2351}
2352
2353/*
2354 * Calculate the number of bits needed to hold i different values.
2355 */
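/*
 * For example, holding 5 distinct values takes 3 bits (2^3 = 8 >= 5),
 * while i == 0 and i == 1 both yield 0.
 */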
2356uint
2357xfs_da_log2_roundup(uint i)
2358{
2359 uint rval;
2360
2361 for (rval = 0; rval < NBBY * sizeof(i); rval++) {
2362 if ((1 << rval) >= i)
2363 break;
2364 }
2365 return(rval);
2366}
2367
2368kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
2369kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */
2370
2371/*
2372 * Allocate a dir-state structure.
2373 * We don't put them on the stack since they're large.
2374 */
2375xfs_da_state_t *
2376xfs_da_state_alloc(void)
2377{
2378 return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
2379}
2380
2381/*
2382 * Kill the altpath contents of a da-state structure.
2383 */
2384void
2385xfs_da_state_kill_altpath(xfs_da_state_t *state)
2386{
2387 int i;
2388
2389 for (i = 0; i < state->altpath.active; i++) {
2390 if (state->altpath.blk[i].bp) {
2391 if (state->altpath.blk[i].bp != state->path.blk[i].bp)
2392 xfs_da_buf_done(state->altpath.blk[i].bp);
2393 state->altpath.blk[i].bp = NULL;
2394 }
2395 }
2396 state->altpath.active = 0;
2397}
2398
2399/*
2400 * Free a da-state structure.
2401 */
2402void
2403xfs_da_state_free(xfs_da_state_t *state)
2404{
2405 int i;
2406
2407 xfs_da_state_kill_altpath(state);
2408 for (i = 0; i < state->path.active; i++) {
2409 if (state->path.blk[i].bp)
2410 xfs_da_buf_done(state->path.blk[i].bp);
2411 }
2412 if (state->extravalid && state->extrablk.bp)
2413 xfs_da_buf_done(state->extrablk.bp);
2414#ifdef DEBUG
2415 memset((char *)state, 0, sizeof(*state));
2416#endif /* DEBUG */
2417 kmem_zone_free(xfs_da_state_zone, state);
2418}
2419
2420#ifdef XFS_DABUF_DEBUG
2421xfs_dabuf_t *xfs_dabuf_global_list;
2422lock_t xfs_dabuf_global_lock;
2423#endif
2424
2425/*
2426 * Create a dabuf.
2427 */
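/*
 * With a single underlying buffer the dabuf's data pointer aliases the
 * buffer memory directly; with several buffers the contents are copied
 * into one contiguous allocation so callers can treat a logical block
 * as flat memory, and xfs_da_buf_clean() copies dirty data back.
 */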
2428/* ARGSUSED */
2429STATIC xfs_dabuf_t *
2430xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
2431{
2432 xfs_buf_t *bp;
2433 xfs_dabuf_t *dabuf;
2434 int i;
2435 int off;
2436
2437 if (nbuf == 1)
2438 dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
2439 else
2440 dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
2441 dabuf->dirty = 0;
2442#ifdef XFS_DABUF_DEBUG
2443 dabuf->ra = ra;
2444 dabuf->target = XFS_BUF_TARGET(bps[0]);
2445 dabuf->blkno = XFS_BUF_ADDR(bps[0]);
2446#endif
2447 if (nbuf == 1) {
2448 dabuf->nbuf = 1;
2449 bp = bps[0];
2450 dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
2451 dabuf->data = XFS_BUF_PTR(bp);
2452 dabuf->bps[0] = bp;
2453 } else {
2454 dabuf->nbuf = nbuf;
2455 for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
2456 dabuf->bps[i] = bp = bps[i];
2457 dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
2458 }
2459 dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
2460 for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2461 bp = bps[i];
2462 memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
2463 XFS_BUF_COUNT(bp));
2464 }
2465 }
2466#ifdef XFS_DABUF_DEBUG
2467 {
2468 SPLDECL(s);
2469 xfs_dabuf_t *p;
2470
2471 s = mutex_spinlock(&xfs_dabuf_global_lock);
2472 for (p = xfs_dabuf_global_list; p; p = p->next) {
2473 ASSERT(p->blkno != dabuf->blkno ||
2474 p->target != dabuf->target);
2475 }
2476 dabuf->prev = NULL;
2477 if (xfs_dabuf_global_list)
2478 xfs_dabuf_global_list->prev = dabuf;
2479 dabuf->next = xfs_dabuf_global_list;
2480 xfs_dabuf_global_list = dabuf;
2481 mutex_spinunlock(&xfs_dabuf_global_lock, s);
2482 }
2483#endif
2484 return dabuf;
2485}
2486
2487/*
2488 * Un-dirty a dabuf.
2489 */
2490STATIC void
2491xfs_da_buf_clean(xfs_dabuf_t *dabuf)
2492{
2493 xfs_buf_t *bp;
2494 int i;
2495 int off;
2496
2497 if (dabuf->dirty) {
2498 ASSERT(dabuf->nbuf > 1);
2499 dabuf->dirty = 0;
2500 for (i = off = 0; i < dabuf->nbuf;
2501 i++, off += XFS_BUF_COUNT(bp)) {
2502 bp = dabuf->bps[i];
2503 memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
2504 XFS_BUF_COUNT(bp));
2505 }
2506 }
2507}
2508
2509/*
2510 * Release a dabuf.
2511 */
2512void
2513xfs_da_buf_done(xfs_dabuf_t *dabuf)
2514{
2515 ASSERT(dabuf);
2516 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2517 if (dabuf->dirty)
2518 xfs_da_buf_clean(dabuf);
2519 if (dabuf->nbuf > 1)
2520 kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
2521#ifdef XFS_DABUF_DEBUG
2522 {
2523 SPLDECL(s);
2524
2525 s = mutex_spinlock(&xfs_dabuf_global_lock);
2526 if (dabuf->prev)
2527 dabuf->prev->next = dabuf->next;
2528 else
2529 xfs_dabuf_global_list = dabuf->next;
2530 if (dabuf->next)
2531 dabuf->next->prev = dabuf->prev;
2532 mutex_spinunlock(&xfs_dabuf_global_lock, s);
2533 }
2534 memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf));
2535#endif
2536 if (dabuf->nbuf == 1)
2537 kmem_zone_free(xfs_dabuf_zone, dabuf);
2538 else
2539 kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
2540}
2541
2542/*
2543 * Log transaction from a dabuf.
2544 */
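/*
 * For a multi-buffer dabuf the [first, last] byte range is clipped
 * against each underlying buffer and logged relative to that buffer's
 * own starting offset.
 */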
2545void
2546xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
2547{
2548 xfs_buf_t *bp;
2549 uint f;
2550 int i;
2551 uint l;
2552 int off;
2553
2554 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2555 if (dabuf->nbuf == 1) {
2556 ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
2557 xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
2558 return;
2559 }
2560 dabuf->dirty = 1;
2561 ASSERT(first <= last);
2562 for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
2563 bp = dabuf->bps[i];
2564 f = off;
2565 l = f + XFS_BUF_COUNT(bp) - 1;
2566 if (f < first)
2567 f = first;
2568 if (l > last)
2569 l = last;
2570 if (f <= l)
2571 xfs_trans_log_buf(tp, bp, f - off, l - off);
2572		/*
2573		 * B_DONE is set by xfs_trans_log_buf.
2574		 * If we don't set it on a new buffer (a get, not a read),
2575		 * and we never put anything in the buffer, it won't be set;
2576		 * at commit it is released into the cache, and a later
2577		 * read will fail.
2578		 */
2579 else if (!(XFS_BUF_ISDONE(bp)))
2580 XFS_BUF_DONE(bp);
2581 }
2582 ASSERT(last < off);
2583}
2584
2585/*
2586 * Release dabuf from a transaction.
2587 * Have to free up the dabuf before the buffers are released,
2588 * since the synchronization on the dabuf is really the lock on the buffer.
2589 */
2590void
2591xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
2592{
2593 xfs_buf_t *bp;
2594 xfs_buf_t **bplist;
2595 int i;
2596 int nbuf;
2597
2598 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2599 if ((nbuf = dabuf->nbuf) == 1) {
2600 bplist = &bp;
2601 bp = dabuf->bps[0];
2602 } else {
2603 bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
2604 memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
2605 }
2606 xfs_da_buf_done(dabuf);
2607 for (i = 0; i < nbuf; i++)
2608 xfs_trans_brelse(tp, bplist[i]);
2609 if (bplist != &bp)
2610 kmem_free(bplist, nbuf * sizeof(*bplist));
2611}
2612
2613/*
2614 * Invalidate dabuf from a transaction.
2615 */
2616void
2617xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
2618{
2619 xfs_buf_t *bp;
2620 xfs_buf_t **bplist;
2621 int i;
2622 int nbuf;
2623
2624 ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
2625 if ((nbuf = dabuf->nbuf) == 1) {
2626 bplist = &bp;
2627 bp = dabuf->bps[0];
2628 } else {
2629 bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
2630 memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist));
2631 }
2632 xfs_da_buf_done(dabuf);
2633 for (i = 0; i < nbuf; i++)
2634 xfs_trans_binval(tp, bplist[i]);
2635 if (bplist != &bp)
2636 kmem_free(bplist, nbuf * sizeof(*bplist));
2637}
2638
2639/*
2640 * Get the first daddr from a dabuf.
2641 */
2642xfs_daddr_t
2643xfs_da_blkno(xfs_dabuf_t *dabuf)
2644{
2645 ASSERT(dabuf->nbuf);
2646 ASSERT(dabuf->data);
2647 return XFS_BUF_ADDR(dabuf->bps[0]);
2648}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
new file mode 100644
index 000000000000..9fc699d96995
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.h
@@ -0,0 +1,335 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DA_BTREE_H__
33#define __XFS_DA_BTREE_H__
34
35struct xfs_buf;
36struct xfs_bmap_free;
37struct xfs_inode;
38struct xfs_mount;
39struct xfs_trans;
40struct zone;
41
42/*========================================================================
43 * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
44 *========================================================================*/
45
46/*
47 * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
48 *
49 * It is used to manage a doubly linked list of all blocks at the same
50 * level in the Btree, and to identify which type of block this is.
51 */
52#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
53#define XFS_DIR_LEAF_MAGIC 0xfeeb /* magic number: directory leaf blks */
54#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
55#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
56#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
57
58#define XFS_DIRX_LEAF_MAGIC(mp) \
59 (XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
60
61typedef struct xfs_da_blkinfo {
62	xfs_dablk_t forw;			/* following block in list */
63	xfs_dablk_t back;			/* previous block in list */
64 __uint16_t magic; /* validity check on block */
65 __uint16_t pad; /* unused */
66} xfs_da_blkinfo_t;
67
68/*
69 * This is the structure of the root and intermediate nodes in the Btree.
70 * The leaf nodes are defined above.
71 *
72 * Entries are not packed.
73 *
74 * Since we have duplicate keys, use a binary search but always follow
75 * all matches in the block, not just the first match found.
76 */
77#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
78
79typedef struct xfs_da_intnode {
80 struct xfs_da_node_hdr { /* constant-structure header block */
81 xfs_da_blkinfo_t info; /* block type, links, etc. */
82 __uint16_t count; /* count of active entries */
83 __uint16_t level; /* level above leaves (leaf == 0) */
84 } hdr;
85 struct xfs_da_node_entry {
86 xfs_dahash_t hashval; /* hash value for this descendant */
87 xfs_dablk_t before; /* Btree block before this key */
88 } btree[1]; /* variable sized array of keys */
89} xfs_da_intnode_t;
90typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
91typedef struct xfs_da_node_entry xfs_da_node_entry_t;
92
93#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
94
95/*
96 * Macros used by directory code to interface to the filesystem.
97 */
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE)
99int xfs_lbsize(struct xfs_mount *mp);
100#define XFS_LBSIZE(mp) xfs_lbsize(mp)
101#else
102#define XFS_LBSIZE(mp) ((mp)->m_sb.sb_blocksize)
103#endif
104#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG)
105int xfs_lblog(struct xfs_mount *mp);
106#define XFS_LBLOG(mp) xfs_lblog(mp)
107#else
108#define XFS_LBLOG(mp) ((mp)->m_sb.sb_blocklog)
109#endif
110
111/*
112 * Macros used by directory code to interface to the kernel
113 */
114
115/*
116 * Macros used to manipulate directory off_t's
117 */
118#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY)
119__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno,
120 int entry);
121#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
122 xfs_da_make_bnoentry(mp,bno,entry)
123#else
124#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
125 (((bno) << (mp)->m_dircook_elog) | (entry))
126#endif
127#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE)
128xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry,
129 xfs_dahash_t hash);
130#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
131 xfs_da_make_cookie(mp,bno,entry,hash)
132#else
133#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
134 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
135#endif
136#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH)
137xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie);
138#define XFS_DA_COOKIE_HASH(mp,cookie) xfs_da_cookie_hash(mp,cookie)
139#else
140#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)(cookie))
141#endif
142#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO)
143xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie);
144#define XFS_DA_COOKIE_BNO(mp,cookie) xfs_da_cookie_bno(mp,cookie)
145#else
146#define XFS_DA_COOKIE_BNO(mp,cookie) \
147 (((xfs_off_t)(cookie) >> 31) == -1LL ? \
148 (xfs_dablk_t)0 : \
149 (xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32)))
150#endif
151#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY)
152int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie);
153#define XFS_DA_COOKIE_ENTRY(mp,cookie) xfs_da_cookie_entry(mp,cookie)
154#else
155#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
156 (((xfs_off_t)(cookie) >> 31) == -1LL ? \
157 (xfs_dablk_t)0 : \
158 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
159 ((1 << (mp)->m_dircook_elog) - 1)))
160#endif
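/*
 * A directory cookie thus packs ((bno << m_dircook_elog) | entry) into
 * its high 32 bits and the name hash into the low 32 bits.
 */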
161
162
163/*========================================================================
164 * Btree searching and modification structure definitions.
165 *========================================================================*/
166
167/*
168 * Structure to ease passing around component names.
169 */
170typedef struct xfs_da_args {
171	uchar_t		*name;		/* string (may not be NULL terminated) */
172	int		namelen;	/* length of string (may exclude NULL) */
173	uchar_t		*value;		/* set of bytes (may contain NULLs) */
174 int valuelen; /* length of value */
175 int flags; /* argument flags (eg: ATTR_NOCREATE) */
176 xfs_dahash_t hashval; /* hash value of name */
177 xfs_ino_t inumber; /* input/output inode number */
178 struct xfs_inode *dp; /* directory inode to manipulate */
179 xfs_fsblock_t *firstblock; /* ptr to firstblock for bmap calls */
180 struct xfs_bmap_free *flist; /* ptr to freelist for bmap_finish */
181 struct xfs_trans *trans; /* current trans (changes over time) */
182 xfs_extlen_t total; /* total blocks needed, for 1st bmap */
183 int whichfork; /* data or attribute fork */
184 xfs_dablk_t blkno; /* blkno of attr leaf of interest */
185 int index; /* index of attr of interest in blk */
186 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
187 int rmtblkcnt; /* remote attr value block count */
188 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
189 int index2; /* index of 2nd attr in blk */
190 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
191 int rmtblkcnt2; /* remote attr value block count */
192 unsigned char justcheck; /* T/F: check for ok with no space */
193 unsigned char rename; /* T/F: this is an atomic rename op */
194 unsigned char addname; /* T/F: this is an add operation */
195 unsigned char oknoent; /* T/F: ok to return ENOENT, else die */
196} xfs_da_args_t;
197
198/*
199 * Structure to describe buffer(s) for a block.
200 * This is needed in the directory version 2 format case, when
201 * multiple non-contiguous fsblocks might be needed to cover one
202 * logical directory block.
203 * If the buffer count is 1 then the data pointer points to the
204 * same place as the b_addr field for the buffer, else to kmem_alloced memory.
205 */
206typedef struct xfs_dabuf {
207 int nbuf; /* number of buffer pointers present */
208 short dirty; /* data needs to be copied back */
209 short bbcount; /* how large is data in bbs */
210 void *data; /* pointer for buffers' data */
211#ifdef XFS_DABUF_DEBUG
212 inst_t *ra; /* return address of caller to make */
213 struct xfs_dabuf *next; /* next in global chain */
214 struct xfs_dabuf *prev; /* previous in global chain */
215 struct xfs_buftarg *target; /* device for buffer */
216 xfs_daddr_t blkno; /* daddr first in bps[0] */
217#endif
218 struct xfs_buf *bps[1]; /* actually nbuf of these */
219} xfs_dabuf_t;
220#define XFS_DA_BUF_SIZE(n) \
221 (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
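/* e.g. XFS_DA_BUF_SIZE(3) sizes the struct with room for bps[0..2] */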
222
223#ifdef XFS_DABUF_DEBUG
224extern xfs_dabuf_t *xfs_dabuf_global_list;
225#endif
226
227/*
228 * Storage for holding state during Btree searches and split/join ops.
229 *
230 * Only need space for 5 intermediate nodes. With a minimum of 62-way
231 * fanout to the Btree, we can support over 900 million directory blocks,
232 * which is slightly more than enough.
233 */
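/* (62^5 is roughly 916 million.) */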
234typedef struct xfs_da_state_blk {
235 xfs_dabuf_t *bp; /* buffer containing block */
236 xfs_dablk_t blkno; /* filesystem blkno of buffer */
237 xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
238 int index; /* relevant index into block */
239 xfs_dahash_t hashval; /* last hash value in block */
240 int magic; /* blk's magic number, ie: blk type */
241} xfs_da_state_blk_t;
242
243typedef struct xfs_da_state_path {
244 int active; /* number of active levels */
245 xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH];
246} xfs_da_state_path_t;
247
248typedef struct xfs_da_state {
249 xfs_da_args_t *args; /* filename arguments */
250 struct xfs_mount *mp; /* filesystem mount point */
251 unsigned int blocksize; /* logical block size */
252 unsigned int node_ents; /* how many entries in danode */
253 xfs_da_state_path_t path; /* search/split paths */
254 xfs_da_state_path_t altpath; /* alternate path for join */
255 unsigned char inleaf; /* insert into 1->lf, 0->splf */
256 unsigned char extravalid; /* T/F: extrablk is in use */
257 unsigned char extraafter; /* T/F: extrablk is after new */
258 xfs_da_state_blk_t extrablk; /* for double-splits on leafs */
259 /* for dirv2 extrablk is data */
260} xfs_da_state_t;
261
262/*
263 * Utility macros to aid in logging changed structure fields.
264 */
265#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
266#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
267 (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
268 (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
269
270
271#ifdef __KERNEL__
272/*========================================================================
273 * Function prototypes for the kernel.
274 *========================================================================*/
275
276/*
277 * Routines used for growing the Btree.
278 */
279int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
280 xfs_dabuf_t **bpp, int whichfork);
281int xfs_da_split(xfs_da_state_t *state);
282
283/*
284 * Routines used for shrinking the Btree.
285 */
286int xfs_da_join(xfs_da_state_t *state);
287void xfs_da_fixhashpath(xfs_da_state_t *state,
288 xfs_da_state_path_t *path_to_to_fix);
289
290/*
291 * Routines used for finding things in the Btree.
292 */
293int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
294int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
295 int forward, int release, int *result);
296/*
297 * Utility routines.
298 */
299int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
300 xfs_da_state_blk_t *save_blk);
301int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
302 xfs_da_state_blk_t *new_blk);
303
304/*
305 * Utility routines.
306 */
307int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
308int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
309 xfs_dablk_t bno, xfs_daddr_t mappedbno,
310 xfs_dabuf_t **bp, int whichfork);
311int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
312 xfs_dablk_t bno, xfs_daddr_t mappedbno,
313 xfs_dabuf_t **bpp, int whichfork);
314xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
315 xfs_dablk_t bno, int whichfork);
316int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
317 xfs_dabuf_t *dead_buf);
318
319uint xfs_da_hashname(uchar_t *name_string, int name_length);
320uint xfs_da_log2_roundup(uint i);
321xfs_da_state_t *xfs_da_state_alloc(void);
322void xfs_da_state_free(xfs_da_state_t *state);
323void xfs_da_state_kill_altpath(xfs_da_state_t *state);
324
325void xfs_da_buf_done(xfs_dabuf_t *dabuf);
326void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
327 uint last);
328void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
329void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
330xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
331
332extern struct kmem_zone *xfs_da_state_zone;
333#endif /* __KERNEL__ */
334
335#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
new file mode 100644
index 000000000000..08d551a17347
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.c
@@ -0,0 +1,387 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_ag.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_attr_sf.h"
50#include "xfs_dir_sf.h"
51#include "xfs_dir2_sf.h"
52#include "xfs_dinode.h"
53#include "xfs_inode_item.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_ialloc.h"
57#include "xfs_itable.h"
58#include "xfs_dfrag.h"
59#include "xfs_error.h"
60#include "xfs_mac.h"
61#include "xfs_rw.h"
62
63/*
64 * Syssgi interface for swapext
65 */
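/*
 * The two inodes are locked in ascending inode-number order to avoid
 * deadlock, the caller's stat snapshot is checked against the current
 * ctime/mtime to detect concurrent modification, and the data forks
 * plus the on-disk nblocks/nextents/format fields are then swapped
 * under a transaction.
 */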
66int
67xfs_swapext(
68 xfs_swapext_t __user *sxp)
69{
70 xfs_swapext_t sx;
71 xfs_inode_t *ip=NULL, *tip=NULL, *ips[2];
72 xfs_trans_t *tp;
73 xfs_mount_t *mp;
74 xfs_bstat_t *sbp;
75 struct file *fp = NULL, *tfp = NULL;
76 vnode_t *vp, *tvp;
77 bhv_desc_t *bdp, *tbdp;
78 vn_bhv_head_t *bhp, *tbhp;
79 uint lock_flags=0;
80 int ilf_fields, tilf_fields;
81 int error = 0;
82 xfs_ifork_t tempif, *ifp, *tifp;
83 __uint64_t tmp;
84 int aforkblks = 0;
85 int taforkblks = 0;
86 int locked = 0;
87
88 if (copy_from_user(&sx, sxp, sizeof(sx)))
89 return XFS_ERROR(EFAULT);
90
91 /* Pull information for the target fd */
92 if (((fp = fget((int)sx.sx_fdtarget)) == NULL) ||
93 ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL)) {
94 error = XFS_ERROR(EINVAL);
95 goto error0;
96 }
97
98 bhp = VN_BHV_HEAD(vp);
99 bdp = vn_bhv_lookup(bhp, &xfs_vnodeops);
100 if (bdp == NULL) {
101 error = XFS_ERROR(EBADF);
102 goto error0;
103 } else {
104 ip = XFS_BHVTOI(bdp);
105 }
106
107 if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) ||
108 ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) {
109 error = XFS_ERROR(EINVAL);
110 goto error0;
111 }
112
113 tbhp = VN_BHV_HEAD(tvp);
114 tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops);
115 if (tbdp == NULL) {
116 error = XFS_ERROR(EBADF);
117 goto error0;
118 } else {
119 tip = XFS_BHVTOI(tbdp);
120 }
121
122 if (ip->i_mount != tip->i_mount) {
123 error = XFS_ERROR(EINVAL);
124 goto error0;
125 }
126
127 if (ip->i_ino == tip->i_ino) {
128 error = XFS_ERROR(EINVAL);
129 goto error0;
130 }
131
132 mp = ip->i_mount;
133
134 sbp = &sx.sx_stat;
135
136 if (XFS_FORCED_SHUTDOWN(mp)) {
137 error = XFS_ERROR(EIO);
138 goto error0;
139 }
140
141 locked = 1;
142
143 /* Lock in i_ino order */
144 if (ip->i_ino < tip->i_ino) {
145 ips[0] = ip;
146 ips[1] = tip;
147 } else {
148 ips[0] = tip;
149 ips[1] = ip;
150 }
151 lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
152 xfs_lock_inodes(ips, 2, 0, lock_flags);
153
154 /* Check permissions */
155 error = xfs_iaccess(ip, S_IWUSR, NULL);
156 if (error)
157 goto error0;
158
159 error = xfs_iaccess(tip, S_IWUSR, NULL);
160 if (error)
161 goto error0;
162
163 /* Verify that both files have the same format */
164 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
165 error = XFS_ERROR(EINVAL);
166 goto error0;
167 }
168
169 /* Verify both files are either real-time or non-realtime */
170 if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
171 (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
172 error = XFS_ERROR(EINVAL);
173 goto error0;
174 }
175
176 /* Should never get a local format */
177 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
178 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
179 error = XFS_ERROR(EINVAL);
180 goto error0;
181 }
182
183 if (VN_CACHED(tvp) != 0)
184 xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore),
185 (loff_t)0, 0, 0);
186
187 /* Verify O_DIRECT for ftmp */
188 if (VN_CACHED(tvp) != 0) {
189 error = XFS_ERROR(EINVAL);
190 goto error0;
191 }
192
193 /* Verify all data are being swapped */
194 if (sx.sx_offset != 0 ||
195 sx.sx_length != ip->i_d.di_size ||
196 sx.sx_length != tip->i_d.di_size) {
197 error = XFS_ERROR(EFAULT);
198 goto error0;
199 }
200
201	/*
202	 * If the target has extended attributes, the tmp file
203	 * must have them too, in order to ensure the correct
204	 * data fork format.
205	 */
206 if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
207 error = XFS_ERROR(EINVAL);
208 goto error0;
209 }
210
211	/*
212	 * Compare the current change & modify times with those
213	 * passed in. If they differ, we abort this swap.
214	 * This is the mechanism that assures the calling
215	 * process that the file was not changed out from
216	 * under it.
217	 */
218 if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
219 (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
220 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
221 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
222 error = XFS_ERROR(EBUSY);
223 goto error0;
224 }
225
226	/* We need to fail if the file is memory mapped. Once we have tossed
227	 * all existing pages, a page fault has no option but to go to the
228	 * filesystem for pages. Because the page fault calls VOP_READ (or
229	 * write in the case of autogrow), it blocks on the iolock until we
230	 * have switched the extents.
231	 */
232 if (VN_MAPPED(vp)) {
233 error = XFS_ERROR(EBUSY);
234 goto error0;
235 }
236
237 xfs_iunlock(ip, XFS_ILOCK_EXCL);
238 xfs_iunlock(tip, XFS_ILOCK_EXCL);
239
240 /*
241 * There is a race condition here since we gave up the
242 * ilock. However, the data fork will not change since
243 * we have the iolock (locked for truncation too) so we
244 * are safe. We don't really care if non-io related
245 * fields change.
246 */
247
248 VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
249
250 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
251 if ((error = xfs_trans_reserve(tp, 0,
252 XFS_ICHANGE_LOG_RES(mp), 0,
253 0, 0))) {
254 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
255 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
256 xfs_trans_cancel(tp, 0);
257 return error;
258 }
259 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
260
261 /*
262 * Count the number of extended attribute blocks
263 */
264 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
265 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
266 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
267 if (error) {
268 xfs_iunlock(ip, lock_flags);
269 xfs_iunlock(tip, lock_flags);
270 xfs_trans_cancel(tp, 0);
271 return error;
272 }
273 }
274 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
275 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
276 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
277 &taforkblks);
278 if (error) {
279 xfs_iunlock(ip, lock_flags);
280 xfs_iunlock(tip, lock_flags);
281 xfs_trans_cancel(tp, 0);
282 return error;
283 }
284 }
285
286 /*
287 * Swap the data forks of the inodes
288 */
289 ifp = &ip->i_df;
290 tifp = &tip->i_df;
291 tempif = *ifp; /* struct copy */
292 *ifp = *tifp; /* struct copy */
293 *tifp = tempif; /* struct copy */
294
295 /*
296 * Fix the on-disk inode values
297 */
298 tmp = (__uint64_t)ip->i_d.di_nblocks;
299 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
300 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
301
302 tmp = (__uint64_t) ip->i_d.di_nextents;
303 ip->i_d.di_nextents = tip->i_d.di_nextents;
304 tip->i_d.di_nextents = tmp;
305
306 tmp = (__uint64_t) ip->i_d.di_format;
307 ip->i_d.di_format = tip->i_d.di_format;
308 tip->i_d.di_format = tmp;
309
310 ilf_fields = XFS_ILOG_CORE;
311
312 switch(ip->i_d.di_format) {
313 case XFS_DINODE_FMT_EXTENTS:
314 /* If the extents fit in the inode, fix the
315 * pointer. Otherwise it's already NULL or
316 * pointing to the extent.
317 */
318 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
319 ifp->if_u1.if_extents =
320 ifp->if_u2.if_inline_ext;
321 }
322 ilf_fields |= XFS_ILOG_DEXT;
323 break;
324 case XFS_DINODE_FMT_BTREE:
325 ilf_fields |= XFS_ILOG_DBROOT;
326 break;
327 }
328
329 tilf_fields = XFS_ILOG_CORE;
330
331 switch(tip->i_d.di_format) {
332 case XFS_DINODE_FMT_EXTENTS:
333 /* If the extents fit in the inode, fix the
334 * pointer. Otherwise it's already NULL or
335 * pointing to the extent.
336 */
337 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
338 tifp->if_u1.if_extents =
339 tifp->if_u2.if_inline_ext;
340 }
341 tilf_fields |= XFS_ILOG_DEXT;
342 break;
343 case XFS_DINODE_FMT_BTREE:
344 tilf_fields |= XFS_ILOG_DBROOT;
345 break;
346 }
347
348 /*
349 * Increment vnode ref counts since xfs_trans_commit &
350 * xfs_trans_cancel will both unlock the inodes and
351 * decrement the associated ref counts.
352 */
353 VN_HOLD(vp);
354 VN_HOLD(tvp);
355
356 xfs_trans_ijoin(tp, ip, lock_flags);
357 xfs_trans_ijoin(tp, tip, lock_flags);
358
359 xfs_trans_log_inode(tp, ip, ilf_fields);
360 xfs_trans_log_inode(tp, tip, tilf_fields);
361
362 /*
363 * If this is a synchronous mount, make sure that the
364 * transaction goes to disk before returning to the user.
365 */
366 if (mp->m_flags & XFS_MOUNT_WSYNC) {
367 xfs_trans_set_sync(tp);
368 }
369
370 error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
371
372 fput(fp);
373 fput(tfp);
374
375 return error;
376
377 error0:
378 if (locked) {
379 xfs_iunlock(ip, lock_flags);
380 xfs_iunlock(tip, lock_flags);
381 }
382
383 if (fp != NULL) fput(fp);
384 if (tfp != NULL) fput(tfp);
385
386 return error;
387}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
new file mode 100644
index 000000000000..904860594b8f
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DFRAG_H__
33#define __XFS_DFRAG_H__
34
35/*
36 * Structure passed to xfs_swapext
37 */
38
39typedef struct xfs_swapext
40{
41 __int64_t sx_version; /* version */
42 __int64_t sx_fdtarget; /* fd of target file */
43 __int64_t sx_fdtmp; /* fd of tmp file */
44 xfs_off_t sx_offset; /* offset into file */
45	xfs_off_t	sx_length;	/* length from offset */
46 char sx_pad[16]; /* pad space, unused */
47 xfs_bstat_t sx_stat; /* stat of target b4 copy */
48} xfs_swapext_t;
49
50/*
51 * Version flag
52 */
53#define XFS_SX_VERSION 0
54
55#ifdef __KERNEL__
56/*
57 * Prototypes for visible xfs_dfrag.c routines.
58 */
59
60/*
61 * Syscall interface for xfs_swapext
62 */
63int xfs_swapext(struct xfs_swapext __user *sx);
64
65#endif /* __KERNEL__ */
66
67#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
new file mode 100644
index 000000000000..f5c932b064e6
--- /dev/null
+++ b/fs/xfs/xfs_dinode.h
@@ -0,0 +1,418 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DINODE_H__
33#define __XFS_DINODE_H__
34
35struct xfs_buf;
36struct xfs_mount;
37
38#define XFS_DINODE_VERSION_1 1
39#define XFS_DINODE_VERSION_2 2
40#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION)
41int xfs_dinode_good_version(int v);
42#define XFS_DINODE_GOOD_VERSION(v) xfs_dinode_good_version(v)
43#else
44#define XFS_DINODE_GOOD_VERSION(v) (((v) == XFS_DINODE_VERSION_1) || \
45 ((v) == XFS_DINODE_VERSION_2))
46#endif
47#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
48
49/*
50 * Disk inode structure.
51 * This is just the header; the inode is expanded to fill a variable size
52 * with the last field expanding. It is split into the core and "other"
53 * because we only need the core part in the in-core inode.
54 */
55typedef struct xfs_timestamp {
56 __int32_t t_sec; /* timestamp seconds */
57 __int32_t t_nsec; /* timestamp nanoseconds */
58} xfs_timestamp_t;
59
60/*
61 * Note: Coordinate changes to this structure with the XFS_DI_* #defines
62 * below and the offsets table in xfs_ialloc_log_di().
63 */
64typedef struct xfs_dinode_core
65{
66 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
67 __uint16_t di_mode; /* mode and type of file */
68 __int8_t di_version; /* inode version */
69 __int8_t di_format; /* format of di_c data */
70 __uint16_t di_onlink; /* old number of links to file */
71 __uint32_t di_uid; /* owner's user id */
72 __uint32_t di_gid; /* owner's group id */
73 __uint32_t di_nlink; /* number of links to file */
74 __uint16_t di_projid; /* owner's project id */
75 __uint8_t di_pad[8]; /* unused, zeroed space */
76 __uint16_t di_flushiter; /* incremented on flush */
77 xfs_timestamp_t di_atime; /* time last accessed */
78 xfs_timestamp_t di_mtime; /* time last modified */
79 xfs_timestamp_t di_ctime; /* time created/inode modified */
80 xfs_fsize_t di_size; /* number of bytes in file */
81 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
82 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
83 xfs_extnum_t di_nextents; /* number of extents in data fork */
84 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
85 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
86 __int8_t di_aformat; /* format of attr fork's data */
87 __uint32_t di_dmevmask; /* DMIG event mask */
88 __uint16_t di_dmstate; /* DMIG state info */
89 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
90 __uint32_t di_gen; /* generation number */
91} xfs_dinode_core_t;
92
93#define DI_MAX_FLUSH 0xffff
94
95typedef struct xfs_dinode
96{
97 xfs_dinode_core_t di_core;
98 /*
99 * In adding anything between the core and the union, be
100 * sure to update the macros like XFS_LITINO below and
101 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
102 */
103 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
104 union {
105 xfs_bmdr_block_t di_bmbt; /* btree root block */
106 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
107 xfs_dir_shortform_t di_dirsf; /* shortform directory */
108 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
109 char di_c[1]; /* local contents */
110 xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */
111 uuid_t di_muuid; /* mount point value */
112 char di_symlink[1]; /* local symbolic link */
113 } di_u;
114 union {
115 xfs_bmdr_block_t di_abmbt; /* btree root block */
116 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
117 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
118 } di_a;
119} xfs_dinode_t;
120
121/*
122 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
123 * Since the pathconf interface is signed, we use 2^31 - 1 instead.
124 * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
125 */
126#define XFS_MAXLINK ((1U << 31) - 1U)
127#define XFS_MAXLINK_1 65535U
128
129/*
130 * Bit names for logging disk inodes only
131 */
132#define XFS_DI_MAGIC 0x0000001
133#define XFS_DI_MODE 0x0000002
134#define XFS_DI_VERSION 0x0000004
135#define XFS_DI_FORMAT 0x0000008
136#define XFS_DI_ONLINK 0x0000010
137#define XFS_DI_UID 0x0000020
138#define XFS_DI_GID 0x0000040
139#define XFS_DI_NLINK 0x0000080
140#define XFS_DI_PROJID 0x0000100
141#define XFS_DI_PAD 0x0000200
142#define XFS_DI_ATIME 0x0000400
143#define XFS_DI_MTIME 0x0000800
144#define XFS_DI_CTIME 0x0001000
145#define XFS_DI_SIZE 0x0002000
146#define XFS_DI_NBLOCKS 0x0004000
147#define XFS_DI_EXTSIZE 0x0008000
148#define XFS_DI_NEXTENTS 0x0010000
149#define XFS_DI_NAEXTENTS 0x0020000
150#define XFS_DI_FORKOFF 0x0040000
151#define XFS_DI_AFORMAT 0x0080000
152#define XFS_DI_DMEVMASK 0x0100000
153#define XFS_DI_DMSTATE 0x0200000
154#define XFS_DI_FLAGS 0x0400000
155#define XFS_DI_GEN 0x0800000
156#define XFS_DI_NEXT_UNLINKED 0x1000000
157#define XFS_DI_U 0x2000000
158#define XFS_DI_A 0x4000000
159#define XFS_DI_NUM_BITS 27
160#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
161#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
162
163/*
164 * Values for di_format
165 */
166typedef enum xfs_dinode_fmt
167{
168 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */
169 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */
170 /* LNK: di_symlink */
171 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */
172 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
173 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
174} xfs_dinode_fmt_t;
175
176/*
177 * Inode minimum and maximum sizes.
178 */
179#define XFS_DINODE_MIN_LOG 8
180#define XFS_DINODE_MAX_LOG 11
181#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG)
182#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG)
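/* i.e. on-disk inodes range from 256 (1 << 8) to 2048 (1 << 11) bytes */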
183
184/*
185 * Inode size for given fs.
186 */
187#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO)
188int xfs_litino(struct xfs_mount *mp);
189#define XFS_LITINO(mp) xfs_litino(mp)
190#else
191#define XFS_LITINO(mp) ((mp)->m_litino)
192#endif
193#define XFS_BROOT_SIZE_ADJ \
194 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
195
196/*
197 * Fork identifiers. Here so utilities can use them without including
198 * xfs_inode.h.
199 */
200#define XFS_DATA_FORK 0
201#define XFS_ATTR_FORK 1
202
203/*
204 * Inode data & attribute fork sizes, per inode.
205 */
206#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q)
207int xfs_cfork_q_disk(xfs_dinode_core_t *dcp);
208int xfs_cfork_q(xfs_dinode_core_t *dcp);
209#define XFS_CFORK_Q_DISK(dcp) xfs_cfork_q_disk(dcp)
210#define XFS_CFORK_Q(dcp) xfs_cfork_q(dcp)
211#else
212#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0)
213#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0)
214
215#endif
216#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF)
217int xfs_cfork_boff_disk(xfs_dinode_core_t *dcp);
218int xfs_cfork_boff(xfs_dinode_core_t *dcp);
219#define XFS_CFORK_BOFF_DISK(dcp) xfs_cfork_boff_disk(dcp)
220#define XFS_CFORK_BOFF(dcp) xfs_cfork_boff(dcp)
221#else
222#define XFS_CFORK_BOFF_DISK(dcp) ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3))
223#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3))
224
225#endif
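/* di_forkoff is stored in 8-byte units, so BOFF converts it to bytes */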
226#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE)
227int xfs_cfork_dsize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
228int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
229#define XFS_CFORK_DSIZE_DISK(dcp,mp) xfs_cfork_dsize_disk(dcp,mp)
230#define XFS_CFORK_DSIZE(dcp,mp) xfs_cfork_dsize(dcp,mp)
231#else
232#define XFS_CFORK_DSIZE_DISK(dcp,mp) \
233 (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp))
234#define XFS_CFORK_DSIZE(dcp,mp) \
235 (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp))
236
237#endif
238#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE)
239int xfs_cfork_asize_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
240int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
241#define XFS_CFORK_ASIZE_DISK(dcp,mp) xfs_cfork_asize_disk(dcp,mp)
242#define XFS_CFORK_ASIZE(dcp,mp) xfs_cfork_asize(dcp,mp)
243#else
244#define XFS_CFORK_ASIZE_DISK(dcp,mp) \
245 (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0)
246#define XFS_CFORK_ASIZE(dcp,mp) \
247 (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0)
248
249#endif
250#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE)
251int xfs_cfork_size_disk(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
252int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
253#define XFS_CFORK_SIZE_DISK(dcp,mp,w) xfs_cfork_size_disk(dcp,mp,w)
254#define XFS_CFORK_SIZE(dcp,mp,w) xfs_cfork_size(dcp,mp,w)
255#else
256#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \
257 ((w) == XFS_DATA_FORK ? \
258 XFS_CFORK_DSIZE_DISK(dcp, mp) : \
259 XFS_CFORK_ASIZE_DISK(dcp, mp))
260#define XFS_CFORK_SIZE(dcp,mp,w) \
261 ((w) == XFS_DATA_FORK ? \
262 XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp))
263
264#endif
265
266#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE)
267int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp);
268#define XFS_DFORK_DSIZE(dip,mp) xfs_dfork_dsize(dip,mp)
269#else
270#define XFS_DFORK_DSIZE(dip,mp) XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp)
271
272#endif
273#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE)
274int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp);
275#define XFS_DFORK_ASIZE(dip,mp) xfs_dfork_asize(dip,mp)
276#else
277#define XFS_DFORK_ASIZE(dip,mp) XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp)
278
279#endif
280#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE)
281int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w);
282#define XFS_DFORK_SIZE(dip,mp,w) xfs_dfork_size(dip,mp,w)
283#else
284#define XFS_DFORK_SIZE(dip,mp,w) XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w)
285
286#endif
287
288/*
289 * Macros for accessing per-fork disk inode information.
290 */
291#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q)
292int xfs_dfork_q(xfs_dinode_t *dip);
293#define XFS_DFORK_Q(dip) xfs_dfork_q(dip)
294#else
295#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core)
296
297#endif
298#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF)
299int xfs_dfork_boff(xfs_dinode_t *dip);
300#define XFS_DFORK_BOFF(dip) xfs_dfork_boff(dip)
301#else
302#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core)
303
304#endif
305#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR)
306char *xfs_dfork_dptr(xfs_dinode_t *dip);
307#define XFS_DFORK_DPTR(dip) xfs_dfork_dptr(dip)
308#else
309#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c)
310
311#endif
312#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR)
313char *xfs_dfork_aptr(xfs_dinode_t *dip);
314#define XFS_DFORK_APTR(dip) xfs_dfork_aptr(dip)
315#else
316#define XFS_DFORK_APTR(dip) ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip))
317
318#endif
319#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR)
320char *xfs_dfork_ptr(xfs_dinode_t *dip, int w);
321#define XFS_DFORK_PTR(dip,w) xfs_dfork_ptr(dip,w)
322#else
323#define XFS_DFORK_PTR(dip,w) \
324 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
325
326#endif
327#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT)
328int xfs_cfork_format(xfs_dinode_core_t *dcp, int w);
329#define XFS_CFORK_FORMAT(dcp,w) xfs_cfork_format(dcp,w)
330#else
331#define XFS_CFORK_FORMAT(dcp,w) \
332 ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat)
333
334#endif
335#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET)
336void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n);
337#define XFS_CFORK_FMT_SET(dcp,w,n) xfs_cfork_fmt_set(dcp,w,n)
338#else
339#define XFS_CFORK_FMT_SET(dcp,w,n) \
340 ((w) == XFS_DATA_FORK ? \
341 ((dcp)->di_format = (n)) : \
342 ((dcp)->di_aformat = (n)))
343
344#endif
345#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS)
346int xfs_cfork_nextents_disk(xfs_dinode_core_t *dcp, int w);
347int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w);
348#define XFS_CFORK_NEXTENTS_DISK(dcp,w) xfs_cfork_nextents_disk(dcp,w)
349#define XFS_CFORK_NEXTENTS(dcp,w) xfs_cfork_nextents(dcp,w)
350#else
351#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \
352 ((w) == XFS_DATA_FORK ? \
353 INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \
354 INT_GET((dcp)->di_anextents, ARCH_CONVERT))
355#define XFS_CFORK_NEXTENTS(dcp,w) \
356 ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents)
357
358#endif
359#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET)
360void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n);
361#define XFS_CFORK_NEXT_SET(dcp,w,n) xfs_cfork_next_set(dcp,w,n)
362#else
363#define XFS_CFORK_NEXT_SET(dcp,w,n) \
364 ((w) == XFS_DATA_FORK ? \
365 ((dcp)->di_nextents = (n)) : \
366 ((dcp)->di_anextents = (n)))
367
368#endif
369
370#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS)
371int xfs_dfork_nextents(xfs_dinode_t *dip, int w);
372#define XFS_DFORK_NEXTENTS(dip,w) xfs_dfork_nextents(dip,w)
373#else
374#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w)
375#endif
376
377#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE)
378xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp);
379#define XFS_BUF_TO_DINODE(bp) xfs_buf_to_dinode(bp)
380#else
381#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)(XFS_BUF_PTR(bp)))
382#endif
383
384/*
385 * Values for di_flags
386 * There should be a one-to-one correspondence between these flags and the
387 * XFS_XFLAG_s.
388 */
389#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */
390#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */
391#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */
392#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */
393#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */
394#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */
395#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */
396#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */
397#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */
398#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */
399#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */
400#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
401#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
402#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
403#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT)
404#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT)
405#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT)
406#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT)
407#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT)
408#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT)
409#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT)
410#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT)
411
412#define XFS_DIFLAG_ANY \
413 (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \
414 XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \
415 XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \
416 XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS)
417
418#endif /* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
new file mode 100644
index 000000000000..ba30bc7682f2
--- /dev/null
+++ b/fs/xfs/xfs_dir.c
@@ -0,0 +1,1223 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_alloc.h"
49#include "xfs_btree.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_bmap.h"
56#include "xfs_da_btree.h"
57#include "xfs_dir_leaf.h"
58#include "xfs_error.h"
59
60/*
61 * xfs_dir.c
62 *
63 * Provide the external interfaces to manage directories.
64 */
65
66/*========================================================================
67 * Function prototypes for the kernel.
68 *========================================================================*/
69
70/*
71 * Functions for the dirops interfaces.
72 */
73static void xfs_dir_mount(struct xfs_mount *mp);
74
75static int xfs_dir_isempty(struct xfs_inode *dp);
76
77static int xfs_dir_init(struct xfs_trans *trans,
78 struct xfs_inode *dir,
79 struct xfs_inode *parent_dir);
80
81static int xfs_dir_createname(struct xfs_trans *trans,
82 struct xfs_inode *dp,
83 char *name_string,
84 int name_len,
85 xfs_ino_t inode_number,
86 xfs_fsblock_t *firstblock,
87 xfs_bmap_free_t *flist,
88 xfs_extlen_t total);
89
90static int xfs_dir_lookup(struct xfs_trans *tp,
91 struct xfs_inode *dp,
92 char *name_string,
93 int name_length,
94 xfs_ino_t *inode_number);
95
96static int xfs_dir_removename(struct xfs_trans *trans,
97 struct xfs_inode *dp,
98 char *name_string,
99 int name_length,
100 xfs_ino_t ino,
101 xfs_fsblock_t *firstblock,
102 xfs_bmap_free_t *flist,
103 xfs_extlen_t total);
104
105static int xfs_dir_getdents(struct xfs_trans *tp,
106 struct xfs_inode *dp,
107 struct uio *uiop,
108 int *eofp);
109
110static int xfs_dir_replace(struct xfs_trans *tp,
111 struct xfs_inode *dp,
112 char *name_string,
113 int name_length,
114 xfs_ino_t inode_number,
115 xfs_fsblock_t *firstblock,
116 xfs_bmap_free_t *flist,
117 xfs_extlen_t total);
118
119static int xfs_dir_canenter(struct xfs_trans *tp,
120 struct xfs_inode *dp,
121 char *name_string,
122 int name_length);
123
124static int xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
125 xfs_dinode_t *dip);
126
127xfs_dirops_t xfsv1_dirops = {
128 .xd_mount = xfs_dir_mount,
129 .xd_isempty = xfs_dir_isempty,
130 .xd_init = xfs_dir_init,
131 .xd_createname = xfs_dir_createname,
132 .xd_lookup = xfs_dir_lookup,
133 .xd_removename = xfs_dir_removename,
134 .xd_getdents = xfs_dir_getdents,
135 .xd_replace = xfs_dir_replace,
136 .xd_canenter = xfs_dir_canenter,
137 .xd_shortform_validate_ondisk = xfs_dir_shortform_validate_ondisk,
138 .xd_shortform_to_single = xfs_dir_shortform_to_leaf,
139};
140
141/*
142 * Internal routines when dirsize == XFS_LBSIZE(mp).
143 */
144STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
145STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
146 int *total_namebytes);
147STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
148 uio_t *uio, int *eofp,
149 xfs_dirent_t *dbp,
150 xfs_dir_put_t put);
151STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
152
153/*
154 * Internal routines when dirsize > XFS_LBSIZE(mp).
155 */
156STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
157STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
158STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
159STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
160 uio_t *uio, int *eofp,
161 xfs_dirent_t *dbp,
162 xfs_dir_put_t put);
163STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
164
165#if defined(XFS_DIR_TRACE)
166ktrace_t *xfs_dir_trace_buf;
167#endif
168
169
170/*========================================================================
171 * Overall external interface routines.
172 *========================================================================*/
173
174xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
175
176/*
177 * One-time startup routine called from xfs_init().
178 */
179void
180xfs_dir_startup(void)
181{
182 xfs_dir_hash_dot = xfs_da_hashname(".", 1);
183 xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
184}
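/*
 * Added commentary: the hash used above is xfs_da_hashname(), defined
 * in xfs_da_btree.c.  For reference it is essentially the rolling
 * function sketched below; treat this as an illustrative copy under
 * that assumption, not the authoritative definition.
 */
#if 0	/* sketch only, the real routine lives in xfs_da_btree.c */
xfs_dahash_t
xfs_da_hashname(uchar_t *name, int namelen)
{
	xfs_dahash_t hash;

	/*
	 * Do four characters at a time as long as we can.
	 */
	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		       (name[3] << 0) ^ rol32(hash, 7 * 4);
	/*
	 * Now do the rest of the characters.
	 */
	switch (namelen) {
	case 3:
		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		       rol32(hash, 7 * 3);
	case 2:
		return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	case 1:
		return (name[0] << 0) ^ rol32(hash, 7 * 1);
	default: /* case 0 */
		return hash;
	}
}
#endif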
185
186/*
187 * Initialize directory-related fields in the mount structure.
188 */
189static void
190xfs_dir_mount(xfs_mount_t *mp)
191{
192 uint shortcount, leafcount, count;
193
194 mp->m_dirversion = 1;
195 shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) /
196 (uint)sizeof(xfs_dir_sf_entry_t);
197 leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) /
198 ((uint)sizeof(xfs_dir_leaf_entry_t) +
199 (uint)sizeof(xfs_dir_leaf_name_t));
200 count = shortcount > leafcount ? shortcount : leafcount;
201 mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
202 ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
203 mp->m_dir_node_ents = mp->m_attr_node_ents =
204 (XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
205 (uint)sizeof(xfs_da_node_entry_t);
206 mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
207 mp->m_dirblksize = mp->m_sb.sb_blocksize;
208 mp->m_dirblkfsbs = 1;
209}
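/*
 * Added commentary: xfs_da_log2_roundup() (in xfs_da_btree.c) returns
 * the smallest n with (1 << n) >= i, so m_dircook_elog above is just
 * enough bits to address any entry index within a directory block.
 * A minimal sketch of that helper, assuming NBBY bits per byte:
 */
#if 0	/* sketch only */
uint
xfs_da_log2_roundup(uint i)
{
	uint rval;

	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
		if ((1 << rval) >= i)
			break;
	}
	return rval;
}
#endif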
210
211/*
212 * Return 1 if directory contains only "." and "..".
213 */
214static int
215xfs_dir_isempty(xfs_inode_t *dp)
216{
217 xfs_dir_sf_hdr_t *hdr;
218
219 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
220 if (dp->i_d.di_size == 0)
221 return(1);
222 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
223 return(0);
224 hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
225 return(hdr->count == 0);
226}
227
228/*
229 * Initialize a directory with its "." and ".." entries.
230 */
231static int
232xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
233{
234 xfs_da_args_t args;
235 int error;
236
237 memset((char *)&args, 0, sizeof(args));
238 args.dp = dir;
239 args.trans = trans;
240
241 ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR);
242 if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
243 return error;
244
245 return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
246}
247
248/*
249 * Generic handler routine to add a name to a directory.
250 * Transitions directory from shortform to Btree as necessary.
251 */
252static int /* error */
253xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
254 int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
255 xfs_bmap_free_t *flist, xfs_extlen_t total)
256{
257 xfs_da_args_t args;
258 int retval, newsize, done;
259
260 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
261
262 if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
263 return (retval);
264
265 XFS_STATS_INC(xs_dir_create);
266 /*
267 * Fill in the arg structure for this request.
268 */
269 args.name = name;
270 args.namelen = namelen;
271 args.hashval = xfs_da_hashname(name, namelen);
272 args.inumber = inum;
273 args.dp = dp;
274 args.firstblock = firstblock;
275 args.flist = flist;
276 args.total = total;
277 args.whichfork = XFS_DATA_FORK;
278 args.trans = trans;
279 args.justcheck = 0;
280 args.addname = args.oknoent = 1;
281
282 /*
283 * Decide on what work routines to call based on the inode size.
284 */
285 done = 0;
286 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
287 newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
288 if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
289 retval = xfs_dir_shortform_addname(&args);
290 done = 1;
291 } else {
292 if (total == 0)
293 return XFS_ERROR(ENOSPC);
294 retval = xfs_dir_shortform_to_leaf(&args);
295 done = retval != 0;
296 }
297 }
298 if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
299 retval = xfs_dir_leaf_addname(&args);
300 done = retval != ENOSPC;
301 if (!done) {
302 if (total == 0)
303 return XFS_ERROR(ENOSPC);
304 retval = xfs_dir_leaf_to_node(&args);
305 done = retval != 0;
306 }
307 }
308 if (!done) {
309 retval = xfs_dir_node_addname(&args);
310 }
311 return(retval);
312}
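/*
 * Illustrative caller (added commentary; a hedged sketch, not lifted
 * from this file): a create operation typically initializes a bmap
 * free list and calls through the dirops vector inside its
 * transaction.  "tp", "dp", "ip" and "resblks" are hypothetical
 * locals and error handling is elided:
 *
 *	XFS_BMAP_INIT(&free_list, &first_block);
 *	error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
 *				   &first_block, &free_list, resblks);
 */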
313
314/*
315 * Generic handler routine to check if a name can be added to a directory,
316 * without adding any blocks to the directory.
317 */
318static int /* error */
319xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
320{
321 xfs_da_args_t args;
322 int retval, newsize;
323
324 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
325 /*
326 * Fill in the arg structure for this request.
327 */
328 args.name = name;
329 args.namelen = namelen;
330 args.hashval = xfs_da_hashname(name, namelen);
331 args.inumber = 0;
332 args.dp = dp;
333 args.firstblock = NULL;
334 args.flist = NULL;
335 args.total = 0;
336 args.whichfork = XFS_DATA_FORK;
337 args.trans = trans;
338 args.justcheck = args.addname = args.oknoent = 1;
339
340 /*
341 * Decide on what work routines to call based on the inode size.
342 */
343 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
344 newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
345 if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
346 retval = 0;
347 else
348 retval = XFS_ERROR(ENOSPC);
349 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
350 retval = xfs_dir_leaf_addname(&args);
351 } else {
352 retval = xfs_dir_node_addname(&args);
353 }
354 return(retval);
355}
356
357/*
358 * Generic handler routine to remove a name from a directory.
359 * Transitions directory from Btree to shortform as necessary.
360 */
361static int /* error */
362xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
363 int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
364 xfs_bmap_free_t *flist, xfs_extlen_t total)
365{
366 xfs_da_args_t args;
367 int count, totallen, newsize, retval;
368
369 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
370 XFS_STATS_INC(xs_dir_remove);
371 /*
372 * Fill in the arg structure for this request.
373 */
374 args.name = name;
375 args.namelen = namelen;
376 args.hashval = xfs_da_hashname(name, namelen);
377 args.inumber = ino;
378 args.dp = dp;
379 args.firstblock = firstblock;
380 args.flist = flist;
381 args.total = total;
382 args.whichfork = XFS_DATA_FORK;
383 args.trans = trans;
384 args.justcheck = args.addname = args.oknoent = 0;
385
386 /*
387 * Decide on what work routines to call based on the inode size.
388 */
389 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
390 retval = xfs_dir_shortform_removename(&args);
391 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
392 retval = xfs_dir_leaf_removename(&args, &count, &totallen);
393 if (retval == 0) {
394 newsize = XFS_DIR_SF_ALLFIT(count, totallen);
395 if (newsize <= XFS_IFORK_DSIZE(dp)) {
396 retval = xfs_dir_leaf_to_shortform(&args);
397 }
398 }
399 } else {
400 retval = xfs_dir_node_removename(&args);
401 }
402 return(retval);
403}
404
405static int /* error */
406xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
407 xfs_ino_t *inum)
408{
409 xfs_da_args_t args;
410 int retval;
411
412 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
413
414 XFS_STATS_INC(xs_dir_lookup);
415 /*
416 * Fill in the arg structure for this request.
417 */
418 args.name = name;
419 args.namelen = namelen;
420 args.hashval = xfs_da_hashname(name, namelen);
421 args.inumber = 0;
422 args.dp = dp;
423 args.firstblock = NULL;
424 args.flist = NULL;
425 args.total = 0;
426 args.whichfork = XFS_DATA_FORK;
427 args.trans = trans;
428 args.justcheck = args.addname = 0;
429 args.oknoent = 1;
430
431 /*
432 * Decide on what work routines to call based on the inode size.
433 */
434 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
435 retval = xfs_dir_shortform_lookup(&args);
436 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
437 retval = xfs_dir_leaf_lookup(&args);
438 } else {
439 retval = xfs_dir_node_lookup(&args);
440 }
441 if (retval == EEXIST)
442 retval = 0;
443 *inum = args.inumber;
444 return(retval);
445}
446
447/*
448 * Implement readdir.
449 */
450static int /* error */
451xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
452{
453 xfs_dirent_t *dbp;
454 int alignment, retval;
455 xfs_dir_put_t put;
456
457 XFS_STATS_INC(xs_dir_getdents);
458 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
459
460 /*
461 * If our caller has given us a single contiguous aligned memory buffer,
462 * just work directly within that buffer. If it's in user memory,
463 * lock it down first.
464 */
465 alignment = sizeof(xfs_off_t) - 1;
466 if ((uio->uio_iovcnt == 1) &&
467 (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
468 ((uio->uio_iov[0].iov_len & alignment) == 0)) {
469 dbp = NULL;
470 put = xfs_dir_put_dirent64_direct;
471 } else {
472 dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
473 put = xfs_dir_put_dirent64_uio;
474 }
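	/*
	 * Illustration of the alignment test above (added commentary):
	 * with sizeof(xfs_off_t) == 8, "alignment" is the mask 0x7, so
	 * an iov_base of 0x1000 qualifies ((0x1000 & 0x7) == 0) while
	 * one of 0x1003 falls back to the bounce buffer and the uio
	 * copy routine.
	 */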
475
476 /*
477 * Decide on what work routines to call based on the inode size.
478 */
479 *eofp = 0;
480
481 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
482 retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
483 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
484 retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
485 } else {
486 retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
487 }
488 if (dbp != NULL)
489 kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
490
491 return(retval);
492}
493
494static int /* error */
495xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
496 xfs_ino_t inum, xfs_fsblock_t *firstblock,
497 xfs_bmap_free_t *flist, xfs_extlen_t total)
498{
499 xfs_da_args_t args;
500 int retval;
501
502 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
503
504 if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
505 return retval;
506
507 /*
508 * Fill in the arg structure for this request.
509 */
510 args.name = name;
511 args.namelen = namelen;
512 args.hashval = xfs_da_hashname(name, namelen);
513 args.inumber = inum;
514 args.dp = dp;
515 args.firstblock = firstblock;
516 args.flist = flist;
517 args.total = total;
518 args.whichfork = XFS_DATA_FORK;
519 args.trans = trans;
520 args.justcheck = args.addname = args.oknoent = 0;
521
522 /*
523 * Decide on what work routines to call based on the inode size.
524 */
525 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
526 retval = xfs_dir_shortform_replace(&args);
527 } else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
528 retval = xfs_dir_leaf_replace(&args);
529 } else {
530 retval = xfs_dir_node_replace(&args);
531 }
532
533 return(retval);
534}
535
536static int
537xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
538{
539 xfs_ino_t ino;
540 int namelen_sum;
541 int count;
542 xfs_dir_shortform_t *sf;
543 xfs_dir_sf_entry_t *sfe;
544 int i;
545
546
547
548 if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) {
549 return 0;
550 }
551 if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
552 return 0;
553 }
554 if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
555 xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p",
556 dp);
557 return 1;
558 }
559 sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
560 ino = XFS_GET_DIR_INO8(sf->hdr.parent);
561 if (xfs_dir_ino_validate(mp, ino))
562 return 1;
563
564 count = sf->hdr.count;
565 if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
566 xfs_fs_cmn_err(CE_WARN, mp,
567 "Invalid shortform count: dp 0x%p", dp);
568 return(1);
569 }
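	/*
	 * Added note: the "count * 10" bound above works because a
	 * shortform entry is at least sizeof(xfs_dir_sf_entry_t) bytes:
	 * an 8-byte inumber, a namelen byte and at least one name byte,
	 * i.e. 10, so more than XFS_LITINO(mp) / 10 entries can never
	 * fit in the inode literal area.
	 */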
570
571 if (count == 0) {
572 return 0;
573 }
574
575 namelen_sum = 0;
576 sfe = &sf->list[0];
577 for (i = sf->hdr.count - 1; i >= 0; i--) {
578 ino = XFS_GET_DIR_INO8(sfe->inumber);
579 xfs_dir_ino_validate(mp, ino);
580 if (sfe->namelen >= XFS_LITINO(mp)) {
581 xfs_fs_cmn_err(CE_WARN, mp,
582 "Invalid shortform namelen: dp 0x%p", dp);
583 return 1;
584 }
585 namelen_sum += sfe->namelen;
586 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
587 }
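	/*
	 * Added note: unlike the parent-inode check above, the return
	 * value of the per-entry xfs_dir_ino_validate() call in this
	 * loop is discarded; only namelen problems fail the scan here.
	 */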
588 if (namelen_sum >= XFS_LITINO(mp)) {
589 xfs_fs_cmn_err(CE_WARN, mp,
590 "Invalid shortform namelen: dp 0x%p", dp);
591 return 1;
592 }
593
594 return 0;
595}
596
597/*========================================================================
598 * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
599 *========================================================================*/
600
601/*
602 * Add a name to the leaf directory structure
603 * This is the external routine.
604 */
605int
606xfs_dir_leaf_addname(xfs_da_args_t *args)
607{
608 int index, retval;
609 xfs_dabuf_t *bp;
610
611 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
612 XFS_DATA_FORK);
613 if (retval)
614 return(retval);
615 ASSERT(bp != NULL);
616
617 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
618 if (retval == ENOENT)
619 retval = xfs_dir_leaf_add(bp, args, index);
620 xfs_da_buf_done(bp);
621 return(retval);
622}
623
624/*
625 * Remove a name from the leaf directory structure
626 * This is the external routine.
627 */
628STATIC int
629xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
630{
631 xfs_dir_leafblock_t *leaf;
632 int index, retval;
633 xfs_dabuf_t *bp;
634
635 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
636 XFS_DATA_FORK);
637 if (retval)
638 return(retval);
639 ASSERT(bp != NULL);
640 leaf = bp->data;
641 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
642 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
643 if (retval == EEXIST) {
644 (void)xfs_dir_leaf_remove(args->trans, bp, index);
645 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
646 *totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
647 retval = 0;
648 }
649 xfs_da_buf_done(bp);
650 return(retval);
651}
652
653/*
654 * Look up a name in a leaf directory structure.
655 * This is the external routine.
656 */
657STATIC int
658xfs_dir_leaf_lookup(xfs_da_args_t *args)
659{
660 int index, retval;
661 xfs_dabuf_t *bp;
662
663 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
664 XFS_DATA_FORK);
665 if (retval)
666 return(retval);
667 ASSERT(bp != NULL);
668 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
669 xfs_da_brelse(args->trans, bp);
670 return(retval);
671}
672
673/*
674 * Copy out directory entries for getdents(), for leaf directories.
675 */
676STATIC int
677xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
678 int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
679{
680 xfs_dabuf_t *bp;
681 int retval, eob;
682
683 retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
684 if (retval)
685 return(retval);
686 ASSERT(bp != NULL);
687 retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
688 xfs_da_brelse(trans, bp);
689 *eofp = (eob == 0);
690 return(retval);
691}
692
693/*
694 * Look up a name in a leaf directory structure, replace the inode number.
695 * This is the external routine.
696 */
697STATIC int
698xfs_dir_leaf_replace(xfs_da_args_t *args)
699{
700 int index, retval;
701 xfs_dabuf_t *bp;
702 xfs_ino_t inum;
703 xfs_dir_leafblock_t *leaf;
704 xfs_dir_leaf_entry_t *entry;
705 xfs_dir_leaf_name_t *namest;
706
707 inum = args->inumber;
708 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
709 XFS_DATA_FORK);
710 if (retval)
711 return(retval);
712 ASSERT(bp != NULL);
713 retval = xfs_dir_leaf_lookup_int(bp, args, &index);
714 if (retval == EEXIST) {
715 leaf = bp->data;
716 entry = &leaf->entries[index];
717 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
718 /* XXX - replace assert? */
719 XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
720 xfs_da_log_buf(args->trans, bp,
721 XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
722 xfs_da_buf_done(bp);
723 retval = 0;
724 } else
725 xfs_da_brelse(args->trans, bp);
726 return(retval);
727}
728
729
730/*========================================================================
731 * External routines when dirsize > XFS_LBSIZE(mp).
732 *========================================================================*/
733
734/*
735 * Add a name to a Btree-format directory.
736 *
737 * This will involve walking down the Btree, and may involve splitting
738 * leaf nodes and even splitting intermediate nodes up to and including
739 * the root node (a special case of an intermediate node).
740 */
741STATIC int
742xfs_dir_node_addname(xfs_da_args_t *args)
743{
744 xfs_da_state_t *state;
745 xfs_da_state_blk_t *blk;
746 int retval, error;
747
748 /*
749 * Fill in bucket of arguments/results/context to carry around.
750 */
751 state = xfs_da_state_alloc();
752 state->args = args;
753 state->mp = args->dp->i_mount;
754 state->blocksize = state->mp->m_sb.sb_blocksize;
755 state->node_ents = state->mp->m_dir_node_ents;
756
757 /*
758 * Search to see if name already exists, and get back a pointer
759 * to where it should go.
760 */
761 error = xfs_da_node_lookup_int(state, &retval);
762 if (error)
763 retval = error;
764 if (retval != ENOENT)
765 goto error;
766 blk = &state->path.blk[ state->path.active-1 ];
767 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
768 retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
769 if (retval == 0) {
770 /*
771 * Addition succeeded, update Btree hashvals.
772 */
773 if (!args->justcheck)
774 xfs_da_fixhashpath(state, &state->path);
775 } else {
776 /*
777 * Addition failed, split as many Btree elements as required.
778 */
779 if (args->total == 0) {
780 ASSERT(retval == ENOSPC);
781 goto error;
782 }
783 retval = xfs_da_split(state);
784 }
785error:
786 xfs_da_state_free(state);
787
788 return(retval);
789}
790
791/*
792 * Remove a name from a Btree directory.
793 *
794 * This will involve walking down the Btree, and may involve joining
795 * leaf nodes and even joining intermediate nodes up to and including
796 * the root node (a special case of an intermediate node).
797 */
798STATIC int
799xfs_dir_node_removename(xfs_da_args_t *args)
800{
801 xfs_da_state_t *state;
802 xfs_da_state_blk_t *blk;
803 int retval, error;
804
805 state = xfs_da_state_alloc();
806 state->args = args;
807 state->mp = args->dp->i_mount;
808 state->blocksize = state->mp->m_sb.sb_blocksize;
809 state->node_ents = state->mp->m_dir_node_ents;
810
811 /*
812 * Search to see if name exists, and get back a pointer to it.
813 */
814 error = xfs_da_node_lookup_int(state, &retval);
815 if (error)
816 retval = error;
817 if (retval != EEXIST) {
818 xfs_da_state_free(state);
819 return(retval);
820 }
821
822 /*
823 * Remove the name and update the hashvals in the tree.
824 */
825 blk = &state->path.blk[ state->path.active-1 ];
826 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
827 retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
828 xfs_da_fixhashpath(state, &state->path);
829
830 /*
831 * Check to see if the tree needs to be collapsed.
832 */
833 error = 0;
834 if (retval) {
835 error = xfs_da_join(state);
836 }
837
838 xfs_da_state_free(state);
839 if (error)
840 return(error);
841 return(0);
842}
843
844/*
845 * Look up a filename in an int (node-format) directory.
846 * Use an internal routine to actually do all the work.
847 */
848STATIC int
849xfs_dir_node_lookup(xfs_da_args_t *args)
850{
851 xfs_da_state_t *state;
852 int retval, error, i;
853
854 state = xfs_da_state_alloc();
855 state->args = args;
856 state->mp = args->dp->i_mount;
857 state->blocksize = state->mp->m_sb.sb_blocksize;
858 state->node_ents = state->mp->m_dir_node_ents;
859
860 /*
861 * Search to see if name exists,
862 * and get back a pointer to it.
863 */
864 error = xfs_da_node_lookup_int(state, &retval);
865 if (error) {
866 retval = error;
867 }
868
869 /*
870 * If not in a transaction, we have to release all the buffers.
871 */
872 for (i = 0; i < state->path.active; i++) {
873 xfs_da_brelse(args->trans, state->path.blk[i].bp);
874 state->path.blk[i].bp = NULL;
875 }
876
877 xfs_da_state_free(state);
878 return(retval);
879}
880
881STATIC int
882xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
883 int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
884{
885 xfs_da_intnode_t *node;
886 xfs_da_node_entry_t *btree;
887 xfs_dir_leafblock_t *leaf = NULL;
888 xfs_dablk_t bno, nextbno;
889 xfs_dahash_t cookhash;
890 xfs_mount_t *mp;
891 int error, eob, i;
892 xfs_dabuf_t *bp;
893 xfs_daddr_t nextda;
894
895 /*
896 * Pick up our context.
897 */
898 mp = dp->i_mount;
899 bp = NULL;
900 bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
901 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
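	/*
	 * Added note: the 64-bit uio_offset doubles as the seek cookie.
	 * XFS_DA_MAKE_COOKIE() packs a leaf block number, an entry
	 * index and a hashval into it, and the two macros above unpack
	 * the pieces so we can try to resume exactly where the previous
	 * getdents call stopped.
	 */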
902
903 xfs_dir_trace_g_du("node: start", dp, uio);
904
905 /*
906 * Re-find our place, even if we're confused about what our place is.
907 *
908 * First we check the block number from the magic cookie, it is a
909 * cache of where we ended last time. If we find a leaf block, and
910 * the starting hashval in that block is less than our desired
911 * hashval, then we run with it.
912 */
913 if (bno > 0) {
914 error = xfs_da_read_buf(trans, dp, bno, -2, &bp, XFS_DATA_FORK);
915 if ((error != 0) && (error != EFSCORRUPTED))
916 return(error);
917 if (bp)
918 leaf = bp->data;
919 if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
920 xfs_dir_trace_g_dub("node: block not a leaf",
921 dp, uio, bno);
922 xfs_da_brelse(trans, bp);
923 bp = NULL;
924 }
925 if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
926 xfs_dir_trace_g_dub("node: leaf hash too large",
927 dp, uio, bno);
928 xfs_da_brelse(trans, bp);
929 bp = NULL;
930 }
931 if (bp &&
932 cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
933 xfs_dir_trace_g_dub("node: leaf hash too small",
934 dp, uio, bno);
935 xfs_da_brelse(trans, bp);
936 bp = NULL;
937 }
938 }
939
940 /*
941 * If we did not find a leaf block from the blockno in the cookie,
942 * or there was no blockno in the cookie (eg: first time thru),
943 * then we start at the top of the Btree and re-find our hashval.
944 */
945 if (bp == NULL) {
946 xfs_dir_trace_g_du("node: start at root" , dp, uio);
947 bno = 0;
948 for (;;) {
949 error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
950 XFS_DATA_FORK);
951 if (error)
952 return(error);
953 if (bp == NULL)
954 return(XFS_ERROR(EFSCORRUPTED));
955 node = bp->data;
956 if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
957 break;
958 btree = &node->btree[0];
959 xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
960 for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
961 if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
962 bno = INT_GET(btree->before, ARCH_CONVERT);
963 break;
964 }
965 }
966 if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
967 xfs_da_brelse(trans, bp);
968 xfs_dir_trace_g_du("node: hash beyond EOF",
969 dp, uio);
970 uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
971 XFS_DA_MAXHASH);
972 *eofp = 1;
973 return(0);
974 }
975 xfs_dir_trace_g_dub("node: going to block",
976 dp, uio, bno);
977 xfs_da_brelse(trans, bp);
978 }
979 }
980 ASSERT(cookhash != XFS_DA_MAXHASH);
981
982 /*
983 * We've dropped down to the (first) leaf block that contains the
984 * hashval we are interested in. Continue rolling upward thru the
985 * leaf blocks until we fill up our buffer.
986 */
987 for (;;) {
988 leaf = bp->data;
989 if (unlikely(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC)) {
990 xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
991 xfs_da_brelse(trans, bp);
992 XFS_CORRUPTION_ERROR("xfs_dir_node_getdents(1)",
993 XFS_ERRLEVEL_LOW, mp, leaf);
994 return XFS_ERROR(EFSCORRUPTED);
995 }
996 xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
997 if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
998 nextda = xfs_da_reada_buf(trans, dp, nextbno,
999 XFS_DATA_FORK);
1000 } else
1001 nextda = -1;
1002 error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
1003 put, nextda);
1004 xfs_da_brelse(trans, bp);
1005 bno = nextbno;
1006 if (eob) {
1007 xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
1008 *eofp = 0;
1009 return(error);
1010 }
1011 if (bno == 0)
1012 break;
1013 error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
1014 XFS_DATA_FORK);
1015 if (error)
1016 return(error);
1017 if (unlikely(bp == NULL)) {
1018 XFS_ERROR_REPORT("xfs_dir_node_getdents(2)",
1019 XFS_ERRLEVEL_LOW, mp);
1020 return(XFS_ERROR(EFSCORRUPTED));
1021 }
1022 }
1023 *eofp = 1;
1024 xfs_dir_trace_g_du("node: E-O-F", dp, uio);
1025 return(0);
1026}
1027
1028/*
1029 * Look up a filename in an int (node-format) directory, replace the
1030 * inode number.
1030 * Use an internal routine to actually do the lookup.
1031 */
1032STATIC int
1033xfs_dir_node_replace(xfs_da_args_t *args)
1034{
1035 xfs_da_state_t *state;
1036 xfs_da_state_blk_t *blk;
1037 xfs_dir_leafblock_t *leaf;
1038 xfs_dir_leaf_entry_t *entry;
1039 xfs_dir_leaf_name_t *namest;
1040 xfs_ino_t inum;
1041 int retval, error, i;
1042 xfs_dabuf_t *bp;
1043
1044 state = xfs_da_state_alloc();
1045 state->args = args;
1046 state->mp = args->dp->i_mount;
1047 state->blocksize = state->mp->m_sb.sb_blocksize;
1048 state->node_ents = state->mp->m_dir_node_ents;
1049 inum = args->inumber;
1050
1051 /*
1052 * Search to see if name exists,
1053 * and get back a pointer to it.
1054 */
1055 error = xfs_da_node_lookup_int(state, &retval);
1056 if (error) {
1057 retval = error;
1058 }
1059
1060 if (retval == EEXIST) {
1061 blk = &state->path.blk[state->path.active - 1];
1062 ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
1063 bp = blk->bp;
1064 leaf = bp->data;
1065 entry = &leaf->entries[blk->index];
1066 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1067 /* XXX - replace assert ? */
1068 XFS_DIR_SF_PUT_DIRINO(&inum, &namest->inumber);
1069 xfs_da_log_buf(args->trans, bp,
1070 XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
1071 xfs_da_buf_done(bp);
1072 blk->bp = NULL;
1073 retval = 0;
1074 } else {
1075 i = state->path.active - 1;
1076 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1077 state->path.blk[i].bp = NULL;
1078 }
1079 for (i = 0; i < state->path.active - 1; i++) {
1080 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1081 state->path.blk[i].bp = NULL;
1082 }
1083
1084 xfs_da_state_free(state);
1085 return(retval);
1086}
1087
1088#if defined(XFS_DIR_TRACE)
1089/*
1090 * Add a trace buffer entry for an inode and a uio.
1091 */
1092void
1093xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
1094{
1095 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
1096 (void *)dp, (void *)dp->i_mount,
1097 (void *)((unsigned long)(uio->uio_offset >> 32)),
1098 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1099 (void *)(unsigned long)uio->uio_resid,
1100 NULL, NULL, NULL, NULL, NULL, NULL, NULL);
1101}
1102
1103/*
1104 * Add a trace buffer entry for an inode and a uio.
1105 */
1106void
1107xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
1108{
1109 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
1110 (void *)dp, (void *)dp->i_mount,
1111 (void *)((unsigned long)(uio->uio_offset >> 32)),
1112 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1113 (void *)(unsigned long)uio->uio_resid,
1114 (void *)(unsigned long)bno,
1115 NULL, NULL, NULL, NULL, NULL, NULL);
1116}
1117
1118/*
1119 * Add a trace buffer entry for an inode and a uio.
1120 */
1121void
1122xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
1123 xfs_da_intnode_t *node)
1124{
1125 int last = INT_GET(node->hdr.count, ARCH_CONVERT) - 1;
1126
1127 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
1128 (void *)dp, (void *)dp->i_mount,
1129 (void *)((unsigned long)(uio->uio_offset >> 32)),
1130 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1131 (void *)(unsigned long)uio->uio_resid,
1132 (void *)(unsigned long)
1133 INT_GET(node->hdr.info.forw, ARCH_CONVERT),
1134 (void *)(unsigned long)
1135 INT_GET(node->hdr.count, ARCH_CONVERT),
1136 (void *)(unsigned long)
1137 INT_GET(node->btree[0].hashval, ARCH_CONVERT),
1138 (void *)(unsigned long)
1139 INT_GET(node->btree[last].hashval, ARCH_CONVERT),
1140 NULL, NULL, NULL);
1141}
1142
1143/*
1144 * Add a trace buffer entry for an inode and a uio.
1145 */
1146void
1147xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
1148 xfs_dir_leafblock_t *leaf)
1149{
1150 int last = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
1151
1152 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
1153 (void *)dp, (void *)dp->i_mount,
1154 (void *)((unsigned long)(uio->uio_offset >> 32)),
1155 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1156 (void *)(unsigned long)uio->uio_resid,
1157 (void *)(unsigned long)
1158 INT_GET(leaf->hdr.info.forw, ARCH_CONVERT),
1159 (void *)(unsigned long)
1160 INT_GET(leaf->hdr.count, ARCH_CONVERT),
1161 (void *)(unsigned long)
1162 INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
1163 (void *)(unsigned long)
1164 INT_GET(leaf->entries[last].hashval, ARCH_CONVERT),
1165 NULL, NULL, NULL);
1166}
1167
1168/*
1169 * Add a trace buffer entry for an inode and a uio.
1170 */
1171void
1172xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
1173 xfs_dir_leaf_entry_t *entry)
1174{
1175 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
1176 (void *)dp, (void *)dp->i_mount,
1177 (void *)((unsigned long)(uio->uio_offset >> 32)),
1178 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1179 (void *)(unsigned long)uio->uio_resid,
1180 (void *)(unsigned long)
1181 INT_GET(entry->hashval, ARCH_CONVERT),
1182 NULL, NULL, NULL, NULL, NULL, NULL);
1183}
1184
1185/*
1186 * Add a trace buffer entry for an inode and a uio.
1187 */
1188void
1189xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
1190{
1191 xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
1192 (void *)dp, (void *)dp->i_mount,
1193 (void *)((unsigned long)(uio->uio_offset >> 32)),
1194 (void *)((unsigned long)(uio->uio_offset & 0xFFFFFFFF)),
1195 (void *)(unsigned long)uio->uio_resid,
1196 (void *)((unsigned long)(cookie >> 32)),
1197 (void *)((unsigned long)(cookie & 0xFFFFFFFF)),
1198 NULL, NULL, NULL, NULL, NULL);
1199}
1200
1201/*
1202 * Add a trace buffer entry for the arguments given to the routine,
1203 * generic form.
1204 */
1205void
1206xfs_dir_trace_enter(int type, char *where,
1207 void * a0, void * a1,
1208 void * a2, void * a3,
1209 void * a4, void * a5,
1210 void * a6, void * a7,
1211 void * a8, void * a9,
1212 void * a10, void * a11)
1213{
1214 ASSERT(xfs_dir_trace_buf);
1215 ktrace_enter(xfs_dir_trace_buf, (void *)(unsigned long)type,
1216 (void *)where,
1217 (void *)a0, (void *)a1, (void *)a2,
1218 (void *)a3, (void *)a4, (void *)a5,
1219 (void *)a6, (void *)a7, (void *)a8,
1220 (void *)a9, (void *)a10, (void *)a11,
1221 NULL, NULL);
1222}
1223#endif /* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
new file mode 100644
index 000000000000..4dbc9f54cca5
--- /dev/null
+++ b/fs/xfs/xfs_dir.h
@@ -0,0 +1,154 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_H__
33#define __XFS_DIR_H__
34
35/*
36 * Large directories are structured around Btrees where all the data
37 * elements are in the leaf nodes. Filenames are hashed into an int,
38 * then that int is used as the index into the Btree. Since the hashval
39 * of a filename may not be unique, we may have duplicate keys. The
40 * internal links in the Btree are logical block offsets into the file.
41 *
42 * Small directories use a different format and are packed as tightly
43 * as possible so as to fit into the literal area of the inode.
44 */
45
46/*========================================================================
47 * Function prototypes for the kernel.
48 *========================================================================*/
49
50struct uio;
51struct xfs_bmap_free;
52struct xfs_da_args;
53struct xfs_dinode;
54struct xfs_inode;
55struct xfs_mount;
56struct xfs_trans;
57
58/*
59 * Directory function types.
60 * Put in structures (xfs_dirops_t) for v1 and v2 directories.
61 */
62typedef void (*xfs_dir_mount_t)(struct xfs_mount *mp);
63typedef int (*xfs_dir_isempty_t)(struct xfs_inode *dp);
64typedef int (*xfs_dir_init_t)(struct xfs_trans *tp,
65 struct xfs_inode *dp,
66 struct xfs_inode *pdp);
67typedef int (*xfs_dir_createname_t)(struct xfs_trans *tp,
68 struct xfs_inode *dp,
69 char *name,
70 int namelen,
71 xfs_ino_t inum,
72 xfs_fsblock_t *first,
73 struct xfs_bmap_free *flist,
74 xfs_extlen_t total);
75typedef int (*xfs_dir_lookup_t)(struct xfs_trans *tp,
76 struct xfs_inode *dp,
77 char *name,
78 int namelen,
79 xfs_ino_t *inum);
80typedef int (*xfs_dir_removename_t)(struct xfs_trans *tp,
81 struct xfs_inode *dp,
82 char *name,
83 int namelen,
84 xfs_ino_t ino,
85 xfs_fsblock_t *first,
86 struct xfs_bmap_free *flist,
87 xfs_extlen_t total);
88typedef int (*xfs_dir_getdents_t)(struct xfs_trans *tp,
89 struct xfs_inode *dp,
90 struct uio *uio,
91 int *eofp);
92typedef int (*xfs_dir_replace_t)(struct xfs_trans *tp,
93 struct xfs_inode *dp,
94 char *name,
95 int namelen,
96 xfs_ino_t inum,
97 xfs_fsblock_t *first,
98 struct xfs_bmap_free *flist,
99 xfs_extlen_t total);
100typedef int (*xfs_dir_canenter_t)(struct xfs_trans *tp,
101 struct xfs_inode *dp,
102 char *name,
103 int namelen);
104typedef int (*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
105 struct xfs_dinode *dip);
106typedef int (*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
107
108typedef struct xfs_dirops {
109 xfs_dir_mount_t xd_mount;
110 xfs_dir_isempty_t xd_isempty;
111 xfs_dir_init_t xd_init;
112 xfs_dir_createname_t xd_createname;
113 xfs_dir_lookup_t xd_lookup;
114 xfs_dir_removename_t xd_removename;
115 xfs_dir_getdents_t xd_getdents;
116 xfs_dir_replace_t xd_replace;
117 xfs_dir_canenter_t xd_canenter;
118 xfs_dir_shortform_validate_ondisk_t xd_shortform_validate_ondisk;
119 xfs_dir_shortform_to_single_t xd_shortform_to_single;
120} xfs_dirops_t;
121
122/*
123 * Overall external interface routines.
124 */
125void xfs_dir_startup(void); /* called exactly once */
126
127#define XFS_DIR_MOUNT(mp) \
128 ((mp)->m_dirops.xd_mount(mp))
129#define XFS_DIR_ISEMPTY(mp,dp) \
130 ((mp)->m_dirops.xd_isempty(dp))
131#define XFS_DIR_INIT(mp,tp,dp,pdp) \
132 ((mp)->m_dirops.xd_init(tp,dp,pdp))
133#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
134 ((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
135 total))
136#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum) \
137 ((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
138#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
139 ((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
140#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp) \
141 ((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
142#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total) \
143 ((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
144#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
145 ((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
146#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip) \
147 ((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
148#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args) \
149 ((mp)->m_dirops.xd_shortform_to_single(args))
150
151#define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1)
152extern xfs_dirops_t xfsv1_dirops;
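/*
 * Illustrative use of the dispatch macros (added commentary; names are
 * hypothetical locals):
 *
 *	if (!XFS_DIR_ISEMPTY(mp, dp))
 *		return XFS_ERROR(EEXIST);
 *	error = XFS_DIR_INIT(mp, tp, dp, pdp);
 *
 * Callers go through the per-mount vector so the same code serves v1
 * and v2 directories without testing m_dirversion at each call site.
 */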
153
154#endif /* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
new file mode 100644
index 000000000000..49fc0a3695ae
--- /dev/null
+++ b/fs/xfs/xfs_dir2.c
@@ -0,0 +1,859 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * XFS v2 directory implementation.
35 * Top-level and utility routines.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_ag.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_da_btree.h"
61#include "xfs_dir_leaf.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_dir2_trace.h"
67#include "xfs_error.h"
68#include "xfs_bit.h"
69
70/*
71 * Declarations for interface routines.
72 */
73static void xfs_dir2_mount(xfs_mount_t *mp);
74static int xfs_dir2_isempty(xfs_inode_t *dp);
75static int xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
76 xfs_inode_t *pdp);
77static int xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
78 char *name, int namelen, xfs_ino_t inum,
79 xfs_fsblock_t *first,
80 xfs_bmap_free_t *flist, xfs_extlen_t total);
81static int xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
82 int namelen, xfs_ino_t *inum);
83static int xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
84 char *name, int namelen, xfs_ino_t ino,
85 xfs_fsblock_t *first,
86 xfs_bmap_free_t *flist, xfs_extlen_t total);
87static int xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
88 int *eofp);
89static int xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
90 int namelen, xfs_ino_t inum,
91 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
92 xfs_extlen_t total);
93static int xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
94 int namelen);
95static int xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
96 xfs_dinode_t *dip);
97
98/*
99 * Utility routine declarations.
100 */
101static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
102static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
103
104/*
105 * Directory operations vector.
106 */
107xfs_dirops_t xfsv2_dirops = {
108 .xd_mount = xfs_dir2_mount,
109 .xd_isempty = xfs_dir2_isempty,
110 .xd_init = xfs_dir2_init,
111 .xd_createname = xfs_dir2_createname,
112 .xd_lookup = xfs_dir2_lookup,
113 .xd_removename = xfs_dir2_removename,
114 .xd_getdents = xfs_dir2_getdents,
115 .xd_replace = xfs_dir2_replace,
116 .xd_canenter = xfs_dir2_canenter,
117 .xd_shortform_validate_ondisk = xfs_dir2_shortform_validate_ondisk,
118 .xd_shortform_to_single = xfs_dir2_sf_to_block,
119};
120
121/*
122 * Interface routines.
123 */
124
125/*
126 * Initialize directory-related fields in the mount structure.
127 */
128static void
129xfs_dir2_mount(
130 xfs_mount_t *mp) /* filesystem mount point */
131{
132 mp->m_dirversion = 2;
133 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
134 XFS_MAX_BLOCKSIZE);
135 mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
136 mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
137 mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp));
138 mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
139 mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp));
140 mp->m_attr_node_ents =
141 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) /
142 (uint)sizeof(xfs_da_node_entry_t);
143 mp->m_dir_node_ents =
144 (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
145 (uint)sizeof(xfs_da_node_entry_t);
146 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
147}
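/*
 * Worked example (illustrative numbers, added commentary): with
 * sb_blocklog == 12 (4KB filesystem blocks) and sb_dirblklog == 2,
 * m_dirblksize is 1 << (12 + 2) == 16KB and each directory block
 * spans m_dirblkfsbs == 1 << 2 == 4 filesystem blocks.
 */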
148
149/*
150 * Return 1 if directory contains only "." and "..".
151 */
152static int /* return code */
153xfs_dir2_isempty(
154 xfs_inode_t *dp) /* incore inode structure */
155{
156 xfs_dir2_sf_t *sfp; /* shortform directory structure */
157
158 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
159 /*
160 * Might happen during shutdown.
161 */
162 if (dp->i_d.di_size == 0) {
163 return 1;
164 }
165 if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
166 return 0;
167 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
168 return !sfp->hdr.count;
169}
170
171/*
172 * Initialize a directory with its "." and ".." entries.
173 */
174static int /* error */
175xfs_dir2_init(
176 xfs_trans_t *tp, /* transaction pointer */
177 xfs_inode_t *dp, /* incore directory inode */
178 xfs_inode_t *pdp) /* incore parent directory inode */
179{
180 xfs_da_args_t args; /* operation arguments */
181 int error; /* error return value */
182
183 memset((char *)&args, 0, sizeof(args));
184 args.dp = dp;
185 args.trans = tp;
186 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
187 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
188 return error;
189 }
190 return xfs_dir2_sf_create(&args, pdp->i_ino);
191}
192
193/*
194 * Enter a name in a directory.
195 */
196static int /* error */
197xfs_dir2_createname(
198 xfs_trans_t *tp, /* transaction pointer */
199 xfs_inode_t *dp, /* incore directory inode */
200 char *name, /* new entry name */
201 int namelen, /* new entry name length */
202 xfs_ino_t inum, /* new entry inode number */
203 xfs_fsblock_t *first, /* bmap's firstblock */
204 xfs_bmap_free_t *flist, /* bmap's freeblock list */
205 xfs_extlen_t total) /* bmap's total block count */
206{
207 xfs_da_args_t args; /* operation arguments */
208 int rval; /* return value */
209 int v; /* type-checking value */
210
211 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
212 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
213 return rval;
214 }
215 XFS_STATS_INC(xs_dir_create);
216 /*
217 * Fill in the arg structure for this request.
218 */
219 args.name = name;
220 args.namelen = namelen;
221 args.hashval = xfs_da_hashname(name, namelen);
222 args.inumber = inum;
223 args.dp = dp;
224 args.firstblock = first;
225 args.flist = flist;
226 args.total = total;
227 args.whichfork = XFS_DATA_FORK;
228 args.trans = tp;
229 args.justcheck = 0;
230 args.addname = args.oknoent = 1;
231 /*
232 * Decide on what work routines to call based on the inode size.
233 */
234 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
235 rval = xfs_dir2_sf_addname(&args);
236 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
237 return rval;
238 } else if (v)
239 rval = xfs_dir2_block_addname(&args);
240 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
241 return rval;
242 } else if (v)
243 rval = xfs_dir2_leaf_addname(&args);
244 else
245 rval = xfs_dir2_node_addname(&args);
246 return rval;
247}
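/*
 * Added note: the cascade above is the v2 format ladder -- shortform
 * (inline in the inode literal area), then a single combined data
 * block, then leaf blocks with a separate data area, then a full
 * node/Btree structure.  The same shape repeats in the lookup,
 * removename, getdents, replace and canenter routines below.
 */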
248
249/*
250 * Lookup a name in a directory, give back the inode number.
251 */
252static int /* error */
253xfs_dir2_lookup(
254 xfs_trans_t *tp, /* transaction pointer */
255 xfs_inode_t *dp, /* incore directory inode */
256 char *name, /* lookup name */
257 int namelen, /* lookup name length */
258 xfs_ino_t *inum) /* out: inode number */
259{
260 xfs_da_args_t args; /* operation arguments */
261 int rval; /* return value */
262 int v; /* type-checking value */
263
264 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
265 XFS_STATS_INC(xs_dir_lookup);
266
267 /*
268 * Fill in the arg structure for this request.
269 */
270 args.name = name;
271 args.namelen = namelen;
272 args.hashval = xfs_da_hashname(name, namelen);
273 args.inumber = 0;
274 args.dp = dp;
275 args.firstblock = NULL;
276 args.flist = NULL;
277 args.total = 0;
278 args.whichfork = XFS_DATA_FORK;
279 args.trans = tp;
280 args.justcheck = args.addname = 0;
281 args.oknoent = 1;
282 /*
283 * Decide on what work routines to call based on the inode size.
284 */
285 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
286 rval = xfs_dir2_sf_lookup(&args);
287 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
288 return rval;
289 } else if (v)
290 rval = xfs_dir2_block_lookup(&args);
291 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
292 return rval;
293 } else if (v)
294 rval = xfs_dir2_leaf_lookup(&args);
295 else
296 rval = xfs_dir2_node_lookup(&args);
297 if (rval == EEXIST)
298 rval = 0;
299 if (rval == 0)
300 *inum = args.inumber;
301 return rval;
302}
303
304/*
305 * Remove an entry from a directory.
306 */
307static int /* error */
308xfs_dir2_removename(
309 xfs_trans_t *tp, /* transaction pointer */
310 xfs_inode_t *dp, /* incore directory inode */
311 char *name, /* name of entry to remove */
312 int namelen, /* name length of entry to remove */
313 xfs_ino_t ino, /* inode number of entry to remove */
314 xfs_fsblock_t *first, /* bmap's firstblock */
315 xfs_bmap_free_t *flist, /* bmap's freeblock list */
316 xfs_extlen_t total) /* bmap's total block count */
317{
318 xfs_da_args_t args; /* operation arguments */
319 int rval; /* return value */
320 int v; /* type-checking value */
321
322 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
323 XFS_STATS_INC(xs_dir_remove);
324 /*
325 * Fill in the arg structure for this request.
326 */
327 args.name = name;
328 args.namelen = namelen;
329 args.hashval = xfs_da_hashname(name, namelen);
330 args.inumber = ino;
331 args.dp = dp;
332 args.firstblock = first;
333 args.flist = flist;
334 args.total = total;
335 args.whichfork = XFS_DATA_FORK;
336 args.trans = tp;
337 args.justcheck = args.addname = args.oknoent = 0;
338 /*
339 * Decide on what work routines to call based on the inode size.
340 */
341 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
342 rval = xfs_dir2_sf_removename(&args);
343 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
344 return rval;
345 } else if (v)
346 rval = xfs_dir2_block_removename(&args);
347 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
348 return rval;
349 } else if (v)
350 rval = xfs_dir2_leaf_removename(&args);
351 else
352 rval = xfs_dir2_node_removename(&args);
353 return rval;
354}
355
356/*
357 * Read a directory.
358 */
359static int /* error */
360xfs_dir2_getdents(
361 xfs_trans_t *tp, /* transaction pointer */
362 xfs_inode_t *dp, /* incore directory inode */
363 uio_t *uio, /* caller's buffer control */
364 int *eofp) /* out: eof reached */
365{
366 int alignment; /* alignment required for ABI */
367 xfs_dirent_t *dbp; /* malloc'ed buffer */
368 xfs_dir2_put_t put; /* entry formatting routine */
369 int rval; /* return value */
370 int v; /* type-checking value */
371
372 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
373 XFS_STATS_INC(xs_dir_getdents);
374 /*
375 * If our caller has given us a single contiguous aligned memory buffer,
376 * just work directly within that buffer. If it's in user memory,
377 * lock it down first.
378 */
379 alignment = sizeof(xfs_off_t) - 1;
380 if ((uio->uio_iovcnt == 1) &&
381 (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
382 ((uio->uio_iov[0].iov_len & alignment) == 0)) {
383 dbp = NULL;
384 put = xfs_dir2_put_dirent64_direct;
385 } else {
386 dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
387 put = xfs_dir2_put_dirent64_uio;
388 }
389
390 *eofp = 0;
391 /*
392 * Decide on what work routines to call based on the inode size.
393 */
394 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
395 rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
396 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
397 ;
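		/* added note: empty on purpose -- fall out to free dbp */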
398 } else if (v)
399 rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
400 else
401 rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
402 if (dbp != NULL)
403 kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
404 return rval;
405}
406
407/*
408 * Replace the inode number of a directory entry.
409 */
410static int /* error */
411xfs_dir2_replace(
412 xfs_trans_t *tp, /* transaction pointer */
413 xfs_inode_t *dp, /* incore directory inode */
414 char *name, /* name of entry to replace */
415 int namelen, /* name length of entry to replace */
416 xfs_ino_t inum, /* new inode number */
417 xfs_fsblock_t *first, /* bmap's firstblock */
418 xfs_bmap_free_t *flist, /* bmap's freeblock list */
419 xfs_extlen_t total) /* bmap's total block count */
420{
421 xfs_da_args_t args; /* operation arguments */
422 int rval; /* return value */
423 int v; /* type-checking value */
424
425 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
426
427 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
428 return rval;
429 }
430 /*
431 * Fill in the arg structure for this request.
432 */
433 args.name = name;
434 args.namelen = namelen;
435 args.hashval = xfs_da_hashname(name, namelen);
436 args.inumber = inum;
437 args.dp = dp;
438 args.firstblock = first;
439 args.flist = flist;
440 args.total = total;
441 args.whichfork = XFS_DATA_FORK;
442 args.trans = tp;
443 args.justcheck = args.addname = args.oknoent = 0;
444 /*
445 * Decide on what work routines to call based on the inode size.
446 */
447 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
448 rval = xfs_dir2_sf_replace(&args);
449 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
450 return rval;
451 } else if (v)
452 rval = xfs_dir2_block_replace(&args);
453 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
454 return rval;
455 } else if (v)
456 rval = xfs_dir2_leaf_replace(&args);
457 else
458 rval = xfs_dir2_node_replace(&args);
459 return rval;
460}
461
462/*
463 * See if this entry can be added to the directory without allocating space.
464 */
465static int /* error */
466xfs_dir2_canenter(
467 xfs_trans_t *tp, /* transaction pointer */
468 xfs_inode_t *dp, /* incore directory inode */
469 char *name, /* name of entry to add */
470 int namelen) /* name length of entry to add */
471{
472 xfs_da_args_t args; /* operation arguments */
473 int rval; /* return value */
474 int v; /* type-checking value */
475
476 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
477 /*
478 * Fill in the arg structure for this request.
479 */
480 args.name = name;
481 args.namelen = namelen;
482 args.hashval = xfs_da_hashname(name, namelen);
483 args.inumber = 0;
484 args.dp = dp;
485 args.firstblock = NULL;
486 args.flist = NULL;
487 args.total = 0;
488 args.whichfork = XFS_DATA_FORK;
489 args.trans = tp;
490 args.justcheck = args.addname = args.oknoent = 1;
491 /*
492 * Decide on what work routines to call based on the inode size.
493 */
494 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
495 rval = xfs_dir2_sf_addname(&args);
496 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
497 return rval;
498 } else if (v)
499 rval = xfs_dir2_block_addname(&args);
500 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
501 return rval;
502 } else if (v)
503 rval = xfs_dir2_leaf_addname(&args);
504 else
505 rval = xfs_dir2_node_addname(&args);
506 return rval;
507}
508
509/*
510 * Dummy routine for shortform inode validation.
511 * There's nothing useful to check ondisk for shortform, so always succeed.
512 */
513/* ARGSUSED */
514static int /* error */
515xfs_dir2_shortform_validate_ondisk(
516 xfs_mount_t *mp, /* filesystem mount point */
517 xfs_dinode_t *dip) /* ondisk inode */
518{
519 return 0;
520}
521
522/*
523 * Utility routines.
524 */
525
526/*
527 * Add a block to the directory.
528 * This routine is for data and free blocks, not leaf/node blocks,
529 * which are handled by xfs_da_grow_inode.
530 */
531int /* error */
532xfs_dir2_grow_inode(
533 xfs_da_args_t *args, /* operation arguments */
534 int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */
535 xfs_dir2_db_t *dbp) /* out: block number added */
536{
537 xfs_fileoff_t bno; /* directory offset of new block */
538 int count; /* count of filesystem blocks */
539 xfs_inode_t *dp; /* incore directory inode */
540 int error; /* error return value */
541 int got; /* blocks actually mapped */
542 int i; /* temp mapping index */
543 xfs_bmbt_irec_t map; /* single structure for bmap */
544 int mapi; /* mapping index */
545 xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */
546 xfs_mount_t *mp; /* filesystem mount point */
547 int nmap; /* number of bmap entries */
548 xfs_trans_t *tp; /* transaction pointer */
549
550 xfs_dir2_trace_args_s("grow_inode", args, space);
551 dp = args->dp;
552 tp = args->trans;
553 mp = dp->i_mount;
554 /*
555 * Set lowest possible block in the space requested.
556 */
557 bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
558 count = mp->m_dirblkfsbs;
559 /*
560 * Find the first hole for our block.
561 */
562 if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
563 return error;
564 }
565 nmap = 1;
566 ASSERT(args->firstblock != NULL);
567 /*
568 * Try mapping the new block contiguously (one extent).
569 */
570 if ((error = xfs_bmapi(tp, dp, bno, count,
571 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
572 args->firstblock, args->total, &map, &nmap,
573 args->flist))) {
574 return error;
575 }
576 ASSERT(nmap <= 1);
577 /*
578 * Got it in 1.
579 */
580 if (nmap == 1) {
581 mapp = &map;
582 mapi = 1;
583 }
584 /*
585 * Didn't work and this is a multiple-fsb directory block.
586 * Try again with the contiguous flag turned off.
587 */
588 else if (nmap == 0 && count > 1) {
589 xfs_fileoff_t b; /* current file offset */
590
591 /*
592 * Space for maximum number of mappings.
593 */
594 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
595 /*
596 * Iterate until we get to the end of our block.
597 */
598 for (b = bno, mapi = 0; b < bno + count; ) {
599 int c; /* current fsb count */
600
601 /*
602 * Can't map more than MAX_NMAP at once.
603 */
604 nmap = MIN(XFS_BMAP_MAX_NMAP, count);
605 c = (int)(bno + count - b);
606 if ((error = xfs_bmapi(tp, dp, b, c,
607 XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
608 args->firstblock, args->total,
609 &mapp[mapi], &nmap, args->flist))) {
610 kmem_free(mapp, sizeof(*mapp) * count);
611 return error;
612 }
613 if (nmap < 1)
614 break;
615 /*
616 * Add this bunch into our table, go to the next offset.
617 */
618 mapi += nmap;
619 b = mapp[mapi - 1].br_startoff +
620 mapp[mapi - 1].br_blockcount;
621 }
622 }
623 /*
624 * Didn't work.
625 */
626 else {
627 mapi = 0;
628 mapp = NULL;
629 }
630 /*
631 * See how many fsb's we got.
632 */
633 for (i = 0, got = 0; i < mapi; i++)
634 got += mapp[i].br_blockcount;
635 /*
636 * Didn't get enough fsb's, or the first/last blocks are wrong.
637 */
638 if (got != count || mapp[0].br_startoff != bno ||
639 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
640 bno + count) {
641 if (mapp != &map)
642 kmem_free(mapp, sizeof(*mapp) * count);
643 return XFS_ERROR(ENOSPC);
644 }
645 /*
646 * Done with the temporary mapping table.
647 */
648 if (mapp != &map)
649 kmem_free(mapp, sizeof(*mapp) * count);
650 *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno);
651 /*
652 * Update file's size if this is the data space and it grew.
653 */
654 if (space == XFS_DIR2_DATA_SPACE) {
655 xfs_fsize_t size; /* directory file (data) size */
656
657 size = XFS_FSB_TO_B(mp, bno + count);
658 if (size > dp->i_d.di_size) {
659 dp->i_d.di_size = size;
660 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
661 }
662 }
663 return 0;
664}
665
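/*
 * A minimal usage sketch (hypothetical helper, not a routine in this
 * file): callers hand xfs_dir2_grow_inode a space selector and get back
 * a directory block number.  The XFS_DIR2_*_SPACE values select disjoint
 * byte regions of the directory file (space * XFS_DIR2_SPACE_SIZE), so
 * data, leaf and freeindex blocks can never collide.
 */
#if 0	/* illustrative sketch only */
static int
example_grow_data_block(
	xfs_da_args_t	*args,	/* set up with trans/dp/firstblock/flist */
	xfs_dir2_db_t	*dbp)	/* out: new directory block number */
{
	return xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, dbp);
}
#endif
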
666/*
667 * See if the directory is a single-block form directory.
668 */
669int /* error */
670xfs_dir2_isblock(
671 xfs_trans_t *tp, /* transaction pointer */
672 xfs_inode_t *dp, /* incore directory inode */
673 int *vp) /* out: 1 is block, 0 is not block */
674{
675 xfs_fileoff_t last; /* last file offset */
676 xfs_mount_t *mp; /* filesystem mount point */
677 int rval; /* return value */
678
679 mp = dp->i_mount;
680 if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
681 return rval;
682 }
683 rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
684 ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
685 *vp = rval;
686 return 0;
687}
688
689/*
690 * See if the directory is a single-leaf form directory.
691 */
692int /* error */
693xfs_dir2_isleaf(
694 xfs_trans_t *tp, /* transaction pointer */
695 xfs_inode_t *dp, /* incore directory inode */
696 int *vp) /* out: 1 is leaf, 0 is not leaf */
697{
698 xfs_fileoff_t last; /* last file offset */
699 xfs_mount_t *mp; /* filesystem mount point */
700 int rval; /* return value */
701
702 mp = dp->i_mount;
703 if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
704 return rval;
705 }
706 *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
707 return 0;
708}
709
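/*
 * A sketch of how the two predicates above combine with the inode
 * format to classify a directory; this mirrors the dispatch pattern in
 * the getdents/replace/canenter routines earlier in this file.  The
 * helper is hypothetical, for illustration only.
 */
#if 0	/* illustrative sketch only */
static int
example_dir2_format(xfs_trans_t *tp, xfs_inode_t *dp, const char **fmt)
{
	int	error;	/* error return value */
	int	v;	/* predicate result */

	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
		*fmt = "shortform";
	else if ((error = xfs_dir2_isblock(tp, dp, &v)))
		return error;
	else if (v)
		*fmt = "block";
	else if ((error = xfs_dir2_isleaf(tp, dp, &v)))
		return error;
	else if (v)
		*fmt = "leaf";
	else
		*fmt = "node";
	return 0;
}
#endif
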
710/*
711 * Getdents put routine for 64-bit ABI, direct form.
712 */
713static int /* error */
714xfs_dir2_put_dirent64_direct(
715 xfs_dir2_put_args_t *pa) /* argument bundle */
716{
717 xfs_dirent_t *idbp; /* dirent pointer */
718 iovec_t *iovp; /* io vector */
719 int namelen; /* entry name length */
720 int reclen; /* entry total length */
721 uio_t *uio; /* I/O control */
722
723 namelen = pa->namelen;
724 reclen = DIRENTSIZE(namelen);
725 uio = pa->uio;
726 /*
727 * Won't fit in the remaining space.
728 */
729 if (reclen > uio->uio_resid) {
730 pa->done = 0;
731 return 0;
732 }
733 iovp = uio->uio_iov;
734 idbp = (xfs_dirent_t *)iovp->iov_base;
735 iovp->iov_base = (char *)idbp + reclen;
736 iovp->iov_len -= reclen;
737 uio->uio_resid -= reclen;
738 idbp->d_reclen = reclen;
739 idbp->d_ino = pa->ino;
740 idbp->d_off = pa->cook;
741 idbp->d_name[namelen] = '\0';
742 pa->done = 1;
743 memcpy(idbp->d_name, pa->name, namelen);
744 return 0;
745}
746
747/*
748 * Getdents put routine for 64-bit ABI, uio form.
749 */
750static int /* error */
751xfs_dir2_put_dirent64_uio(
752 xfs_dir2_put_args_t *pa) /* argument bundle */
753{
754 xfs_dirent_t *idbp; /* dirent pointer */
755 int namelen; /* entry name length */
756 int reclen; /* entry total length */
757 int rval; /* return value */
758 uio_t *uio; /* I/O control */
759
760 namelen = pa->namelen;
761 reclen = DIRENTSIZE(namelen);
762 uio = pa->uio;
763 /*
764 * Won't fit in the remaining space.
765 */
766 if (reclen > uio->uio_resid) {
767 pa->done = 0;
768 return 0;
769 }
770 idbp = pa->dbp;
771 idbp->d_reclen = reclen;
772 idbp->d_ino = pa->ino;
773 idbp->d_off = pa->cook;
774 idbp->d_name[namelen] = '\0';
775 memcpy(idbp->d_name, pa->name, namelen);
776 rval = uio_read((caddr_t)idbp, reclen, uio);
777 pa->done = (rval == 0);
778 return rval;
779}
780
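/*
 * Both put routines above follow the same callback contract, sketched
 * here as a hypothetical skeleton: report done = 0 (without consuming
 * uio space) when the formatted entry won't fit, otherwise emit the
 * entry and report done = 1 so the getdents loop advances.
 */
#if 0	/* illustrative sketch only */
static int
example_put_dirent(xfs_dir2_put_args_t *pa)
{
	int	reclen = DIRENTSIZE(pa->namelen);

	if (reclen > pa->uio->uio_resid) {
		pa->done = 0;		/* caller records a resume cookie */
		return 0;
	}
	/* ... copy pa->ino, pa->cook, pa->name into a dirent ... */
	pa->done = 1;
	return 0;
}
#endif
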
781/*
782 * Remove the given block from the directory.
783 * This routine is used for data and free blocks; leaf/node blocks are
784 * handled by xfs_da_shrink_inode.
785 */
786int
787xfs_dir2_shrink_inode(
788 xfs_da_args_t *args, /* operation arguments */
789 xfs_dir2_db_t db, /* directory block number */
790 xfs_dabuf_t *bp) /* block's buffer */
791{
792 xfs_fileoff_t bno; /* directory file offset */
793 xfs_dablk_t da; /* directory file offset */
794 int done; /* bunmap is finished */
795 xfs_inode_t *dp; /* incore directory inode */
796 int error; /* error return value */
797 xfs_mount_t *mp; /* filesystem mount point */
798 xfs_trans_t *tp; /* transaction pointer */
799
800 xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
801 dp = args->dp;
802 mp = dp->i_mount;
803 tp = args->trans;
804 da = XFS_DIR2_DB_TO_DA(mp, db);
805 /*
806 * Unmap the fsblock(s).
807 */
808 if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
809 XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
810 &done))) {
811 /*
812 * ENOSPC actually can happen if we're in a removename with
813 * no space reservation, and the resulting block removal
814 * would cause a bmap btree split or conversion from extents
815 * to btree. This can only happen for unfragmented
816 * directory blocks, since you need to be punching out
817 * the middle of an extent.
818 * In this case we need to leave the block in the file,
819 * and not binval it.
820 * So the block has to be in a consistent empty state
821 * and appropriately logged.
822 * We don't free up the buffer; the caller can tell the removal
823 * hasn't happened since it got an error back.
824 */
825 return error;
826 }
827 ASSERT(done);
828 /*
829 * Invalidate the buffer from the transaction.
830 */
831 xfs_da_binval(tp, bp);
832 /*
833 * If it's not a data block, we're done.
834 */
835 if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
836 return 0;
837 /*
838 * If the block isn't the last one in the directory, we're done.
839 */
840 if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
841 return 0;
842 bno = da;
843 if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
844 /*
845 * This can't really happen unless there's kernel corruption.
846 */
847 return error;
848 }
849 if (db == mp->m_dirdatablk)
850 ASSERT(bno == 0);
851 else
852 ASSERT(bno > 0);
853 /*
854 * Set the size to the new last block.
855 */
856 dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
857 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
858 return 0;
859}
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
new file mode 100644
index 000000000000..8f4fc7f23bcd
--- /dev/null
+++ b/fs/xfs/xfs_dir2.h
@@ -0,0 +1,109 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_H__
33#define __XFS_DIR2_H__
34
35struct uio;
36struct xfs_dabuf;
37struct xfs_da_args;
38struct xfs_dir2_put_args;
39struct xfs_inode;
40struct xfs_trans;
41
42/*
43 * Directory version 2.
44 * There are 4 possible formats:
45 * shortform
46 * single block - data with embedded leaf at the end
47 * multiple data blocks, single leaf+freeindex block
48 * data blocks, node&leaf blocks (btree), freeindex blocks
49 *
50 * The shortform format is in xfs_dir2_sf.h.
51 * The single block format is in xfs_dir2_block.h.
52 * The data block format is in xfs_dir2_data.h.
53 * The leaf and freeindex block formats are in xfs_dir2_leaf.h.
54 * Node blocks are the same as the other version, in xfs_da_btree.h.
55 */
56
57/*
58 * Byte offset in data block and shortform entry.
59 */
60typedef __uint16_t xfs_dir2_data_off_t;
61#define NULLDATAOFF 0xffffU
62typedef uint xfs_dir2_data_aoff_t; /* argument form */
63
64/*
65 * Directory block number (logical dirblk in file)
66 */
67typedef __uint32_t xfs_dir2_db_t;
68
69/*
70 * Byte offset in a directory.
71 */
72typedef xfs_off_t xfs_dir2_off_t;
73
74/*
75 * For getdents, argument struct for put routines.
76 */
77typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
78typedef struct xfs_dir2_put_args {
79 xfs_off_t cook; /* cookie of (next) entry */
80 xfs_intino_t ino; /* inode number */
81 struct xfs_dirent *dbp; /* buffer pointer */
82 char *name; /* directory entry name */
83 int namelen; /* length of name */
84 int done; /* output: set if value was stored */
85 xfs_dir2_put_t put; /* put function ptr (i/o) */
86 struct uio *uio; /* uio control structure */
87} xfs_dir2_put_args_t;
88
89#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2)
90extern xfs_dirops_t xfsv2_dirops;
91
92/*
93 * Other interfaces used by the rest of the dir v2 code.
94 */
95extern int
96 xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
97 xfs_dir2_db_t *dbp);
98
99extern int
100 xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
101
102extern int
103 xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
104
105extern int
106 xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
107 struct xfs_dabuf *bp);
108
109#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..bc4c40fcd479
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.c
@@ -0,0 +1,1248 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_block.c
35 * XFS V2 directory implementation, single-block form.
36 * See xfs_dir2_block.h for the format.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_da_btree.h"
59#include "xfs_dir_leaf.h"
60#include "xfs_dir2_data.h"
61#include "xfs_dir2_leaf.h"
62#include "xfs_dir2_block.h"
63#include "xfs_dir2_trace.h"
64#include "xfs_error.h"
65
66/*
67 * Local function prototypes.
68 */
69static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
70 int last);
71static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
72static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
73 int *entno);
74static int xfs_dir2_block_sort(const void *a, const void *b);
75
76/*
77 * Add an entry to a block directory.
78 */
79int /* error */
80xfs_dir2_block_addname(
81 xfs_da_args_t *args) /* directory op arguments */
82{
83 xfs_dir2_data_free_t *bf; /* bestfree table in block */
84 xfs_dir2_block_t *block; /* directory block structure */
85 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
86 xfs_dabuf_t *bp; /* buffer for block */
87 xfs_dir2_block_tail_t *btp; /* block tail */
88 int compact; /* need to compact leaf ents */
89 xfs_dir2_data_entry_t *dep; /* block data entry */
90 xfs_inode_t *dp; /* directory inode */
91 xfs_dir2_data_unused_t *dup; /* block unused entry */
92 int error; /* error return value */
93 xfs_dir2_data_unused_t *enddup=NULL; /* unused at end of data */
94 xfs_dahash_t hash; /* hash value of found entry */
95 int high; /* high index for binary srch */
96 int highstale; /* high stale index */
97 int lfloghigh=0; /* last final leaf to log */
98 int lfloglow=0; /* first final leaf to log */
99 int len; /* length of the new entry */
100 int low; /* low index for binary srch */
101 int lowstale; /* low stale index */
102 int mid=0; /* midpoint for binary srch */
103 xfs_mount_t *mp; /* filesystem mount point */
104 int needlog; /* need to log header */
105 int needscan; /* need to rescan freespace */
106 xfs_dir2_data_off_t *tagp; /* pointer to tag value */
107 xfs_trans_t *tp; /* transaction structure */
108
109 xfs_dir2_trace_args("block_addname", args);
110 dp = args->dp;
111 tp = args->trans;
112 mp = dp->i_mount;
113 /*
114 * Read the (one and only) directory block into dabuf bp.
115 */
116 if ((error =
117 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
118 return error;
119 }
120 ASSERT(bp != NULL);
121 block = bp->data;
122 /*
123 * Check the magic number, corrupted if wrong.
124 */
125 if (unlikely(INT_GET(block->hdr.magic, ARCH_CONVERT)
126 != XFS_DIR2_BLOCK_MAGIC)) {
127 XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
128 XFS_ERRLEVEL_LOW, mp, block);
129 xfs_da_brelse(tp, bp);
130 return XFS_ERROR(EFSCORRUPTED);
131 }
132 len = XFS_DIR2_DATA_ENTSIZE(args->namelen);
133 /*
134 * Set up pointers to parts of the block.
135 */
136 bf = block->hdr.bestfree;
137 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
138 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
139 /*
140 * No stale entries? Need space for entry and new leaf.
141 */
142 if (!btp->stale) {
143 /*
144 * Tag just before the first leaf entry.
145 */
146 tagp = (xfs_dir2_data_off_t *)blp - 1;
147 /*
148 * Data object just before the first leaf entry.
149 */
150 enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
151 /*
152 * If it's not free then can't do this add without cleaning up:
153 * the space before the first leaf entry needs to be free so it
154 * can be expanded to hold the pointer to the new entry.
155 */
156 if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
157 dup = enddup = NULL;
158 /*
159 * Check out the biggest freespace and see if it's the same one.
160 */
161 else {
162 dup = (xfs_dir2_data_unused_t *)
163 ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
164 if (dup == enddup) {
165 /*
166 * It is the biggest freespace; is it also too small
167 * to hold the new leaf entry?
168 */
169 if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
170 /*
171 * Yes, we use the second-largest
172 * entry instead if it works.
173 */
174 if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
175 dup = (xfs_dir2_data_unused_t *)
176 ((char *)block +
177 INT_GET(bf[1].offset, ARCH_CONVERT));
178 else
179 dup = NULL;
180 }
181 } else {
182 /*
183 * Not the same free entry,
184 * just check its length.
185 */
186 if (INT_GET(dup->length, ARCH_CONVERT) < len) {
187 dup = NULL;
188 }
189 }
190 }
191 compact = 0;
192 }
193 /*
194 * If there are stale entries we'll use one for the leaf.
195 * Is the biggest entry enough to avoid compaction?
196 */
197 else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
198 dup = (xfs_dir2_data_unused_t *)
199 ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
200 compact = 0;
201 }
202 /*
203 * Will need to compact to make this work.
204 */
205 else {
206 /*
207 * Tag just before the first leaf entry.
208 */
209 tagp = (xfs_dir2_data_off_t *)blp - 1;
210 /*
211 * Data object just before the first leaf entry.
212 */
213 dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
214 /*
215 * If it's not free then the data will go where the
216 * leaf data starts now, if it works at all.
217 */
218 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
219 if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
220 (uint)sizeof(*blp) < len)
221 dup = NULL;
222 } else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
223 dup = NULL;
224 else
225 dup = (xfs_dir2_data_unused_t *)blp;
226 compact = 1;
227 }
228 /*
229 * If this isn't a real add, we're done with the buffer.
230 */
231 if (args->justcheck)
232 xfs_da_brelse(tp, bp);
233 /*
234 * If we don't have space for the new entry & leaf ...
235 */
236 if (!dup) {
237 /*
238 * Not trying to actually do anything, or don't have
239 * a space reservation: return no-space.
240 */
241 if (args->justcheck || args->total == 0)
242 return XFS_ERROR(ENOSPC);
243 /*
244 * Convert to the next larger format.
245 * Then add the new entry in that format.
246 */
247 error = xfs_dir2_block_to_leaf(args, bp);
248 xfs_da_buf_done(bp);
249 if (error)
250 return error;
251 return xfs_dir2_leaf_addname(args);
252 }
253 /*
254 * Just checking, and it would work, so say so.
255 */
256 if (args->justcheck)
257 return 0;
258 needlog = needscan = 0;
259 /*
260 * If need to compact the leaf entries, do it now.
261 * Leave the highest-numbered stale entry stale.
262 * XXX should be the one closest to mid but mid is not yet computed.
263 */
264 if (compact) {
265 int fromidx; /* source leaf index */
266 int toidx; /* target leaf index */
267
268 for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
269 highstale = lfloghigh = -1;
270 fromidx >= 0;
271 fromidx--) {
272 if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
273 if (highstale == -1)
274 highstale = toidx;
275 else {
276 if (lfloghigh == -1)
277 lfloghigh = toidx;
278 continue;
279 }
280 }
281 if (fromidx < toidx)
282 blp[toidx] = blp[fromidx];
283 toidx--;
284 }
285 lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
286 lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
287 INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
288 xfs_dir2_data_make_free(tp, bp,
289 (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
290 (xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
291 &needlog, &needscan);
292 blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;
293 INT_SET(btp->stale, ARCH_CONVERT, 1);
294 /*
295 * If we now need to rebuild the bestfree map, do so.
296 * This needs to happen before the next call to use_free.
297 */
298 if (needscan) {
299 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
300 &needlog, NULL);
301 needscan = 0;
302 }
303 }
304 /*
305 * Set leaf logging boundaries to impossible state.
306 * For the no-stale case they're set explicitly.
307 */
308 else if (INT_GET(btp->stale, ARCH_CONVERT)) {
309 lfloglow = INT_GET(btp->count, ARCH_CONVERT);
310 lfloghigh = -1;
311 }
312 /*
313 * Find the last slot with a hash value below ours, -1 if none.
314 */
315 for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
316 mid = (low + high) >> 1;
317 if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
318 break;
319 if (hash < args->hashval)
320 low = mid + 1;
321 else
322 high = mid - 1;
323 }
324 while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
325 mid--;
326 }
327 /*
328 * No stale entries, will use enddup space to hold new leaf.
329 */
330 if (!btp->stale) {
331 /*
332 * Mark the space needed for the new leaf entry, now in use.
333 */
334 xfs_dir2_data_use_free(tp, bp, enddup,
335 (xfs_dir2_data_aoff_t)
336 ((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
337 sizeof(*blp)),
338 (xfs_dir2_data_aoff_t)sizeof(*blp),
339 &needlog, &needscan);
340 /*
341 * Update the tail (entry count).
342 */
343 INT_MOD(btp->count, ARCH_CONVERT, +1);
344 /*
345 * If we now need to rebuild the bestfree map, do so.
346 * This needs to happen before the next call to use_free.
347 */
348 if (needscan) {
349 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
350 &needlog, NULL);
351 needscan = 0;
352 }
353 /*
354 * Adjust pointer to the first leaf entry, we're about to move
355 * the table up one to open up space for the new leaf entry.
356 * Then adjust our index to match.
357 */
358 blp--;
359 mid++;
360 if (mid)
361 memmove(blp, &blp[1], mid * sizeof(*blp));
362 lfloglow = 0;
363 lfloghigh = mid;
364 }
365 /*
366 * Use a stale leaf for our new entry.
367 */
368 else {
369 for (lowstale = mid;
370 lowstale >= 0 &&
371 INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
372 lowstale--)
373 continue;
374 for (highstale = mid + 1;
375 highstale < INT_GET(btp->count, ARCH_CONVERT) &&
376 INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
377 (lowstale < 0 || mid - lowstale > highstale - mid);
378 highstale++)
379 continue;
380 /*
381 * Move entries toward the low-numbered stale entry.
382 */
383 if (lowstale >= 0 &&
384 (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
385 mid - lowstale <= highstale - mid)) {
386 if (mid - lowstale)
387 memmove(&blp[lowstale], &blp[lowstale + 1],
388 (mid - lowstale) * sizeof(*blp));
389 lfloglow = MIN(lowstale, lfloglow);
390 lfloghigh = MAX(mid, lfloghigh);
391 }
392 /*
393 * Move entries toward the high-numbered stale entry.
394 */
395 else {
396 ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
397 mid++;
398 if (highstale - mid)
399 memmove(&blp[mid + 1], &blp[mid],
400 (highstale - mid) * sizeof(*blp));
401 lfloglow = MIN(mid, lfloglow);
402 lfloghigh = MAX(highstale, lfloghigh);
403 }
404 INT_MOD(btp->stale, ARCH_CONVERT, -1);
405 }
406 /*
407 * Point to the new data entry.
408 */
409 dep = (xfs_dir2_data_entry_t *)dup;
410 /*
411 * Fill in the leaf entry.
412 */
413 INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
414 INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
415 xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
416 /*
417 * Mark space for the data entry used.
418 */
419 xfs_dir2_data_use_free(tp, bp, dup,
420 (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
421 (xfs_dir2_data_aoff_t)len, &needlog, &needscan);
422 /*
423 * Create the new data entry.
424 */
425 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
426 dep->namelen = args->namelen;
427 memcpy(dep->name, args->name, args->namelen);
428 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
429 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
430 /*
431 * Clean up the bestfree array and log the header, tail, and entry.
432 */
433 if (needscan)
434 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
435 NULL);
436 if (needlog)
437 xfs_dir2_data_log_header(tp, bp);
438 xfs_dir2_block_log_tail(tp, bp);
439 xfs_dir2_data_log_entry(tp, bp, dep);
440 xfs_dir2_data_check(dp, bp);
441 xfs_da_buf_done(bp);
442 return 0;
443}
444
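/*
 * The compaction step above, reduced to its essentials (hypothetical,
 * standalone helper): pack live leaf entries toward the high end of the
 * array, keep exactly one stale slot for the entry being added, and
 * squeeze the rest out, freeing bytes at the low end where the array
 * borders the data region.
 */
#if 0	/* illustrative sketch only */
static int				/* index of first live entry */
example_compact(xfs_dir2_leaf_entry_t *blp, int count)
{
	int	from;		/* source index */
	int	to;		/* destination index */
	int	kept = 0;	/* stale entries preserved so far */

	for (from = to = count - 1; from >= 0; from--) {
		if (INT_GET(blp[from].address, ARCH_CONVERT) ==
		    XFS_DIR2_NULL_DATAPTR && kept++)
			continue;	/* drop all but one stale entry */
		if (from < to)
			blp[to] = blp[from];
		to--;
	}
	return to + 1;
}
#endif
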
445/*
446 * Readdir for block directories.
447 */
448int /* error */
449xfs_dir2_block_getdents(
450 xfs_trans_t *tp, /* transaction (NULL) */
451 xfs_inode_t *dp, /* incore inode */
452 uio_t *uio, /* caller's buffer control */
453 int *eofp, /* eof reached? (out) */
454 xfs_dirent_t *dbp, /* caller's buffer */
455 xfs_dir2_put_t put) /* abi's formatting function */
456{
457 xfs_dir2_block_t *block; /* directory block structure */
458 xfs_dabuf_t *bp; /* buffer for block */
459 xfs_dir2_block_tail_t *btp; /* block tail */
460 xfs_dir2_data_entry_t *dep; /* block data entry */
461 xfs_dir2_data_unused_t *dup; /* block unused entry */
462 char *endptr; /* end of the data entries */
463 int error; /* error return value */
464 xfs_mount_t *mp; /* filesystem mount point */
465 xfs_dir2_put_args_t p; /* arg package for put rtn */
466 char *ptr; /* current data entry */
467 int wantoff; /* starting block offset */
468
469 mp = dp->i_mount;
470 /*
471 * If the block number in the offset is out of range, we're done.
472 */
473 if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
474 *eofp = 1;
475 return 0;
476 }
477 /*
478 * Can't read the block, give up, else get dabuf in bp.
479 */
480 if ((error =
481 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
482 return error;
483 }
484 ASSERT(bp != NULL);
485 /*
486 * Extract the byte offset we start at from the seek pointer.
487 * We'll skip entries before this.
488 */
489 wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
490 block = bp->data;
491 xfs_dir2_data_check(dp, bp);
492 /*
493 * Set up values for the loop.
494 */
495 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
496 ptr = (char *)block->u;
497 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
498 p.dbp = dbp;
499 p.put = put;
500 p.uio = uio;
501 /*
502 * Loop over the data portion of the block.
503 * Each object is a real entry (dep) or an unused one (dup).
504 */
505 while (ptr < endptr) {
506 dup = (xfs_dir2_data_unused_t *)ptr;
507 /*
508 * Unused, skip it.
509 */
510 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
511 ptr += INT_GET(dup->length, ARCH_CONVERT);
512 continue;
513 }
514
515 dep = (xfs_dir2_data_entry_t *)ptr;
516
517 /*
518 * Bump pointer for the next iteration.
519 */
520 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
521 /*
522 * The entry is before the desired starting point, skip it.
523 */
524 if ((char *)dep - (char *)block < wantoff)
525 continue;
526 /*
527 * Set up argument structure for put routine.
528 */
529 p.namelen = dep->namelen;
530
531 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
532 ptr - (char *)block);
533 p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
534#if XFS_BIG_INUMS
535 p.ino += mp->m_inoadd;
536#endif
537 p.name = (char *)dep->name;
538
539 /*
540 * Put the entry in the caller's buffer.
541 */
542 error = p.put(&p);
543
544 /*
545 * If it didn't fit, set the final offset to here & return.
546 */
547 if (!p.done) {
548 uio->uio_offset =
549 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
550 (char *)dep - (char *)block);
551 xfs_da_brelse(tp, bp);
552 return error;
553 }
554 }
555
556 /*
557 * Reached the end of the block.
558 * Set the offset to the (nonexistent) next block and return.
559 */
560 *eofp = 1;
561
562 uio->uio_offset =
563 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
564
565 xfs_da_brelse(tp, bp);
566
567 return 0;
568}
569
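/*
 * A note on the cookies used above: uio_offset values are "dataptr"s
 * that pack a directory block number and a byte offset within it, so a
 * later getdents call resumes exactly where the previous one stopped.
 * A round-trip sketch using the conversion macros (illustrative only):
 */
#if 0	/* illustrative sketch only */
static void
example_cookie_roundtrip(xfs_mount_t *mp)
{
	xfs_dir2_dataptr_t	cook;

	cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, 64);
	ASSERT(XFS_DIR2_DATAPTR_TO_DB(mp, cook) == mp->m_dirdatablk);
	ASSERT(XFS_DIR2_DATAPTR_TO_OFF(mp, cook) == 64);
}
#endif
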
570/*
571 * Log leaf entries from the block.
572 */
573static void
574xfs_dir2_block_log_leaf(
575 xfs_trans_t *tp, /* transaction structure */
576 xfs_dabuf_t *bp, /* block buffer */
577 int first, /* index of first logged leaf */
578 int last) /* index of last logged leaf */
579{
580 xfs_dir2_block_t *block; /* directory block structure */
581 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
582 xfs_dir2_block_tail_t *btp; /* block tail */
583 xfs_mount_t *mp; /* filesystem mount point */
584
585 mp = tp->t_mountp;
586 block = bp->data;
587 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
588 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
589 xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
590 (uint)((char *)&blp[last + 1] - (char *)block - 1));
591}
592
593/*
594 * Log the block tail.
595 */
596static void
597xfs_dir2_block_log_tail(
598 xfs_trans_t *tp, /* transaction structure */
599 xfs_dabuf_t *bp) /* block buffer */
600{
601 xfs_dir2_block_t *block; /* directory block structure */
602 xfs_dir2_block_tail_t *btp; /* block tail */
603 xfs_mount_t *mp; /* filesystem mount point */
604
605 mp = tp->t_mountp;
606 block = bp->data;
607 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
608 xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
609 (uint)((char *)(btp + 1) - (char *)block - 1));
610}
611
612/*
613 * Look up an entry in the block. This is the external routine,
614 * xfs_dir2_block_lookup_int does the real work.
615 */
616int /* error */
617xfs_dir2_block_lookup(
618 xfs_da_args_t *args) /* dir lookup arguments */
619{
620 xfs_dir2_block_t *block; /* block structure */
621 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
622 xfs_dabuf_t *bp; /* block buffer */
623 xfs_dir2_block_tail_t *btp; /* block tail */
624 xfs_dir2_data_entry_t *dep; /* block data entry */
625 xfs_inode_t *dp; /* incore inode */
626 int ent; /* entry index */
627 int error; /* error return value */
628 xfs_mount_t *mp; /* filesystem mount point */
629
630 xfs_dir2_trace_args("block_lookup", args);
631 /*
632 * Get the buffer, look up the entry.
633 * If not found (ENOENT) then return; we hold no buffer.
634 */
635 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
636 return error;
637 dp = args->dp;
638 mp = dp->i_mount;
639 block = bp->data;
640 xfs_dir2_data_check(dp, bp);
641 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
642 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
643 /*
644 * Get the offset from the leaf entry, to point to the data.
645 */
646 dep = (xfs_dir2_data_entry_t *)
647 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
648 /*
649 * Fill in inode number, release the block.
650 */
651 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
652 xfs_da_brelse(args->trans, bp);
653 return XFS_ERROR(EEXIST);
654}
655
656/*
657 * Internal block lookup routine.
658 */
659static int /* error */
660xfs_dir2_block_lookup_int(
661 xfs_da_args_t *args, /* dir lookup arguments */
662 xfs_dabuf_t **bpp, /* returned block buffer */
663 int *entno) /* returned entry number */
664{
665 xfs_dir2_dataptr_t addr; /* data entry address */
666 xfs_dir2_block_t *block; /* block structure */
667 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
668 xfs_dabuf_t *bp; /* block buffer */
669 xfs_dir2_block_tail_t *btp; /* block tail */
670 xfs_dir2_data_entry_t *dep; /* block data entry */
671 xfs_inode_t *dp; /* incore inode */
672 int error; /* error return value */
673 xfs_dahash_t hash; /* found hash value */
674 int high; /* binary search high index */
675 int low; /* binary search low index */
676 int mid; /* binary search current idx */
677 xfs_mount_t *mp; /* filesystem mount point */
678 xfs_trans_t *tp; /* transaction pointer */
679
680 dp = args->dp;
681 tp = args->trans;
682 mp = dp->i_mount;
683 /*
684 * Read the buffer, return error if we can't get it.
685 */
686 if ((error =
687 xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
688 return error;
689 }
690 ASSERT(bp != NULL);
691 block = bp->data;
692 xfs_dir2_data_check(dp, bp);
693 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
694 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
695 /*
696 * Loop doing a binary search for our hash value.
697 * Find our entry, ENOENT if it's not there.
698 */
699 for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
700 ASSERT(low <= high);
701 mid = (low + high) >> 1;
702 if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
703 break;
704 if (hash < args->hashval)
705 low = mid + 1;
706 else
707 high = mid - 1;
708 if (low > high) {
709 ASSERT(args->oknoent);
710 xfs_da_brelse(tp, bp);
711 return XFS_ERROR(ENOENT);
712 }
713 }
714 /*
715 * Back up to the first one with the right hash value.
716 */
717 while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
718 mid--;
719 }
720 /*
721 * Now loop forward through all the entries with the
722 * right hash value looking for our name.
723 */
724 do {
725 if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
726 continue;
727 /*
728 * Get pointer to the entry from the leaf.
729 */
730 dep = (xfs_dir2_data_entry_t *)
731 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
732 /*
733 * Compare, if it's right give back buffer & entry number.
734 */
735 if (dep->namelen == args->namelen &&
736 dep->name[0] == args->name[0] &&
737 memcmp(dep->name, args->name, args->namelen) == 0) {
738 *bpp = bp;
739 *entno = mid;
740 return 0;
741 }
742 } while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
743 /*
744 * No match, release the buffer and return ENOENT.
745 */
746 ASSERT(args->oknoent);
747 xfs_da_brelse(tp, bp);
748 return XFS_ERROR(ENOENT);
749}
750
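/*
 * The lookup pattern above in miniature (hypothetical, standalone
 * helper): binary-search to any entry with the target hash, back up to
 * the first duplicate, then the caller scans forward comparing names,
 * since many names can share one hash value.
 */
#if 0	/* illustrative sketch only */
static int				/* index of first match, -1 if none */
example_hash_search(const xfs_dahash_t *tbl, int count, xfs_dahash_t want)
{
	int	low = 0, high = count - 1, mid;

	while (low <= high) {
		mid = (low + high) >> 1;
		if (tbl[mid] == want) {
			while (mid > 0 && tbl[mid - 1] == want)
				mid--;	/* back up to first duplicate */
			return mid;
		}
		if (tbl[mid] < want)
			low = mid + 1;
		else
			high = mid - 1;
	}
	return -1;
}
#endif
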
751/*
752 * Remove an entry from a block format directory.
753 * If that makes the block small enough to fit in shortform, transform it.
754 */
755int /* error */
756xfs_dir2_block_removename(
757 xfs_da_args_t *args) /* directory operation args */
758{
759 xfs_dir2_block_t *block; /* block structure */
760 xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */
761 xfs_dabuf_t *bp; /* block buffer */
762 xfs_dir2_block_tail_t *btp; /* block tail */
763 xfs_dir2_data_entry_t *dep; /* block data entry */
764 xfs_inode_t *dp; /* incore inode */
765 int ent; /* block leaf entry index */
766 int error; /* error return value */
767 xfs_mount_t *mp; /* filesystem mount point */
768 int needlog; /* need to log block header */
769 int needscan; /* need to fixup bestfree */
770 xfs_dir2_sf_hdr_t sfh; /* shortform header */
771 int size; /* shortform size */
772 xfs_trans_t *tp; /* transaction pointer */
773
774 xfs_dir2_trace_args("block_removename", args);
775 /*
776 * Look up the entry in the block. Gets the buffer and entry index.
777 * It will always be there; the vnodeops level does a lookup first.
778 */
779 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
780 return error;
781 }
782 dp = args->dp;
783 tp = args->trans;
784 mp = dp->i_mount;
785 block = bp->data;
786 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
787 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
788 /*
789 * Point to the data entry using the leaf entry.
790 */
791 dep = (xfs_dir2_data_entry_t *)
792 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
793 /*
794 * Mark the data entry's space free.
795 */
796 needlog = needscan = 0;
797 xfs_dir2_data_make_free(tp, bp,
798 (xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
799 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
800 /*
801 * Fix up the block tail.
802 */
803 INT_MOD(btp->stale, ARCH_CONVERT, +1);
804 xfs_dir2_block_log_tail(tp, bp);
805 /*
806 * Remove the leaf entry by marking it stale.
807 */
808 INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
809 xfs_dir2_block_log_leaf(tp, bp, ent, ent);
810 /*
811 * Fix up bestfree, log the header if necessary.
812 */
813 if (needscan)
814 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
815 NULL);
816 if (needlog)
817 xfs_dir2_data_log_header(tp, bp);
818 xfs_dir2_data_check(dp, bp);
819 /*
820 * See if the size as a shortform is good enough.
821 */
822 if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
823 XFS_IFORK_DSIZE(dp)) {
824 xfs_da_buf_done(bp);
825 return 0;
826 }
827 /*
828 * If it works, do the conversion.
829 */
830 return xfs_dir2_block_to_sf(args, bp, size, &sfh);
831}
832
833/*
834 * Replace an entry in a V2 block directory.
835 * Change the inode number to the new value.
836 */
837int /* error */
838xfs_dir2_block_replace(
839 xfs_da_args_t *args) /* directory operation args */
840{
841 xfs_dir2_block_t *block; /* block structure */
842 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
843 xfs_dabuf_t *bp; /* block buffer */
844 xfs_dir2_block_tail_t *btp; /* block tail */
845 xfs_dir2_data_entry_t *dep; /* block data entry */
846 xfs_inode_t *dp; /* incore inode */
847 int ent; /* leaf entry index */
848 int error; /* error return value */
849 xfs_mount_t *mp; /* filesystem mount point */
850
851 xfs_dir2_trace_args("block_replace", args);
852 /*
853 * Lookup the entry in the directory. Get buffer and entry index.
854 * This will always succeed since the caller has already done a lookup.
855 */
856 if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
857 return error;
858 }
859 dp = args->dp;
860 mp = dp->i_mount;
861 block = bp->data;
862 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
863 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
864 /*
865 * Point to the data entry we need to change.
866 */
867 dep = (xfs_dir2_data_entry_t *)
868 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
869 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
870 /*
871 * Change the inode number to the new value.
872 */
873 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
874 xfs_dir2_data_log_entry(args->trans, bp, dep);
875 xfs_dir2_data_check(dp, bp);
876 xfs_da_buf_done(bp);
877 return 0;
878}
879
880/*
881 * Qsort comparison routine for the block leaf entries.
882 */
883static int /* sort order */
884xfs_dir2_block_sort(
885 const void *a, /* first leaf entry */
886 const void *b) /* second leaf entry */
887{
888 const xfs_dir2_leaf_entry_t *la; /* first leaf entry */
889 const xfs_dir2_leaf_entry_t *lb; /* second leaf entry */
890
891 la = a;
892 lb = b;
893 return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
894 (INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
895}
896
897/*
898 * Convert a V2 leaf directory to a V2 block directory if possible.
899 */
900int /* error */
901xfs_dir2_leaf_to_block(
902 xfs_da_args_t *args, /* operation arguments */
903 xfs_dabuf_t *lbp, /* leaf buffer */
904 xfs_dabuf_t *dbp) /* data buffer */
905{
906 xfs_dir2_data_off_t *bestsp; /* leaf bests table */
907 xfs_dir2_block_t *block; /* block structure */
908 xfs_dir2_block_tail_t *btp; /* block tail */
909 xfs_inode_t *dp; /* incore directory inode */
910 xfs_dir2_data_unused_t *dup; /* unused data entry */
911 int error; /* error return value */
912 int from; /* leaf from index */
913 xfs_dir2_leaf_t *leaf; /* leaf structure */
914 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
915 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
916 xfs_mount_t *mp; /* file system mount point */
917 int needlog; /* need to log data header */
918 int needscan; /* need to scan for bestfree */
919 xfs_dir2_sf_hdr_t sfh; /* shortform header */
920 int size; /* bytes used */
921 xfs_dir2_data_off_t *tagp; /* end of entry (tag) */
922 int to; /* block/leaf to index */
923 xfs_trans_t *tp; /* transaction pointer */
924
925 xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
926 dp = args->dp;
927 tp = args->trans;
928 mp = dp->i_mount;
929 leaf = lbp->data;
930 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
931 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
932 /*
933 * If there are data blocks other than the first one, take this
934 * opportunity to remove trailing empty data blocks that may have
935 * been left behind during no-space-reservation operations.
936 * These will show up in the leaf bests table.
937 */
938 while (dp->i_d.di_size > mp->m_dirblksize) {
939 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
940 if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
941 mp->m_dirblksize - (uint)sizeof(block->hdr)) {
942 if ((error =
943 xfs_dir2_leaf_trim_data(args, lbp,
944 (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
945 goto out;
946 } else {
947 error = 0;
948 goto out;
949 }
950 }
951 /*
952 * Read the data block if we don't already have it; give up if it fails.
953 */
954 if (dbp == NULL &&
955 (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
956 XFS_DATA_FORK))) {
957 goto out;
958 }
959 block = dbp->data;
960 ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
961 /*
962 * Size of the "leaf" area in the block.
963 */
964 size = (uint)sizeof(block->tail) +
965 (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
966 /*
967 * Look at the last data entry.
968 */
969 tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
970 dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
971 /*
972 * If it's not free or is too short we can't do it.
973 */
974 if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
975 error = 0;
976 goto out;
977 }
978 /*
979 * Start converting it to block form.
980 */
981 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
982 needlog = 1;
983 needscan = 0;
984 /*
985 * Use up the space at the end of the block (blp/btp).
986 */
987 xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
988 &needlog, &needscan);
989 /*
990 * Initialize the block tail.
991 */
992 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
993 INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
994 btp->stale = 0;
995 xfs_dir2_block_log_tail(tp, dbp);
996 /*
997 * Initialize the block leaf area. We compact out stale entries.
998 */
999 lep = XFS_DIR2_BLOCK_LEAF_P(btp);
1000 for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
1001 if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
1002 continue;
1003 lep[to++] = leaf->ents[from];
1004 }
1005 ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
1006 xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
1007 /*
1008 * Scan the bestfree if we need it and log the data block header.
1009 */
1010 if (needscan)
1011 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
1012 NULL);
1013 if (needlog)
1014 xfs_dir2_data_log_header(tp, dbp);
1015 /*
1016 * Pitch the old leaf block.
1017 */
1018 error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
1019 lbp = NULL;
1020 if (error) {
1021 goto out;
1022 }
1023 /*
1024 * Now see if the resulting block can be shrunken to shortform.
1025 */
1026 if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
1027 XFS_IFORK_DSIZE(dp)) {
1028 error = 0;
1029 goto out;
1030 }
1031 return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
1032out:
1033 if (lbp)
1034 xfs_da_buf_done(lbp);
1035 if (dbp)
1036 xfs_da_buf_done(dbp);
1037 return error;
1038}
1039
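/*
 * The trim test near the top of the routine above, isolated
 * (hypothetical helper): a trailing data block is removable only when
 * its leaf "bests" entry shows the whole block free, i.e. the block
 * size minus the data header.
 */
#if 0	/* illustrative sketch only */
static int
example_data_block_is_empty(xfs_mount_t *mp, xfs_dir2_data_off_t best)
{
	return best == mp->m_dirblksize - (uint)sizeof(xfs_dir2_data_hdr_t);
}
#endif
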
1040/*
1041 * Convert the shortform directory to block form.
1042 */
1043int /* error */
1044xfs_dir2_sf_to_block(
1045 xfs_da_args_t *args) /* operation arguments */
1046{
1047 xfs_dir2_db_t blkno; /* dir-relative block # (0) */
1048 xfs_dir2_block_t *block; /* block structure */
1049 xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
1050 xfs_dabuf_t *bp; /* block buffer */
1051 xfs_dir2_block_tail_t *btp; /* block tail pointer */
1052 char *buf; /* sf buffer */
1053 int buf_len;
1054 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1055 xfs_inode_t *dp; /* incore directory inode */
1056 int dummy; /* trash */
1057 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
1058 int endoffset; /* end of data objects */
1059 int error; /* error return value */
1060 int i; /* index */
1061 xfs_mount_t *mp; /* filesystem mount point */
1062 int needlog; /* need to log block header */
1063 int needscan; /* need to scan block freespc */
1064 int newoffset; /* offset from current entry */
1065 int offset; /* target block offset */
1066 xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
1067 xfs_dir2_sf_t *sfp; /* shortform structure */
1068 xfs_dir2_data_off_t *tagp; /* end of data entry */
1069 xfs_trans_t *tp; /* transaction pointer */
1070
1071 xfs_dir2_trace_args("sf_to_block", args);
1072 dp = args->dp;
1073 tp = args->trans;
1074 mp = dp->i_mount;
1075 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
1076 /*
1077 * Bomb out if the shortform directory is way too short.
1078 */
1079 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1080 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1081 return XFS_ERROR(EIO);
1082 }
1083 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1084 ASSERT(dp->i_df.if_u1.if_data != NULL);
1085 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1086 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
1087 /*
1088 * Copy the directory into a temporary heap buffer.
1089 * Then pitch the incore inode data so we can make extents.
1090 */
1091
1092 buf_len = dp->i_df.if_bytes;
1093 buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
1094
1095 memcpy(buf, sfp, dp->i_df.if_bytes);
1096 xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
1097 dp->i_d.di_size = 0;
1098 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1099 /*
1100 * Reset pointer - old sfp is gone.
1101 */
1102 sfp = (xfs_dir2_sf_t *)buf;
1103 /*
1104 * Add block 0 to the inode.
1105 */
1106 error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
1107 if (error) {
1108 kmem_free(buf, buf_len);
1109 return error;
1110 }
1111 /*
1112 * Initialize the data block.
1113 */
1114 error = xfs_dir2_data_init(args, blkno, &bp);
1115 if (error) {
1116 kmem_free(buf, buf_len);
1117 return error;
1118 }
1119 block = bp->data;
1120 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
1121 /*
1122 * Compute size of block "tail" area.
1123 */
1124 i = (uint)sizeof(*btp) +
1125 (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
1126 /*
1127 * The whole thing is initialized to free by the init routine.
1128 * Say we're using the leaf and tail area.
1129 */
1130 dup = (xfs_dir2_data_unused_t *)block->u;
1131 needlog = needscan = 0;
1132 xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
1133 &needscan);
1134 ASSERT(needscan == 0);
1135 /*
1136 * Fill in the tail.
1137 */
1138 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
1139 INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2); /* ., .. */
1140 btp->stale = 0;
1141 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
1142 endoffset = (uint)((char *)blp - (char *)block);
1143 /*
1144 * Remove the freespace, we'll manage it.
1145 */
1146 xfs_dir2_data_use_free(tp, bp, dup,
1147 (xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
1148 INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan);
1149 /*
1150 * Create entry for .
1151 */
1152 dep = (xfs_dir2_data_entry_t *)
1153 ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
1154 INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
1155 dep->namelen = 1;
1156 dep->name[0] = '.';
1157 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1158 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1159 xfs_dir2_data_log_entry(tp, bp, dep);
1160 INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot);
1161 INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
1162 /*
1163 * Create entry for ..
1164 */
1165 dep = (xfs_dir2_data_entry_t *)
1166 ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
1167 INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
1168 dep->namelen = 2;
1169 dep->name[0] = dep->name[1] = '.';
1170 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1171 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1172 xfs_dir2_data_log_entry(tp, bp, dep);
1173 INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot);
1174 INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
1175 offset = XFS_DIR2_DATA_FIRST_OFFSET;
1176 /*
1177 * Loop over existing entries, stuff them in.
1178 */
1179 if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT))
1180 sfep = NULL;
1181 else
1182 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
1183 /*
1184 * Need to preserve the existing offset values in the sf directory.
1185 * Insert holes (unused entries) where necessary.
1186 */
1187 while (offset < endoffset) {
1188 /*
1189 * sfep is null when we reach the end of the list.
1190 */
1191 if (sfep == NULL)
1192 newoffset = endoffset;
1193 else
1194 newoffset = XFS_DIR2_SF_GET_OFFSET(sfep);
1195 /*
1196 * There should be a hole here, make one.
1197 */
1198 if (offset < newoffset) {
1199 dup = (xfs_dir2_data_unused_t *)
1200 ((char *)block + offset);
1201 INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
1202 INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
1203 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
1204 (xfs_dir2_data_off_t)
1205 ((char *)dup - (char *)block));
1206 xfs_dir2_data_log_unused(tp, bp, dup);
1207 (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
1208 dup, &dummy);
1209 offset += INT_GET(dup->length, ARCH_CONVERT);
1210 continue;
1211 }
1212 /*
1213 * Copy a real entry.
1214 */
1215 dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
1216 INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER(sfp,
1217 XFS_DIR2_SF_INUMBERP(sfep)));
1218 dep->namelen = sfep->namelen;
1219 memcpy(dep->name, sfep->name, dep->namelen);
1220 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1221 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
1222 xfs_dir2_data_log_entry(tp, bp, dep);
1223 INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen));
1224 INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp,
1225 (char *)dep - (char *)block));
1226 offset = (int)((char *)(tagp + 1) - (char *)block);
1227 if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT))
1228 sfep = NULL;
1229 else
1230 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
1231 }
1232 /* Done with the temporary buffer */
1233 kmem_free(buf, buf_len);
1234 /*
1235 * Sort the leaf entries by hash value.
1236 */
1237 qsort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort);
1238 /*
1239 * Log the leaf entry area and tail.
1240 * Already logged the header in data_init, ignore needlog.
1241 */
1242 ASSERT(needscan == 0);
1243 xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
1244 xfs_dir2_block_log_tail(tp, bp);
1245 xfs_dir2_data_check(dp, bp);
1246 xfs_da_buf_done(bp);
1247 return 0;
1248}
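/*
 * A note on the hole-filling loop above: shortform entries remember the
 * data-block offsets they had, and getdents cookies encode offsets, so
 * the conversion re-creates unused entries in the gaps rather than
 * repacking the live entries.  The two fields that make a gap parse as
 * an unused entry, in miniature (illustrative only, assumes offset <
 * newoffset):
 */
#if 0	/* illustrative sketch only */
	dup = (xfs_dir2_data_unused_t *)((char *)block + offset);
	INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
	INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
#endif
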
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
new file mode 100644
index 000000000000..5a578b84e246
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.h
@@ -0,0 +1,126 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_BLOCK_H__
33#define __XFS_DIR2_BLOCK_H__
34
35/*
36 * xfs_dir2_block.h
37 * Directory version 2, single block format structures
38 */
39
40struct uio;
41struct xfs_dabuf;
42struct xfs_da_args;
43struct xfs_dir2_data_hdr;
44struct xfs_dir2_leaf_entry;
45struct xfs_inode;
46struct xfs_mount;
47struct xfs_trans;
48
49/*
50 * The single block format is as follows:
51 * xfs_dir2_data_hdr_t structure
52 * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
53 * xfs_dir2_leaf_entry_t structures
54 * xfs_dir2_block_tail_t structure
55 */
56
57#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */
58
59typedef struct xfs_dir2_block_tail {
60 __uint32_t count; /* count of leaf entries */
61 __uint32_t stale; /* count of stale lf entries */
62} xfs_dir2_block_tail_t;
63
64/*
65 * Generic single-block structure, for xfs_db.
66 */
67typedef struct xfs_dir2_block {
68 xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */
69 xfs_dir2_data_union_t u[1];
70 xfs_dir2_leaf_entry_t leaf[1];
71 xfs_dir2_block_tail_t tail;
72} xfs_dir2_block_t;
73
74/*
75 * Pointer to the block tail embedded in a data block (1-block format)
76 */
77#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
78xfs_dir2_block_tail_t *
79xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block);
80#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
81#else
82#define XFS_DIR2_BLOCK_TAIL_P(mp,block) \
83 (((xfs_dir2_block_tail_t *)((char *)(block) + (mp)->m_dirblksize)) - 1)
84#endif
85
86/*
87 * Pointer to the leaf entries embedded in a data block (1-block format)
88 */
89#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
90struct xfs_dir2_leaf_entry *xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp);
91#define XFS_DIR2_BLOCK_LEAF_P(btp) \
92 xfs_dir2_block_leaf_p(btp)
93#else
94#define XFS_DIR2_BLOCK_LEAF_P(btp) \
95 (((struct xfs_dir2_leaf_entry *)(btp)) - INT_GET((btp)->count, ARCH_CONVERT))
96#endif
97
98/*
99 * Function declarations.
100 */
101
102extern int
103 xfs_dir2_block_addname(struct xfs_da_args *args);
104
105extern int
106 xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
107 struct uio *uio, int *eofp, struct xfs_dirent *dbp,
108 xfs_dir2_put_t put);
109
110extern int
111 xfs_dir2_block_lookup(struct xfs_da_args *args);
112
113extern int
114 xfs_dir2_block_removename(struct xfs_da_args *args);
115
116extern int
117 xfs_dir2_block_replace(struct xfs_da_args *args);
118
119extern int
120 xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_dabuf *lbp,
121 struct xfs_dabuf *dbp);
122
123extern int
124 xfs_dir2_sf_to_block(struct xfs_da_args *args);
125
126#endif /* __XFS_DIR2_BLOCK_H__ */
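A quick aside on the two pointer macros above: both work backward from the end of the directory block, since the tail occupies the block's last bytes and the leaf entry array ends where the tail begins. Below is an editorial sketch of that arithmetic with stand-in types, an assumed 4096-byte block size, and the endianness handling (ARCH_CONVERT) omitted; it is not code from the tree.

#include <assert.h>
#include <stdint.h>

typedef struct { uint32_t count, stale; } tail_t;	/* stands in for xfs_dir2_block_tail_t */
typedef struct { uint32_t hashval, address; } lent_t;	/* stands in for xfs_dir2_leaf_entry_t */

int main(void)
{
	static char block[4096];	/* assumed m_dirblksize */

	/* XFS_DIR2_BLOCK_TAIL_P: the last tail_t in the block */
	tail_t *btp = (tail_t *)(block + sizeof(block)) - 1;
	btp->count = 5;
	/* XFS_DIR2_BLOCK_LEAF_P: count entries immediately before the tail */
	lent_t *lep = (lent_t *)btp - btp->count;
	assert((char *)(lep + btp->count) == (char *)btp);
	return 0;
}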
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..db9887a107de
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.c
@@ -0,0 +1,855 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_data.c
35 * Core data block handling routines for XFS V2 directories.
36 * See xfs_dir2_data.h for data structures.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59#include "xfs_dir2_data.h"
60#include "xfs_dir2_leaf.h"
61#include "xfs_dir2_block.h"
62#include "xfs_error.h"
63
64#ifdef DEBUG
65/*
66 * Check the consistency of the data block.
67 * The input can also be a block-format directory.
68 * Pop an assert if we find anything bad.
69 */
70void
71xfs_dir2_data_check(
72 xfs_inode_t *dp, /* incore inode pointer */
73 xfs_dabuf_t *bp) /* data block's buffer */
74{
75 xfs_dir2_dataptr_t addr; /* addr for leaf lookup */
76 xfs_dir2_data_free_t *bf; /* bestfree table */
77 xfs_dir2_block_tail_t *btp=NULL; /* block tail */
78 int count; /* count of entries found */
79 xfs_dir2_data_t *d; /* data block pointer */
80 xfs_dir2_data_entry_t *dep; /* data entry */
81 xfs_dir2_data_free_t *dfp; /* bestfree entry */
82 xfs_dir2_data_unused_t *dup; /* unused entry */
83 char *endp; /* end of useful data */
84 int freeseen; /* mask of bestfrees seen */
85 xfs_dahash_t hash; /* hash of current name */
86 int i; /* leaf index */
87 int lastfree; /* last entry was unused */
88 xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
89 xfs_mount_t *mp; /* filesystem mount point */
90 char *p; /* current data position */
91 int stale; /* count of stale leaves */
92
93 mp = dp->i_mount;
94 d = bp->data;
95 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
96 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
97 bf = d->hdr.bestfree;
98 p = (char *)d->u;
99 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
100 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
101 lep = XFS_DIR2_BLOCK_LEAF_P(btp);
102 endp = (char *)lep;
103 } else
104 endp = (char *)d + mp->m_dirblksize;
105 count = lastfree = freeseen = 0;
106 /*
107 * Account for zero bestfree entries.
108 */
109 if (!bf[0].length) {
110 ASSERT(!bf[0].offset);
111 freeseen |= 1 << 0;
112 }
113 if (!bf[1].length) {
114 ASSERT(!bf[1].offset);
115 freeseen |= 1 << 1;
116 }
117 if (!bf[2].length) {
118 ASSERT(!bf[2].offset);
119 freeseen |= 1 << 2;
120 }
121 ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT));
122 ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT));
123 /*
124 * Loop over the data/unused entries.
125 */
126 while (p < endp) {
127 dup = (xfs_dir2_data_unused_t *)p;
128 /*
129 * If it's unused, look for the space in the bestfree table.
130 * If we find it, account for that, else make sure it
131 * doesn't need to be there.
132 */
133 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
134 ASSERT(lastfree == 0);
135 ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT) ==
136 (char *)dup - (char *)d);
137 dfp = xfs_dir2_data_freefind(d, dup);
138 if (dfp) {
139 i = (int)(dfp - bf);
140 ASSERT((freeseen & (1 << i)) == 0);
141 freeseen |= 1 << i;
142 } else
143 ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT));
144 p += INT_GET(dup->length, ARCH_CONVERT);
145 lastfree = 1;
146 continue;
147 }
148 /*
149 * It's a real entry. Validate the fields.
150 * If this is a block directory then make sure it's
151 * in the leaf section of the block.
152 * The linear search is crude but this is DEBUG code.
153 */
154 dep = (xfs_dir2_data_entry_t *)p;
155 ASSERT(dep->namelen != 0);
156 ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
157 ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) ==
158 (char *)dep - (char *)d);
159 count++;
160 lastfree = 0;
161 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
162 addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
163 (xfs_dir2_data_aoff_t)
164 ((char *)dep - (char *)d));
165 hash = xfs_da_hashname((char *)dep->name, dep->namelen);
166 for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
167 if (INT_GET(lep[i].address, ARCH_CONVERT) == addr &&
168 INT_GET(lep[i].hashval, ARCH_CONVERT) == hash)
169 break;
170 }
171 ASSERT(i < INT_GET(btp->count, ARCH_CONVERT));
172 }
173 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
174 }
175 /*
176 * Need to have seen all the entries and all the bestfree slots.
177 */
178 ASSERT(freeseen == 7);
179 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
180 for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
181 if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
182 stale++;
183 if (i > 0)
184 ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT));
185 }
186 ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT));
187 ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT));
188 }
189}
190#endif
191
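/*
 * [Editorial sketch, not part of the original file]  The freeseen logic
 * in xfs_dir2_data_check() above is a 3-bit presence mask over the
 * bestfree slots: a bit is pre-set for each empty slot, set again
 * (exactly once) when a free region matching that slot is found, and
 * must end up as 7.  A hypothetical user-space model:
 */
#if 0	/* illustrative only */
static int model_freeseen(const unsigned short len[3], const int found[3])
{
	int	i, freeseen = 0;

	for (i = 0; i < 3; i++)
		if (len[i] == 0)
			freeseen |= 1 << i;	/* empty slots need no match */
	for (i = 0; i < 3; i++)
		if (found[i]) {
			ASSERT((freeseen & (1 << i)) == 0);
			freeseen |= 1 << i;	/* each slot matched once */
		}
	return freeseen == 7;
}
#endif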
192/*
193 * Given a data block and an unused entry from that block,
194 * return the bestfree entry, if any, that corresponds to it.
195 */
196xfs_dir2_data_free_t *
197xfs_dir2_data_freefind(
198 xfs_dir2_data_t *d, /* data block */
199 xfs_dir2_data_unused_t *dup) /* data unused entry */
200{
201 xfs_dir2_data_free_t *dfp; /* bestfree entry */
202 xfs_dir2_data_aoff_t off; /* offset value needed */
203#if defined(DEBUG) && defined(__KERNEL__)
204 int matched; /* matched the value */
205 int seenzero; /* saw a 0 bestfree entry */
206#endif
207
208 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
209#if defined(DEBUG) && defined(__KERNEL__)
210 /*
211 * Validate some consistency in the bestfree table.
212 * Check order, non-overlapping entries, and if we find the
213	 * one we're looking for, it has to be exact.
214 */
215 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
216 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
217 for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
218 dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
219 dfp++) {
220 if (!dfp->offset) {
221 ASSERT(!dfp->length);
222 seenzero = 1;
223 continue;
224 }
225 ASSERT(seenzero == 0);
226 if (INT_GET(dfp->offset, ARCH_CONVERT) == off) {
227 matched = 1;
228 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT));
229 } else if (off < INT_GET(dfp->offset, ARCH_CONVERT))
230 ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT));
231 else
232 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off);
233 ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT));
234 if (dfp > &d->hdr.bestfree[0])
235 ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT));
236 }
237#endif
238 /*
239 * If this is smaller than the smallest bestfree entry,
240 * it can't be there since they're sorted.
241 */
242 if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT))
243 return NULL;
244 /*
245 * Look at the three bestfree entries for our guy.
246 */
247 for (dfp = &d->hdr.bestfree[0];
248 dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
249 dfp++) {
250 if (!dfp->offset)
251 return NULL;
252 if (INT_GET(dfp->offset, ARCH_CONVERT) == off)
253 return dfp;
254 }
255 /*
256 * Didn't find it. This only happens if there are duplicate lengths.
257 */
258 return NULL;
259}
260
261/*
262 * Insert an unused-space entry into the bestfree table.
263 */
264xfs_dir2_data_free_t * /* entry inserted */
265xfs_dir2_data_freeinsert(
266 xfs_dir2_data_t *d, /* data block pointer */
267 xfs_dir2_data_unused_t *dup, /* unused space */
268 int *loghead) /* log the data header (out) */
269{
270 xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
271 xfs_dir2_data_free_t new; /* new bestfree entry */
272
273#ifdef __KERNEL__
274 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
275 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
276#endif
277 dfp = d->hdr.bestfree;
278 INT_COPY(new.length, dup->length, ARCH_CONVERT);
279 INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d));
280 /*
281 * Insert at position 0, 1, or 2; or not at all.
282 */
283 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) {
284 dfp[2] = dfp[1];
285 dfp[1] = dfp[0];
286 dfp[0] = new;
287 *loghead = 1;
288 return &dfp[0];
289 }
290 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) {
291 dfp[2] = dfp[1];
292 dfp[1] = new;
293 *loghead = 1;
294 return &dfp[1];
295 }
296 if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) {
297 dfp[2] = new;
298 *loghead = 1;
299 return &dfp[2];
300 }
301 return NULL;
302}
303
304/*
305 * Remove a bestfree entry from the table.
306 */
307void
308xfs_dir2_data_freeremove(
309 xfs_dir2_data_t *d, /* data block pointer */
310 xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
311 int *loghead) /* out: log data header */
312{
313#ifdef __KERNEL__
314 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
315 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
316#endif
317 /*
318 * It's the first entry, slide the next 2 up.
319 */
320 if (dfp == &d->hdr.bestfree[0]) {
321 d->hdr.bestfree[0] = d->hdr.bestfree[1];
322 d->hdr.bestfree[1] = d->hdr.bestfree[2];
323 }
324 /*
325 * It's the second entry, slide the 3rd entry up.
326 */
327 else if (dfp == &d->hdr.bestfree[1])
328 d->hdr.bestfree[1] = d->hdr.bestfree[2];
329 /*
330 * Must be the last entry.
331 */
332 else
333 ASSERT(dfp == &d->hdr.bestfree[2]);
334 /*
335 * Clear the 3rd entry, must be zero now.
336 */
337 d->hdr.bestfree[2].length = 0;
338 d->hdr.bestfree[2].offset = 0;
339 *loghead = 1;
340}
341
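/*
 * [Editorial sketch, not part of the original file]  Taken together,
 * freeinsert and freeremove above keep bestfree[] sorted by descending
 * length, with any empty slots at the end.  Modeled on plain integers:
 */
#if 0	/* illustrative only; lengths stand in for bestfree entries */
static void model_freeinsert(unsigned short best[3], unsigned short len)
{
	if (len > best[0]) {
		best[2] = best[1];
		best[1] = best[0];
		best[0] = len;
	} else if (len > best[1]) {
		best[2] = best[1];
		best[1] = len;
	} else if (len > best[2])
		best[2] = len;
	/* else: too small for the table, not tracked (returns NULL above) */
}

static void model_freeremove(unsigned short best[3], int i)
{
	for (; i < 2; i++)
		best[i] = best[i + 1];	/* slide later entries up */
	best[2] = 0;			/* third slot is now empty */
}
#endif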
342/*
343 * Given a data block, reconstruct its bestfree map.
344 */
345void
346xfs_dir2_data_freescan(
347 xfs_mount_t *mp, /* filesystem mount point */
348 xfs_dir2_data_t *d, /* data block pointer */
349 int *loghead, /* out: log data header */
350 char *aendp) /* in: caller's endp */
351{
352 xfs_dir2_block_tail_t *btp; /* block tail */
353 xfs_dir2_data_entry_t *dep; /* active data entry */
354 xfs_dir2_data_unused_t *dup; /* unused data entry */
355 char *endp; /* end of block's data */
356 char *p; /* current entry pointer */
357
358#ifdef __KERNEL__
359 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
360 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
361#endif
362 /*
363 * Start by clearing the table.
364 */
365 memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree));
366 *loghead = 1;
367 /*
368 * Set up pointers.
369 */
370 p = (char *)d->u;
371 if (aendp)
372 endp = aendp;
373 else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
374 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
375 endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
376 } else
377 endp = (char *)d + mp->m_dirblksize;
378 /*
379 * Loop over the block's entries.
380 */
381 while (p < endp) {
382 dup = (xfs_dir2_data_unused_t *)p;
383 /*
384 * If it's a free entry, insert it.
385 */
386 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
387 ASSERT((char *)dup - (char *)d ==
388 INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
389 xfs_dir2_data_freeinsert(d, dup, loghead);
390 p += INT_GET(dup->length, ARCH_CONVERT);
391 }
392 /*
393 * For active entries, check their tags and skip them.
394 */
395 else {
396 dep = (xfs_dir2_data_entry_t *)p;
397 ASSERT((char *)dep - (char *)d ==
398 INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT));
399 p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
400 }
401 }
402}
403
404/*
405 * Initialize a data block at the given block number in the directory.
406 * Give back the buffer for the created block.
407 */
408int /* error */
409xfs_dir2_data_init(
410 xfs_da_args_t *args, /* directory operation args */
411 xfs_dir2_db_t blkno, /* logical dir block number */
412 xfs_dabuf_t **bpp) /* output block buffer */
413{
414 xfs_dabuf_t *bp; /* block buffer */
415 xfs_dir2_data_t *d; /* pointer to block */
416 xfs_inode_t *dp; /* incore directory inode */
417 xfs_dir2_data_unused_t *dup; /* unused entry pointer */
418 int error; /* error return value */
419 int i; /* bestfree index */
420 xfs_mount_t *mp; /* filesystem mount point */
421 xfs_trans_t *tp; /* transaction pointer */
422 int t; /* temp */
423
424 dp = args->dp;
425 mp = dp->i_mount;
426 tp = args->trans;
427 /*
428 * Get the buffer set up for the block.
429 */
430 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp,
431 XFS_DATA_FORK);
432 if (error) {
433 return error;
434 }
435 ASSERT(bp != NULL);
436 /*
437 * Initialize the header.
438 */
439 d = bp->data;
440 INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
441 INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr));
442 for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
443 d->hdr.bestfree[i].length = 0;
444 d->hdr.bestfree[i].offset = 0;
445 }
446 /*
447 * Set up an unused entry for the block's body.
448 */
449 dup = &d->u[0].unused;
450 INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
451
452	t = mp->m_dirblksize - (uint)sizeof(d->hdr);
453 INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t);
454 INT_SET(dup->length, ARCH_CONVERT, t);
455 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT,
456 (xfs_dir2_data_off_t)((char *)dup - (char *)d));
457 /*
458 * Log it and return it.
459 */
460 xfs_dir2_data_log_header(tp, bp);
461 xfs_dir2_data_log_unused(tp, bp, dup);
462 *bpp = bp;
463 return 0;
464}
465
466/*
467 * Log an active data entry from the block.
468 */
469void
470xfs_dir2_data_log_entry(
471 xfs_trans_t *tp, /* transaction pointer */
472 xfs_dabuf_t *bp, /* block buffer */
473 xfs_dir2_data_entry_t *dep) /* data entry pointer */
474{
475 xfs_dir2_data_t *d; /* data block pointer */
476
477 d = bp->data;
478 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
479 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
480 xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
481 (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) -
482 (char *)d - 1));
483}
484
485/*
486 * Log a data block header.
487 */
488void
489xfs_dir2_data_log_header(
490 xfs_trans_t *tp, /* transaction pointer */
491 xfs_dabuf_t *bp) /* block buffer */
492{
493 xfs_dir2_data_t *d; /* data block pointer */
494
495 d = bp->data;
496 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
497 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
498 xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
499 (uint)(sizeof(d->hdr) - 1));
500}
501
502/*
503 * Log a data unused entry.
504 */
505void
506xfs_dir2_data_log_unused(
507 xfs_trans_t *tp, /* transaction pointer */
508 xfs_dabuf_t *bp, /* block buffer */
509 xfs_dir2_data_unused_t *dup) /* data unused pointer */
510{
511 xfs_dir2_data_t *d; /* data block pointer */
512
513 d = bp->data;
514 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
515 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
516 /*
517 * Log the first part of the unused entry.
518 */
519 xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
520 (uint)((char *)&dup->length + sizeof(dup->length) -
521 1 - (char *)d));
522 /*
523 * Log the end (tag) of the unused entry.
524 */
525 xfs_da_log_buf(tp, bp,
526 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d),
527 (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d +
528 sizeof(xfs_dir2_data_off_t) - 1));
529}
530
531/*
532 * Make a byte range in the data block unused.
533 * Its current contents are unimportant.
534 */
535void
536xfs_dir2_data_make_free(
537 xfs_trans_t *tp, /* transaction pointer */
538 xfs_dabuf_t *bp, /* block buffer */
539 xfs_dir2_data_aoff_t offset, /* starting byte offset */
540 xfs_dir2_data_aoff_t len, /* length in bytes */
541 int *needlogp, /* out: log header */
542 int *needscanp) /* out: regen bestfree */
543{
544 xfs_dir2_data_t *d; /* data block pointer */
545 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
546 char *endptr; /* end of data area */
547 xfs_mount_t *mp; /* filesystem mount point */
548 int needscan; /* need to regen bestfree */
549 xfs_dir2_data_unused_t *newdup; /* new unused entry */
550 xfs_dir2_data_unused_t *postdup; /* unused entry after us */
551 xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
552
553 mp = tp->t_mountp;
554 d = bp->data;
555 /*
556 * Figure out where the end of the data area is.
557 */
558 if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
559 endptr = (char *)d + mp->m_dirblksize;
560 else {
561 xfs_dir2_block_tail_t *btp; /* block tail */
562
563 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
564 btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
565 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
566 }
567 /*
568 * If this isn't the start of the block, then back up to
569 * the previous entry and see if it's free.
570 */
571 if (offset > sizeof(d->hdr)) {
572 xfs_dir2_data_off_t *tagp; /* tag just before us */
573
574 tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1;
575 prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT));
576 if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
577 prevdup = NULL;
578 } else
579 prevdup = NULL;
580 /*
581 * If this isn't the end of the block, see if the entry after
582 * us is free.
583 */
584 if ((char *)d + offset + len < endptr) {
585 postdup =
586 (xfs_dir2_data_unused_t *)((char *)d + offset + len);
587 if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
588 postdup = NULL;
589 } else
590 postdup = NULL;
591 ASSERT(*needscanp == 0);
592 needscan = 0;
593 /*
594 * Previous and following entries are both free,
595 * merge everything into a single free entry.
596 */
597 if (prevdup && postdup) {
598 xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
599
600 /*
601 * See if prevdup and/or postdup are in bestfree table.
602 */
603 dfp = xfs_dir2_data_freefind(d, prevdup);
604 dfp2 = xfs_dir2_data_freefind(d, postdup);
605 /*
606		 * We need a rescan unless there are exactly 2 free entries,
607		 * namely our two. In that case we know exactly what's in
608		 * the table; otherwise the third bestfree slot is in use,
609		 * so there may be other free regions we aren't tracking.
610 */
611 needscan = d->hdr.bestfree[2].length;
612 /*
613 * Fix up the new big freespace.
614 */
615 INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
616 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
617 (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
618 xfs_dir2_data_log_unused(tp, bp, prevdup);
619 if (!needscan) {
620 /*
621 * Has to be the case that entries 0 and 1 are
622 * dfp and dfp2 (don't know which is which), and
623 * entry 2 is empty.
624			 * Remove entry 1 first, then entry 0.
625 */
626 ASSERT(dfp && dfp2);
627 if (dfp == &d->hdr.bestfree[1]) {
628 dfp = &d->hdr.bestfree[0];
629 ASSERT(dfp2 == dfp);
630 dfp2 = &d->hdr.bestfree[1];
631 }
632 xfs_dir2_data_freeremove(d, dfp2, needlogp);
633 xfs_dir2_data_freeremove(d, dfp, needlogp);
634 /*
635 * Now insert the new entry.
636 */
637 dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
638 ASSERT(dfp == &d->hdr.bestfree[0]);
639 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT));
640 ASSERT(!dfp[1].length);
641 ASSERT(!dfp[2].length);
642 }
643 }
644 /*
645 * The entry before us is free, merge with it.
646 */
647 else if (prevdup) {
648 dfp = xfs_dir2_data_freefind(d, prevdup);
649 INT_MOD(prevdup->length, ARCH_CONVERT, len);
650 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(prevdup), ARCH_CONVERT,
651 (xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
652 xfs_dir2_data_log_unused(tp, bp, prevdup);
653 /*
654 * If the previous entry was in the table, the new entry
655 * is longer, so it will be in the table too. Remove
656 * the old one and add the new one.
657 */
658 if (dfp) {
659 xfs_dir2_data_freeremove(d, dfp, needlogp);
660 (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
661 }
662 /*
663 * Otherwise we need a scan if the new entry is big enough.
664 */
665 else
666 needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
667 }
668 /*
669 * The following entry is free, merge with it.
670 */
671 else if (postdup) {
672 dfp = xfs_dir2_data_freefind(d, postdup);
673 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
674 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
675 INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
676 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
677 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
678 xfs_dir2_data_log_unused(tp, bp, newdup);
679 /*
680 * If the following entry was in the table, the new entry
681 * is longer, so it will be in the table too. Remove
682 * the old one and add the new one.
683 */
684 if (dfp) {
685 xfs_dir2_data_freeremove(d, dfp, needlogp);
686 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
687 }
688 /*
689 * Otherwise we need a scan if the new entry is big enough.
690 */
691 else
692 needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
693 }
694 /*
695 * Neither neighbor is free. Make a new entry.
696 */
697 else {
698 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
699 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
700 INT_SET(newdup->length, ARCH_CONVERT, len);
701 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
702 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
703 xfs_dir2_data_log_unused(tp, bp, newdup);
704 (void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
705 }
706 *needscanp = needscan;
707}
708
709/*
710 * Take a byte range out of an existing unused space and make it un-free.
711 */
712void
713xfs_dir2_data_use_free(
714 xfs_trans_t *tp, /* transaction pointer */
715 xfs_dabuf_t *bp, /* data block buffer */
716 xfs_dir2_data_unused_t *dup, /* unused entry */
717 xfs_dir2_data_aoff_t offset, /* starting offset to use */
718 xfs_dir2_data_aoff_t len, /* length to use */
719 int *needlogp, /* out: need to log header */
720 int *needscanp) /* out: need regen bestfree */
721{
722 xfs_dir2_data_t *d; /* data block */
723 xfs_dir2_data_free_t *dfp; /* bestfree pointer */
724 int matchback; /* matches end of freespace */
725 int matchfront; /* matches start of freespace */
726 int needscan; /* need to regen bestfree */
727 xfs_dir2_data_unused_t *newdup; /* new unused entry */
728 xfs_dir2_data_unused_t *newdup2; /* another new unused entry */
729 int oldlen; /* old unused entry's length */
730
731 d = bp->data;
732 ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
733 INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
734 ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG);
735 ASSERT(offset >= (char *)dup - (char *)d);
736 ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d);
737 ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P(dup), ARCH_CONVERT));
738 /*
739 * Look up the entry in the bestfree table.
740 */
741 dfp = xfs_dir2_data_freefind(d, dup);
742 oldlen = INT_GET(dup->length, ARCH_CONVERT);
743 ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT));
744 /*
745 * Check for alignment with front and back of the entry.
746 */
747 matchfront = (char *)dup - (char *)d == offset;
748 matchback = (char *)dup + oldlen - (char *)d == offset + len;
749 ASSERT(*needscanp == 0);
750 needscan = 0;
751 /*
752 * If we matched it exactly we just need to get rid of it from
753 * the bestfree table.
754 */
755 if (matchfront && matchback) {
756 if (dfp) {
757 needscan = d->hdr.bestfree[2].offset;
758 if (!needscan)
759 xfs_dir2_data_freeremove(d, dfp, needlogp);
760 }
761 }
762 /*
763 * We match the first part of the entry.
764 * Make a new entry with the remaining freespace.
765 */
766 else if (matchfront) {
767 newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
768 INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
769 INT_SET(newdup->length, ARCH_CONVERT, oldlen - len);
770 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
771 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
772 xfs_dir2_data_log_unused(tp, bp, newdup);
773 /*
774 * If it was in the table, remove it and add the new one.
775 */
776 if (dfp) {
777 xfs_dir2_data_freeremove(d, dfp, needlogp);
778 dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
779 ASSERT(dfp != NULL);
780 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
781 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
782 /*
783 * If we got inserted at the last slot,
784			 * we can't tell whether a better candidate
785			 * existed for that slot, so rescan.
786 */
787 needscan = dfp == &d->hdr.bestfree[2];
788 }
789 }
790 /*
791 * We match the last part of the entry.
792 * Trim the allocated space off the tail of the entry.
793 */
794 else if (matchback) {
795 newdup = dup;
796 INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
797 (((char *)d + offset) - (char *)newdup));
798 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
799 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
800 xfs_dir2_data_log_unused(tp, bp, newdup);
801 /*
802 * If it was in the table, remove it and add the new one.
803 */
804 if (dfp) {
805 xfs_dir2_data_freeremove(d, dfp, needlogp);
806 dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
807 ASSERT(dfp != NULL);
808 ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
809 ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
810 /*
811 * If we got inserted at the last slot,
812			 * we can't tell whether a better candidate
813			 * existed for that slot, so rescan.
814 */
815 needscan = dfp == &d->hdr.bestfree[2];
816 }
817 }
818 /*
819 * Poking out the middle of an entry.
820 * Make two new entries.
821 */
822 else {
823 newdup = dup;
824 INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
825 (((char *)d + offset) - (char *)newdup));
826 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup), ARCH_CONVERT,
827 (xfs_dir2_data_off_t)((char *)newdup - (char *)d));
828 xfs_dir2_data_log_unused(tp, bp, newdup);
829 newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
830 INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
831 INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT));
832 INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P(newdup2), ARCH_CONVERT,
833 (xfs_dir2_data_off_t)((char *)newdup2 - (char *)d));
834 xfs_dir2_data_log_unused(tp, bp, newdup2);
835 /*
836		 * If the old entry was in the table, we need a rescan
837		 * whenever the 3rd slot was valid, since both new
838		 * entries are smaller than the old one.
839		 * If we don't need to scan, that means there were 1 or 2
840		 * entries in the table, and removing the old and adding
841		 * the 2 new will work.
842 */
843 if (dfp) {
844 needscan = d->hdr.bestfree[2].length;
845 if (!needscan) {
846 xfs_dir2_data_freeremove(d, dfp, needlogp);
847 (void)xfs_dir2_data_freeinsert(d, newdup,
848 needlogp);
849 (void)xfs_dir2_data_freeinsert(d, newdup2,
850 needlogp);
851 }
852 }
853 }
854 *needscanp = needscan;
855}
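The use_free routine above reduces to a four-way case split on how the carved-out byte range lines up with the free region that contains it: exact match (the free entry disappears), front match (the remainder keeps the tail), back match (the remainder keeps the front), or a middle cut (two remainders, hence newdup and newdup2). A standalone sketch of just that case analysis, on plain offsets rather than the on-disk structures; the names here are illustrative, not from the tree:

#include <stdio.h>

/* A free region [fo, fo+flen) loses the sub-range [o, o+len). */
static void use_free(unsigned fo, unsigned flen, unsigned o, unsigned len)
{
	int matchfront = (o == fo);
	int matchback = (o + len == fo + flen);

	if (matchfront && matchback)
		printf("exact: free region fully consumed\n");
	else if (matchfront)
		printf("front: remainder [%u,%u)\n", o + len, fo + flen);
	else if (matchback)
		printf("back: remainder [%u,%u)\n", fo, o);
	else
		printf("middle: remainders [%u,%u) and [%u,%u)\n",
		       fo, o, o + len, fo + flen);
}

int main(void)
{
	use_free(16, 64, 16, 64);	/* exact */
	use_free(16, 64, 16, 24);	/* front */
	use_free(16, 64, 56, 24);	/* back */
	use_free(16, 64, 32, 16);	/* middle */
	return 0;
}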
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
new file mode 100644
index 000000000000..3f02294ccff0
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.h
@@ -0,0 +1,231 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_DATA_H__
33#define __XFS_DIR2_DATA_H__
34
35/*
36 * Directory format 2, data block structures.
37 */
38
39struct xfs_dabuf;
40struct xfs_da_args;
41struct xfs_inode;
42struct xfs_trans;
43
44/*
45 * Constants.
46 */
47#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */
48#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */
49#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG)
50#define XFS_DIR2_DATA_FREE_TAG 0xffff
51#define XFS_DIR2_DATA_FD_COUNT 3
52
53/*
54 * Directory address space divided into sections,
55 * spaces separated by 32GB.
56 */
57#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
58#define XFS_DIR2_DATA_SPACE 0
59#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
60#define XFS_DIR2_DATA_FIRSTDB(mp) \
61 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET)
62
63/*
64 * Offsets of . and .. in data space (always block 0)
65 */
66#define XFS_DIR2_DATA_DOT_OFFSET \
67 ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
68#define XFS_DIR2_DATA_DOTDOT_OFFSET \
69 (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1))
70#define XFS_DIR2_DATA_FIRST_OFFSET \
71 (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2))
72
73/*
74 * Structures.
75 */
76
77/*
78 * Describe a free area in the data block.
79 * The freespace will be formatted as a xfs_dir2_data_unused_t.
80 */
81typedef struct xfs_dir2_data_free {
82 xfs_dir2_data_off_t offset; /* start of freespace */
83 xfs_dir2_data_off_t length; /* length of freespace */
84} xfs_dir2_data_free_t;
85
86/*
87 * Header for the data blocks.
88 * Always at the beginning of a directory-sized block.
89 * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
90 */
91typedef struct xfs_dir2_data_hdr {
92 __uint32_t magic; /* XFS_DIR2_DATA_MAGIC */
93 /* or XFS_DIR2_BLOCK_MAGIC */
94 xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT];
95} xfs_dir2_data_hdr_t;
96
97/*
98 * Active entry in a data block. Aligned to 8 bytes.
99 * Tag appears as the last 2 bytes.
100 */
101typedef struct xfs_dir2_data_entry {
102 xfs_ino_t inumber; /* inode number */
103 __uint8_t namelen; /* name length */
104 __uint8_t name[1]; /* name bytes, no null */
105 /* variable offset */
106 xfs_dir2_data_off_t tag; /* starting offset of us */
107} xfs_dir2_data_entry_t;
108
109/*
110 * Unused entry in a data block. Aligned to 8 bytes.
111 * Tag appears as the last 2 bytes.
112 */
113typedef struct xfs_dir2_data_unused {
114 __uint16_t freetag; /* XFS_DIR2_DATA_FREE_TAG */
115 xfs_dir2_data_off_t length; /* total free length */
116 /* variable offset */
117 xfs_dir2_data_off_t tag; /* starting offset of us */
118} xfs_dir2_data_unused_t;
119
120typedef union {
121 xfs_dir2_data_entry_t entry;
122 xfs_dir2_data_unused_t unused;
123} xfs_dir2_data_union_t;
124
125/*
126 * Generic data block structure, for xfs_db.
127 */
128typedef struct xfs_dir2_data {
129 xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */
130 xfs_dir2_data_union_t u[1];
131} xfs_dir2_data_t;
132
133/*
134 * Macros.
135 */
136
137/*
138 * Size of a data entry.
139 */
140#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTSIZE)
141int xfs_dir2_data_entsize(int n);
142#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n)
143#else
144#define XFS_DIR2_DATA_ENTSIZE(n) \
145 ((int)(roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
146 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN)))
147#endif
148
149/*
150 * Pointer to an entry's tag word.
151 */
152#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
153xfs_dir2_data_off_t *xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep);
154#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep)
155#else
156#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) \
157 ((xfs_dir2_data_off_t *)\
158 ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \
159 (uint)sizeof(xfs_dir2_data_off_t)))
160#endif
161
162/*
163 * Pointer to a freespace's tag word.
164 */
165#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
166xfs_dir2_data_off_t *xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup);
167#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
168 xfs_dir2_data_unused_tag_p(dup)
169#else
170#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \
171 ((xfs_dir2_data_off_t *)\
172 ((char *)(dup) + INT_GET((dup)->length, ARCH_CONVERT) \
173 - (uint)sizeof(xfs_dir2_data_off_t)))
174#endif
175
176/*
177 * Function declarations.
178 */
179
180#ifdef DEBUG
181extern void
182 xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
183#else
184#define xfs_dir2_data_check(dp,bp)
185#endif
186
187extern xfs_dir2_data_free_t *
188 xfs_dir2_data_freefind(xfs_dir2_data_t *d,
189 xfs_dir2_data_unused_t *dup);
190
191extern xfs_dir2_data_free_t *
192 xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
193 xfs_dir2_data_unused_t *dup, int *loghead);
194
195extern void
196 xfs_dir2_data_freeremove(xfs_dir2_data_t *d,
197 xfs_dir2_data_free_t *dfp, int *loghead);
198
199extern void
200 xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
201 int *loghead, char *aendp);
202
203extern int
204 xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
205 struct xfs_dabuf **bpp);
206
207extern void
208 xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
209 xfs_dir2_data_entry_t *dep);
210
211extern void
212 xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
213
214extern void
215 xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
216 xfs_dir2_data_unused_t *dup);
217
218extern void
219 xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
220 xfs_dir2_data_aoff_t offset,
221 xfs_dir2_data_aoff_t len, int *needlogp,
222 int *needscanp);
223
224extern void
225 xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
226 xfs_dir2_data_unused_t *dup,
227 xfs_dir2_data_aoff_t offset,
228 xfs_dir2_data_aoff_t len, int *needlogp,
229 int *needscanp);
230
231#endif /* __XFS_DIR2_DATA_H__ */
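Working the entry-size and offset macros above through by hand: a data entry is the 8-byte inumber, a 1-byte namelen, the name bytes, and a 2-byte tag, all rounded up to 8-byte alignment, so XFS_DIR2_DATA_ENTSIZE(n) comes out to roundup(11 + n, 8). The data header is 4 (magic) + 3 * 4 (bestfree) = 16 bytes, which makes the DOT, DOTDOT, and FIRST offsets 16, 32, and 48. (Likewise XFS_DIR2_SPACE_SIZE is 1 << 35 bytes, i.e. 32GB.) A small editorial sketch confirming the arithmetic with local stand-ins, not the kernel macros:

#include <assert.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

/* ENTSIZE(n): 8 (inumber) + 1 (namelen) + n (name) + 2 (tag), 8-aligned */
static int entsize(int namelen)
{
	return ROUNDUP(8 + 1 + namelen + 2, 8);
}

int main(void)
{
	int hdr = 4 + 3 * 4;	/* magic + three 4-byte bestfree entries */

	assert(entsize(1) == 16);			/* "."  */
	assert(entsize(2) == 16);			/* ".." */
	assert(hdr == 16);				/* DOT offset */
	assert(hdr + entsize(1) == 32);			/* DOTDOT offset */
	assert(hdr + entsize(1) + entsize(2) == 48);	/* FIRST offset */
	return 0;
}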
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..262d1e86df30
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -0,0 +1,1896 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_leaf.c
35 * XFS directory version 2 implementation - single leaf form
36 * see xfs_dir2_leaf.h for data structures.
37 * These directories have multiple XFS_DIR2_DATA blocks and one
38 * XFS_DIR2_LEAF1 block containing the hash table and freespace map.
39 */
40
41#include "xfs.h"
42
43#include "xfs_macros.h"
44#include "xfs_types.h"
45#include "xfs_inum.h"
46#include "xfs_log.h"
47#include "xfs_trans.h"
48#include "xfs_sb.h"
49#include "xfs_ag.h"
50#include "xfs_dir.h"
51#include "xfs_dir2.h"
52#include "xfs_dmapi.h"
53#include "xfs_mount.h"
54#include "xfs_bmap_btree.h"
55#include "xfs_attr_sf.h"
56#include "xfs_dir_sf.h"
57#include "xfs_dir2_sf.h"
58#include "xfs_dinode.h"
59#include "xfs_inode.h"
60#include "xfs_bmap.h"
61#include "xfs_da_btree.h"
62#include "xfs_dir2_data.h"
63#include "xfs_dir2_leaf.h"
64#include "xfs_dir2_block.h"
65#include "xfs_dir2_node.h"
66#include "xfs_dir2_trace.h"
67#include "xfs_error.h"
68#include "xfs_bit.h"
69
70/*
71 * Local function declarations.
72 */
73#ifdef DEBUG
74static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
75#else
76#define xfs_dir2_leaf_check(dp, bp)
77#endif
78static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
79 int *indexp, xfs_dabuf_t **dbpp);
80
81/*
82 * Convert a block form directory to a leaf form directory.
83 */
84int /* error */
85xfs_dir2_block_to_leaf(
86 xfs_da_args_t *args, /* operation arguments */
87 xfs_dabuf_t *dbp) /* input block's buffer */
88{
89 xfs_dir2_data_off_t *bestsp; /* leaf's bestsp entries */
90 xfs_dablk_t blkno; /* leaf block's bno */
91 xfs_dir2_block_t *block; /* block structure */
92 xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */
93 xfs_dir2_block_tail_t *btp; /* block's tail */
94 xfs_inode_t *dp; /* incore directory inode */
95 int error; /* error return code */
96 xfs_dabuf_t *lbp; /* leaf block's buffer */
97 xfs_dir2_db_t ldb; /* leaf block's bno */
98 xfs_dir2_leaf_t *leaf; /* leaf structure */
99 xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */
100 xfs_mount_t *mp; /* filesystem mount point */
101 int needlog; /* need to log block header */
102 int needscan; /* need to rescan bestfree */
103 xfs_trans_t *tp; /* transaction pointer */
104
105 xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
106 dp = args->dp;
107 mp = dp->i_mount;
108 tp = args->trans;
109 /*
110 * Add the leaf block to the inode.
111 * This interface will only put blocks in the leaf/node range.
112 * Since that's empty now, we'll get the root (block 0 in range).
113 */
114 if ((error = xfs_da_grow_inode(args, &blkno))) {
115 return error;
116 }
117 ldb = XFS_DIR2_DA_TO_DB(mp, blkno);
118 ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
119 /*
120 * Initialize the leaf block, get a buffer for it.
121 */
122 if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
123 return error;
124 }
125 ASSERT(lbp != NULL);
126 leaf = lbp->data;
127 block = dbp->data;
128 xfs_dir2_data_check(dp, dbp);
129 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
130 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
131 /*
132 * Set the counts in the leaf header.
133 */
134 INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */
135 INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */
136 /*
137 * Could compact these but I think we always do the conversion
138 * after squeezing out stale entries.
139 */
140 memcpy(leaf->ents, blp, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t));
141 xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1);
142 needscan = 0;
143 needlog = 1;
144 /*
145 * Make the space formerly occupied by the leaf entries and block
146 * tail be free.
147 */
148 xfs_dir2_data_make_free(tp, dbp,
149 (xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
150 (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
151 (char *)blp),
152 &needlog, &needscan);
153 /*
154 * Fix up the block header, make it a data block.
155 */
156 INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
157 if (needscan)
158 xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
159 NULL);
160 /*
161 * Set up leaf tail and bests table.
162 */
163 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
164 INT_SET(ltp->bestcount, ARCH_CONVERT, 1);
165 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
166 INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT);
167 /*
168 * Log the data header and leaf bests table.
169 */
170 if (needlog)
171 xfs_dir2_data_log_header(tp, dbp);
172 xfs_dir2_leaf_check(dp, lbp);
173 xfs_dir2_data_check(dp, dbp);
174 xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
175 xfs_da_buf_done(lbp);
176 return 0;
177}
178
179/*
180 * Add an entry to a leaf form directory.
181 */
182int /* error */
183xfs_dir2_leaf_addname(
184 xfs_da_args_t *args) /* operation arguments */
185{
186 xfs_dir2_data_off_t *bestsp; /* freespace table in leaf */
187 int compact; /* need to compact leaves */
188 xfs_dir2_data_t *data; /* data block structure */
189 xfs_dabuf_t *dbp; /* data block buffer */
190 xfs_dir2_data_entry_t *dep; /* data block entry */
191 xfs_inode_t *dp; /* incore directory inode */
192 xfs_dir2_data_unused_t *dup; /* data unused entry */
193 int error; /* error return value */
194 int grown; /* allocated new data block */
195 int highstale; /* index of next stale leaf */
196 int i; /* temporary, index */
197 int index; /* leaf table position */
198 xfs_dabuf_t *lbp; /* leaf's buffer */
199 xfs_dir2_leaf_t *leaf; /* leaf structure */
200 int length; /* length of new entry */
201 xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
202 int lfloglow; /* low leaf logging index */
203 int lfloghigh; /* high leaf logging index */
204 int lowstale; /* index of prev stale leaf */
205 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
206 xfs_mount_t *mp; /* filesystem mount point */
207 int needbytes; /* leaf block bytes needed */
208 int needlog; /* need to log data header */
209 int needscan; /* need to rescan data free */
210 xfs_dir2_data_off_t *tagp; /* end of data entry */
211 xfs_trans_t *tp; /* transaction pointer */
212 xfs_dir2_db_t use_block; /* data block number */
213
214 xfs_dir2_trace_args("leaf_addname", args);
215 dp = args->dp;
216 tp = args->trans;
217 mp = dp->i_mount;
218 /*
219 * Read the leaf block.
220 */
221 error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
222 XFS_DATA_FORK);
223 if (error) {
224 return error;
225 }
226 ASSERT(lbp != NULL);
227 /*
228 * Look up the entry by hash value and name.
229	 * We know it's not there; our caller has already done a lookup.
230 * So the index is of the entry to insert in front of.
231 * But if there are dup hash values the index is of the first of those.
232 */
233 index = xfs_dir2_leaf_search_hash(args, lbp);
234 leaf = lbp->data;
235 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
236 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
237 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
238 /*
239 * See if there are any entries with the same hash value
240 * and space in their block for the new entry.
241 * This is good because it puts multiple same-hash value entries
242 * in a data block, improving the lookup of those entries.
243 */
244 for (use_block = -1, lep = &leaf->ents[index];
245 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
246 index++, lep++) {
247 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
248 continue;
249 i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
250 ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT));
251 ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF);
252 if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
253 use_block = i;
254 break;
255 }
256 }
257 /*
258	 * Didn't find a block yet, so linearly search all the data blocks.
259 */
260 if (use_block == -1) {
261 for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) {
262 /*
263 * Remember a block we see that's missing.
264 */
265 if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1)
266 use_block = i;
267 else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
268 use_block = i;
269 break;
270 }
271 }
272 }
273 /*
274 * How many bytes do we need in the leaf block?
275 */
276 needbytes =
277 (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) +
278 (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
279 /*
280 * Now kill use_block if it refers to a missing block, so we
281 * can use it as an indication of allocation needed.
282 */
283 if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF)
284 use_block = -1;
285 /*
286 * If we don't have enough free bytes but we can make enough
287 * by compacting out stale entries, we'll do that.
288 */
289 if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes &&
290 INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) {
291 compact = 1;
292 }
293 /*
294 * Otherwise if we don't have enough free bytes we need to
295 * convert to node form.
296 */
297 else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <
298 needbytes) {
299 /*
300 * Just checking or no space reservation, give up.
301 */
302 if (args->justcheck || args->total == 0) {
303 xfs_da_brelse(tp, lbp);
304 return XFS_ERROR(ENOSPC);
305 }
306 /*
307 * Convert to node form.
308 */
309 error = xfs_dir2_leaf_to_node(args, lbp);
310 xfs_da_buf_done(lbp);
311 if (error)
312 return error;
313 /*
314 * Then add the new entry.
315 */
316 return xfs_dir2_node_addname(args);
317 }
318 /*
319 * Otherwise it will fit without compaction.
320 */
321 else
322 compact = 0;
323 /*
324 * If just checking, then it will fit unless we needed to allocate
325 * a new data block.
326 */
327 if (args->justcheck) {
328 xfs_da_brelse(tp, lbp);
329 return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
330 }
331 /*
332 * If no allocations are allowed, return now before we've
333 * changed anything.
334 */
335 if (args->total == 0 && use_block == -1) {
336 xfs_da_brelse(tp, lbp);
337 return XFS_ERROR(ENOSPC);
338 }
339 /*
340 * Need to compact the leaf entries, removing stale ones.
341 * Leave one stale entry behind - the one closest to our
342 * insertion index - and we'll shift that one to our insertion
343 * point later.
344 */
345 if (compact) {
346 xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
347 &lfloglow, &lfloghigh);
348 }
349 /*
350	 * There are stale entries; start log-low and log-high at
351	 * impossibly bad sentinel values, to be fixed up below.
352 */
353 else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) {
354 lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
355 lfloghigh = -1;
356 }
357 /*
358 * If there was no data block space found, we need to allocate
359 * a new one.
360 */
361 if (use_block == -1) {
362 /*
363 * Add the new data block.
364 */
365 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
366 &use_block))) {
367 xfs_da_brelse(tp, lbp);
368 return error;
369 }
370 /*
371 * Initialize the block.
372 */
373 if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
374 xfs_da_brelse(tp, lbp);
375 return error;
376 }
377 /*
378 * If we're adding a new data block on the end we need to
379 * extend the bests table. Copy it up one entry.
380 */
381 if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) {
382 bestsp--;
383 memmove(&bestsp[0], &bestsp[1],
384 INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0]));
385 INT_MOD(ltp->bestcount, ARCH_CONVERT, +1);
386 xfs_dir2_leaf_log_tail(tp, lbp);
387 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
388 }
389 /*
390 * If we're filling in a previously empty block just log it.
391 */
392 else
393 xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
394 data = dbp->data;
395 INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
396 grown = 1;
397 }
398 /*
399 * Already had space in some data block.
400 * Just read that one in.
401 */
402 else {
403 if ((error =
404 xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block),
405 -1, &dbp, XFS_DATA_FORK))) {
406 xfs_da_brelse(tp, lbp);
407 return error;
408 }
409 data = dbp->data;
410 grown = 0;
411 }
412 xfs_dir2_data_check(dp, dbp);
413 /*
414 * Point to the biggest freespace in our data block.
415 */
416 dup = (xfs_dir2_data_unused_t *)
417 ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
418 ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length);
419 needscan = needlog = 0;
420 /*
421 * Mark the initial part of our freespace in use for the new entry.
422 */
423 xfs_dir2_data_use_free(tp, dbp, dup,
424 (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
425 &needlog, &needscan);
426 /*
427 * Initialize our new entry (at last).
428 */
429 dep = (xfs_dir2_data_entry_t *)dup;
430 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
431 dep->namelen = args->namelen;
432 memcpy(dep->name, args->name, dep->namelen);
433 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
434 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
435 /*
436	 * Rescan to fix up the bestfree table, if needed.
437 */
438 if (needscan)
439 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
440 /*
441 * Need to log the data block's header.
442 */
443 if (needlog)
444 xfs_dir2_data_log_header(tp, dbp);
445 xfs_dir2_data_log_entry(tp, dbp, dep);
446 /*
447 * If the bests table needs to be changed, do it.
448 * Log the change unless we've already done that.
449 */
450 if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
451 INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
452 if (!grown)
453 xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
454 }
455 /*
456 * Now we need to make room to insert the leaf entry.
457 * If there are no stale entries, we just insert a hole at index.
458 */
459 if (!leaf->hdr.stale) {
460 /*
461 * lep is still good as the index leaf entry.
462 */
463 if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
464 memmove(lep + 1, lep,
465 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
466 /*
467 * Record low and high logging indices for the leaf.
468 */
469 lfloglow = index;
470 lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
471 INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
472 }
473 /*
474 * There are stale entries.
475 * We will use one of them for the new entry.
476 * It's probably not at the right location, so we'll have to
477 * shift some up or down first.
478 */
479 else {
480 /*
481 * If we didn't compact before, we need to find the nearest
482 * stale entries before and after our insertion point.
483 */
484 if (compact == 0) {
485 /*
486 * Find the first stale entry before the insertion
487 * point, if any.
488 */
489 for (lowstale = index - 1;
490 lowstale >= 0 &&
491 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
492 XFS_DIR2_NULL_DATAPTR;
493 lowstale--)
494 continue;
495 /*
496 * Find the next stale entry at or after the insertion
497 * point, if any. Stop if we go so far that the
498 * lowstale entry would be better.
499 */
500 for (highstale = index;
501 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
502 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
503 XFS_DIR2_NULL_DATAPTR &&
504 (lowstale < 0 ||
505 index - lowstale - 1 >= highstale - index);
506 highstale++)
507 continue;
508 }
509 /*
510 * If the low one is better, use it.
511 */
512 if (lowstale >= 0 &&
513 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
514 index - lowstale - 1 < highstale - index)) {
515 ASSERT(index - lowstale - 1 >= 0);
516 ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
517 XFS_DIR2_NULL_DATAPTR);
518 /*
519 * Copy entries up to cover the stale entry
520 * and make room for the new entry.
521 */
522 if (index - lowstale - 1 > 0)
523 memmove(&leaf->ents[lowstale],
524 &leaf->ents[lowstale + 1],
525 (index - lowstale - 1) * sizeof(*lep));
526 lep = &leaf->ents[index - 1];
527 lfloglow = MIN(lowstale, lfloglow);
528 lfloghigh = MAX(index - 1, lfloghigh);
529 }
530 /*
531 * The high one is better, so use that one.
532 */
533 else {
534 ASSERT(highstale - index >= 0);
535 ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
536 XFS_DIR2_NULL_DATAPTR);
537 /*
538			 * Copy entries down to cover the stale entry
539 * and make room for the new entry.
540 */
541 if (highstale - index > 0)
542 memmove(&leaf->ents[index + 1],
543 &leaf->ents[index],
544 (highstale - index) * sizeof(*lep));
545 lep = &leaf->ents[index];
546 lfloglow = MIN(index, lfloglow);
547 lfloghigh = MAX(highstale, lfloghigh);
548 }
549 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
550 }
551 /*
552 * Fill in the new leaf entry.
553 */
554 INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
555 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT)));
556 /*
557 * Log the leaf fields and give up the buffers.
558 */
559 xfs_dir2_leaf_log_header(tp, lbp);
560 xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
561 xfs_dir2_leaf_check(dp, lbp);
562 xfs_da_buf_done(lbp);
563 xfs_dir2_data_check(dp, dbp);
564 xfs_da_buf_done(dbp);
565 return 0;
566}
567
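/*
 * Editorial sketch, not part of the original source: the stale-slot
 * selection rule from xfs_dir2_leaf_addname above, reduced to plain C
 * over an int array in which -1 marks a stale slot.  It assumes, as
 * the caller above guarantees, that at least one stale slot exists.
 * All names here are hypothetical.
 */
static int				/* index of slot to reuse */
pick_stale_slot(
	int		*ents,		/* entries, -1 == stale */
	int		count,		/* number of entries */
	int		index)		/* desired insertion index */
{
	int		lowstale;	/* nearest stale slot below index */
	int		highstale;	/* nearest stale slot at/after index */

	/*
	 * Walk down to the nearest stale slot before the insertion point.
	 */
	for (lowstale = index - 1;
	     lowstale >= 0 && ents[lowstale] != -1;
	     lowstale--)
		continue;
	/*
	 * Walk up to the nearest stale slot at or after the insertion
	 * point, stopping early once lowstale is already closer.
	 */
	for (highstale = index;
	     highstale < count && ents[highstale] != -1 &&
	     (lowstale < 0 || index - lowstale - 1 >= highstale - index);
	     highstale++)
		continue;
	/*
	 * Reusing lowstale shifts index - lowstale - 1 entries up;
	 * reusing highstale shifts highstale - index entries down.
	 * Pick whichever memmove is smaller.
	 */
	if (lowstale >= 0 &&
	    (highstale == count || index - lowstale - 1 < highstale - index))
		return lowstale;
	return highstale;
}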
568#ifdef DEBUG
569/*
570 * Check the internal consistency of a leaf1 block.
571 * Pop an assert if something is wrong.
572 */
573void
574xfs_dir2_leaf_check(
575 xfs_inode_t *dp, /* incore directory inode */
576 xfs_dabuf_t *bp) /* leaf's buffer */
577{
578 int i; /* leaf index */
579 xfs_dir2_leaf_t *leaf; /* leaf structure */
580 xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
581 xfs_mount_t *mp; /* filesystem mount point */
582 int stale; /* count of stale leaves */
583
584 leaf = bp->data;
585 mp = dp->i_mount;
586 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
587 /*
588 * This value is not restrictive enough.
589 * Should factor in the size of the bests table as well.
590 * We can deduce a value for that from di_size.
591 */
592 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
593 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
594 /*
595 * Leaves and bests don't overlap.
596 */
597 ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <=
598 (char *)XFS_DIR2_LEAF_BESTS_P(ltp));
599 /*
600 * Check hash value order, count stale entries.
601 */
602 for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
603 if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT))
604 ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
605 INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
606 if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
607 stale++;
608 }
609 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
610}
611#endif /* DEBUG */
612
613/*
614 * Compact out any stale entries in the leaf.
615 * Log the header and changed leaf entries, if any.
616 */
617void
618xfs_dir2_leaf_compact(
619 xfs_da_args_t *args, /* operation arguments */
620 xfs_dabuf_t *bp) /* leaf buffer */
621{
622 int from; /* source leaf index */
623 xfs_dir2_leaf_t *leaf; /* leaf structure */
624 int loglow; /* first leaf entry to log */
625 int to; /* target leaf index */
626
627 leaf = bp->data;
628 if (!leaf->hdr.stale) {
629 return;
630 }
631 /*
632 * Compress out the stale entries in place.
633 */
634 for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
635 if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
636 continue;
637 /*
638 * Only actually copy the entries that are different.
639 */
640 if (from > to) {
641 if (loglow == -1)
642 loglow = to;
643 leaf->ents[to] = leaf->ents[from];
644 }
645 to++;
646 }
647 /*
648 * Update and log the header, log the leaf entries.
649 */
650 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to);
651 INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT)));
652 leaf->hdr.stale = 0;
653 xfs_dir2_leaf_log_header(args->trans, bp);
654 if (loglow != -1)
655 xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
656}
657
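/*
 * Editorial sketch, not part of the original source: the two-index
 * in-place compaction idiom used by xfs_dir2_leaf_compact below, on a
 * plain int array where -1 marks a stale slot.  loglow captures the
 * first slot that actually moved, which is what bounds the range the
 * real code logs.  Names are hypothetical.
 */
static int				/* new entry count */
compact_stale(
	int		*ents,		/* entries, -1 == stale */
	int		count,		/* current entry count */
	int		*loglow)	/* out: first modified slot, or -1 */
{
	int		from;		/* source index */
	int		to;		/* destination index */

	for (from = to = 0, *loglow = -1; from < count; from++) {
		if (ents[from] == -1)
			continue;
		/*
		 * Only copy entries that actually move.
		 */
		if (from > to) {
			if (*loglow == -1)
				*loglow = to;
			ents[to] = ents[from];
		}
		to++;
	}
	return to;
}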
658/*
659 * Compact the leaf entries, removing stale ones.
660 * Leave one stale entry behind - the one closest to our insertion
661 * index - and the caller will shift that one to our insertion point
662 * later.
663 * Return the new insertion index, the position of the remaining
664 * stale entry, and the leaf logging indices.
665 */
666void
667xfs_dir2_leaf_compact_x1(
668 xfs_dabuf_t *bp, /* leaf buffer */
669 int *indexp, /* insertion index */
670 int *lowstalep, /* out: stale entry before us */
671 int *highstalep, /* out: stale entry after us */
672 int *lowlogp, /* out: low log index */
673 int *highlogp) /* out: high log index */
674{
675 int from; /* source copy index */
676 int highstale; /* stale entry at/after index */
677 int index; /* insertion index */
678 int keepstale; /* source index of kept stale */
679 xfs_dir2_leaf_t *leaf; /* leaf structure */
680 int lowstale; /* stale entry before index */
681 int newindex=0; /* new insertion index */
682 int to; /* destination copy index */
683
684 leaf = bp->data;
685 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1);
686 index = *indexp;
687 /*
688 * Find the first stale entry before our index, if any.
689 */
690 for (lowstale = index - 1;
691 lowstale >= 0 &&
692 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
693 lowstale--)
694 continue;
695 /*
696 * Find the first stale entry at or after our index, if any.
697 * Stop if the answer would be worse than lowstale.
698 */
699 for (highstale = index;
700 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
701 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
702 (lowstale < 0 || index - lowstale > highstale - index);
703 highstale++)
704 continue;
705 /*
706 * Pick the better of lowstale and highstale.
707 */
708 if (lowstale >= 0 &&
709 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
710 index - lowstale <= highstale - index))
711 keepstale = lowstale;
712 else
713 keepstale = highstale;
714 /*
715 * Copy the entries in place, removing all the stale entries
716 * except keepstale.
717 */
718 for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
719 /*
720 * Notice the new value of index.
721 */
722 if (index == from)
723 newindex = to;
724 if (from != keepstale &&
725 INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
726 if (from == to)
727 *lowlogp = to;
728 continue;
729 }
730 /*
731 * Record the new keepstale value for the insertion.
732 */
733 if (from == keepstale)
734 lowstale = highstale = to;
735 /*
736 * Copy only the entries that have moved.
737 */
738 if (from > to)
739 leaf->ents[to] = leaf->ents[from];
740 to++;
741 }
742 ASSERT(from > to);
743 /*
744 * If the insertion point was past the last entry,
745 * set the new insertion point accordingly.
746 */
747 if (index == from)
748 newindex = to;
749 *indexp = newindex;
750 /*
751 * Adjust the leaf header values.
752 */
753 INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to));
754 INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1);
755 /*
756 * Remember the low/high stale value only in the "right"
757 * direction.
758 */
759 if (lowstale >= newindex)
760 lowstale = -1;
761 else
762 highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT);
763 *highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
764 *lowstalep = lowstale;
765 *highstalep = highstale;
766}
767
768/*
769 * Getdents (readdir) for leaf and node directories.
770 * This reads the data blocks only, so is the same for both forms.
771 */
772int /* error */
773xfs_dir2_leaf_getdents(
774 xfs_trans_t *tp, /* transaction pointer */
775 xfs_inode_t *dp, /* incore directory inode */
776 uio_t *uio, /* I/O control & vectors */
777 int *eofp, /* out: reached end of dir */
778 xfs_dirent_t *dbp, /* caller's buffer */
779 xfs_dir2_put_t put) /* ABI formatting routine */
780{
781 xfs_dabuf_t *bp; /* data block buffer */
782 int byteoff; /* offset in current block */
783 xfs_dir2_db_t curdb; /* db for current block */
784 xfs_dir2_off_t curoff; /* current overall offset */
785 xfs_dir2_data_t *data; /* data block structure */
786 xfs_dir2_data_entry_t *dep; /* data entry */
787 xfs_dir2_data_unused_t *dup; /* unused entry */
788 int eof; /* reached end of directory */
789 int error=0; /* error return value */
790 int i; /* temporary loop index */
791 int j; /* temporary loop index */
792 int length; /* temporary length value */
793 xfs_bmbt_irec_t *map; /* map vector for blocks */
794 xfs_extlen_t map_blocks; /* number of fsbs in map */
795 xfs_dablk_t map_off; /* last mapped file offset */
796 int map_size; /* total entries in *map */
797 int map_valid; /* valid entries in *map */
798 xfs_mount_t *mp; /* filesystem mount point */
799 xfs_dir2_off_t newoff; /* new curoff after new blk */
800 int nmap; /* mappings to ask xfs_bmapi */
801 xfs_dir2_put_args_t p; /* formatting arg bundle */
802 char *ptr=NULL; /* pointer to current data */
803 int ra_current; /* number of read-ahead blks */
804 int ra_index; /* *map index for read-ahead */
805 int ra_offset; /* map entry offset for ra */
806 int ra_want; /* readahead count wanted */
807
808 /*
809 * If the offset is at or past the largest allowed value,
810 * give up right away, return eof.
811 */
812 if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) {
813 *eofp = 1;
814 return 0;
815 }
816 mp = dp->i_mount;
817 /*
818 * Setup formatting arguments.
819 */
820 p.dbp = dbp;
821 p.put = put;
822 p.uio = uio;
823 /*
824 * Set up to bmap a number of blocks based on the caller's
825 * buffer size, the directory block size, and the filesystem
826 * block size.
827 */
828 map_size =
829 howmany(uio->uio_resid + mp->m_dirblksize,
830 mp->m_sb.sb_blocksize);
831 map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
832 map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
833 bp = NULL;
834 eof = 1;
835 /*
836 * Inside the loop we keep the main offset value as a byte offset
837 * in the directory file.
838 */
839 curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset);
840 /*
841 * Force this conversion through db so we truncate the offset
842 * down to get the start of the data block.
843 */
844 map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff));
845 /*
846 * Loop over directory entries until we reach the end offset.
847 * Get more blocks and readahead as necessary.
848 */
849 while (curoff < XFS_DIR2_LEAF_OFFSET) {
850 /*
851 * If we have no buffer, or we're off the end of the
852 * current buffer, need to get another one.
853 */
854 if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
855 /*
856 * If we have a buffer, we need to release it and
857 * take it out of the mapping.
858 */
859 if (bp) {
860 xfs_da_brelse(tp, bp);
861 bp = NULL;
862 map_blocks -= mp->m_dirblkfsbs;
863 /*
864 * Loop to get rid of the extents for the
865 * directory block.
866 */
867 for (i = mp->m_dirblkfsbs; i > 0; ) {
868 j = MIN((int)map->br_blockcount, i);
869 map->br_blockcount -= j;
870 map->br_startblock += j;
871 map->br_startoff += j;
872 /*
873 * If mapping is done, pitch it from
874 * the table.
875 */
876 if (!map->br_blockcount && --map_valid)
877 memmove(&map[0], &map[1],
878 sizeof(map[0]) *
879 map_valid);
880 i -= j;
881 }
882 }
883 /*
884 * Recalculate the readahead blocks wanted.
885 */
886 ra_want = howmany(uio->uio_resid + mp->m_dirblksize,
887 mp->m_sb.sb_blocksize) - 1;
888 /*
889 * If we don't have as many as we want, and we haven't
890 * run out of data blocks, get some more mappings.
891 */
892 if (1 + ra_want > map_blocks &&
893 map_off <
894 XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) {
895 /*
896 * Get more bmaps, fill in after the ones
897 * we already have in the table.
898 */
899 nmap = map_size - map_valid;
900 error = xfs_bmapi(tp, dp,
901 map_off,
902 XFS_DIR2_BYTE_TO_DA(mp,
903 XFS_DIR2_LEAF_OFFSET) - map_off,
904 XFS_BMAPI_METADATA, NULL, 0,
905 &map[map_valid], &nmap, NULL);
906 /*
907 * Don't know if we should ignore this or
908 * try to return an error.
909 * The trouble with returning errors
910 * is that readdir will just stop without
911 * actually passing the error through.
912 */
913 if (error)
914 break; /* XXX */
915 /*
916 * If we got all the mappings we asked for,
917 * set the final map offset based on the
918 * last bmap value received.
919 * Otherwise, we've reached the end.
920 */
921 if (nmap == map_size - map_valid)
922 map_off =
923 map[map_valid + nmap - 1].br_startoff +
924 map[map_valid + nmap - 1].br_blockcount;
925 else
926 map_off =
927 XFS_DIR2_BYTE_TO_DA(mp,
928 XFS_DIR2_LEAF_OFFSET);
929 /*
930 * Look for holes in the mapping, and
931 * eliminate them. Count up the valid blocks.
932 */
933 for (i = map_valid; i < map_valid + nmap; ) {
934 if (map[i].br_startblock ==
935 HOLESTARTBLOCK) {
936 nmap--;
937 length = map_valid + nmap - i;
938 if (length)
939 memmove(&map[i],
940 &map[i + 1],
941 sizeof(map[i]) *
942 length);
943 } else {
944 map_blocks +=
945 map[i].br_blockcount;
946 i++;
947 }
948 }
949 map_valid += nmap;
950 }
951 /*
952 * No valid mappings, so no more data blocks.
953 */
954 if (!map_valid) {
955 curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off);
956 break;
957 }
958 /*
959 * Read the directory block starting at the first
960 * mapping.
961 */
962 curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff);
963 error = xfs_da_read_buf(tp, dp, map->br_startoff,
964 map->br_blockcount >= mp->m_dirblkfsbs ?
965 XFS_FSB_TO_DADDR(mp, map->br_startblock) :
966 -1,
967 &bp, XFS_DATA_FORK);
968 /*
969 * Should just skip over the data block instead
970 * of giving up.
971 */
972 if (error)
973 break; /* XXX */
974 /*
975 * Adjust the current amount of read-ahead: we just
976			 * read a block that was previously read ahead.
977 */
978 if (ra_current)
979 ra_current -= mp->m_dirblkfsbs;
980 /*
981 * Do we need more readahead?
982 */
983 for (ra_index = ra_offset = i = 0;
984 ra_want > ra_current && i < map_blocks;
985 i += mp->m_dirblkfsbs) {
986 ASSERT(ra_index < map_valid);
987 /*
988 * Read-ahead a contiguous directory block.
989 */
990 if (i > ra_current &&
991 map[ra_index].br_blockcount >=
992 mp->m_dirblkfsbs) {
993 xfs_baread(mp->m_ddev_targp,
994 XFS_FSB_TO_DADDR(mp,
995 map[ra_index].br_startblock +
996 ra_offset),
997 (int)BTOBB(mp->m_dirblksize));
998 ra_current = i;
999 }
1000 /*
1001 * Read-ahead a non-contiguous directory block.
1002 * This doesn't use our mapping, but this
1003 * is a very rare case.
1004 */
1005 else if (i > ra_current) {
1006 (void)xfs_da_reada_buf(tp, dp,
1007 map[ra_index].br_startoff +
1008 ra_offset, XFS_DATA_FORK);
1009 ra_current = i;
1010 }
1011 /*
1012 * Advance offset through the mapping table.
1013 */
1014				for (j = 0; j < mp->m_dirblkfsbs; j += length) {
1015					/*
1016					 * Advance by the rest of this extent,
1017					 * but no more than the rest of this
1018					 * directory block.
1019					 */
1020					length = MIN((int)(mp->m_dirblkfsbs - j),
1021						(int)(map[ra_index].br_blockcount -
1022						ra_offset));
1023					ra_offset += length;
1024 /*
1025 * Advance to the next mapping if
1026 * this one is used up.
1027 */
1028 if (ra_offset ==
1029 map[ra_index].br_blockcount) {
1030 ra_offset = 0;
1031 ra_index++;
1032 }
1033 }
1034 }
1035 /*
1036 * Having done a read, we need to set a new offset.
1037 */
1038 newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0);
1039 /*
1040 * Start of the current block.
1041 */
1042 if (curoff < newoff)
1043 curoff = newoff;
1044 /*
1045 * Make sure we're in the right block.
1046 */
1047 else if (curoff > newoff)
1048 ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) ==
1049 curdb);
1050 data = bp->data;
1051 xfs_dir2_data_check(dp, bp);
1052 /*
1053 * Find our position in the block.
1054 */
1055 ptr = (char *)&data->u;
1056 byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff);
1057 /*
1058 * Skip past the header.
1059 */
1060 if (byteoff == 0)
1061 curoff += (uint)sizeof(data->hdr);
1062 /*
1063 * Skip past entries until we reach our offset.
1064 */
1065 else {
1066 while ((char *)ptr - (char *)data < byteoff) {
1067 dup = (xfs_dir2_data_unused_t *)ptr;
1068
1069 if (INT_GET(dup->freetag, ARCH_CONVERT)
1070 == XFS_DIR2_DATA_FREE_TAG) {
1071
1072 length = INT_GET(dup->length,
1073 ARCH_CONVERT);
1074 ptr += length;
1075 continue;
1076 }
1077 dep = (xfs_dir2_data_entry_t *)ptr;
1078 length =
1079 XFS_DIR2_DATA_ENTSIZE(dep->namelen);
1080 ptr += length;
1081 }
1082 /*
1083 * Now set our real offset.
1084 */
1085 curoff =
1086 XFS_DIR2_DB_OFF_TO_BYTE(mp,
1087 XFS_DIR2_BYTE_TO_DB(mp, curoff),
1088 (char *)ptr - (char *)data);
1089 if (ptr >= (char *)data + mp->m_dirblksize) {
1090 continue;
1091 }
1092 }
1093 }
1094 /*
1095 * We have a pointer to an entry.
1096 * Is it a live one?
1097 */
1098 dup = (xfs_dir2_data_unused_t *)ptr;
1099 /*
1100 * No, it's unused, skip over it.
1101 */
1102 if (INT_GET(dup->freetag, ARCH_CONVERT)
1103 == XFS_DIR2_DATA_FREE_TAG) {
1104 length = INT_GET(dup->length, ARCH_CONVERT);
1105 ptr += length;
1106 curoff += length;
1107 continue;
1108 }
1109
1110 /*
1111 * Copy the entry into the putargs, and try formatting it.
1112 */
1113 dep = (xfs_dir2_data_entry_t *)ptr;
1114
1115 p.namelen = dep->namelen;
1116
1117 length = XFS_DIR2_DATA_ENTSIZE(p.namelen);
1118
1119 p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length);
1120
1121 p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
1122#if XFS_BIG_INUMS
1123 p.ino += mp->m_inoadd;
1124#endif
1125 p.name = (char *)dep->name;
1126
1127 error = p.put(&p);
1128
1129 /*
1130 * Won't fit. Return to caller.
1131 */
1132 if (!p.done) {
1133 eof = 0;
1134 break;
1135 }
1136 /*
1137 * Advance to next entry in the block.
1138 */
1139 ptr += length;
1140 curoff += length;
1141 }
1142
1143 /*
1144 * All done. Set output offset value to current offset.
1145 */
1146 *eofp = eof;
1147 if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR))
1148 uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
1149 else
1150 uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
1151 kmem_free(map, map_size * sizeof(*map));
1152 if (bp)
1153 xfs_da_brelse(tp, bp);
1154 return error;
1155}
1156
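/*
 * Editorial note with illustrative numbers, not from the source: for
 * the bmap-vector sizing in xfs_dir2_leaf_getdents above, a caller
 * buffer of uio_resid == 16384 bytes with m_dirblksize == 4096 and
 * sb_blocksize == 4096 gives map_size = howmany(16384 + 4096, 4096)
 * = 5 mappings (howmany(a, b) rounds a/b up), and the first pass
 * around the loop wants ra_want = 5 - 1 = 4 blocks of readahead.
 */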
1157/*
1158 * Initialize a new leaf block, leaf1 or leafn magic accepted.
1159 */
1160int
1161xfs_dir2_leaf_init(
1162 xfs_da_args_t *args, /* operation arguments */
1163 xfs_dir2_db_t bno, /* directory block number */
1164 xfs_dabuf_t **bpp, /* out: leaf buffer */
1165 int magic) /* magic number for block */
1166{
1167 xfs_dabuf_t *bp; /* leaf buffer */
1168 xfs_inode_t *dp; /* incore directory inode */
1169 int error; /* error return code */
1170 xfs_dir2_leaf_t *leaf; /* leaf structure */
1171 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1172 xfs_mount_t *mp; /* filesystem mount point */
1173 xfs_trans_t *tp; /* transaction pointer */
1174
1175 dp = args->dp;
1176 ASSERT(dp != NULL);
1177 tp = args->trans;
1178 mp = dp->i_mount;
1179 ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
1180 bno < XFS_DIR2_FREE_FIRSTDB(mp));
1181 /*
1182 * Get the buffer for the block.
1183 */
1184 error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp,
1185 XFS_DATA_FORK);
1186 if (error) {
1187 return error;
1188 }
1189 ASSERT(bp != NULL);
1190 leaf = bp->data;
1191 /*
1192 * Initialize the header.
1193 */
1194 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic);
1195 leaf->hdr.info.forw = 0;
1196 leaf->hdr.info.back = 0;
1197 leaf->hdr.count = 0;
1198 leaf->hdr.stale = 0;
1199 xfs_dir2_leaf_log_header(tp, bp);
1200 /*
1201 * If it's a leaf-format directory initialize the tail.
1202 * In this case our caller has the real bests table to copy into
1203 * the block.
1204 */
1205 if (magic == XFS_DIR2_LEAF1_MAGIC) {
1206 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1207 ltp->bestcount = 0;
1208 xfs_dir2_leaf_log_tail(tp, bp);
1209 }
1210 *bpp = bp;
1211 return 0;
1212}
1213
1214/*
1215 * Log the bests entries indicated from a leaf1 block.
1216 */
1217void
1218xfs_dir2_leaf_log_bests(
1219 xfs_trans_t *tp, /* transaction pointer */
1220 xfs_dabuf_t *bp, /* leaf buffer */
1221 int first, /* first entry to log */
1222 int last) /* last entry to log */
1223{
1224 xfs_dir2_data_off_t *firstb; /* pointer to first entry */
1225 xfs_dir2_data_off_t *lastb; /* pointer to last entry */
1226 xfs_dir2_leaf_t *leaf; /* leaf structure */
1227 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1228
1229 leaf = bp->data;
1230 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
1231 ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf);
1232 firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first;
1233 lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last;
1234 xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
1235 (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
1236}
1237
1238/*
1239 * Log the leaf entries indicated from a leaf1 or leafn block.
1240 */
1241void
1242xfs_dir2_leaf_log_ents(
1243 xfs_trans_t *tp, /* transaction pointer */
1244 xfs_dabuf_t *bp, /* leaf buffer */
1245 int first, /* first entry to log */
1246 int last) /* last entry to log */
1247{
1248 xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
1249 xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
1250 xfs_dir2_leaf_t *leaf; /* leaf structure */
1251
1252 leaf = bp->data;
1253 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
1254 INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1255 firstlep = &leaf->ents[first];
1256 lastlep = &leaf->ents[last];
1257 xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
1258 (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
1259}
1260
1261/*
1262 * Log the header of the leaf1 or leafn block.
1263 */
1264void
1265xfs_dir2_leaf_log_header(
1266 xfs_trans_t *tp, /* transaction pointer */
1267 xfs_dabuf_t *bp) /* leaf buffer */
1268{
1269 xfs_dir2_leaf_t *leaf; /* leaf structure */
1270
1271 leaf = bp->data;
1272 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
1273 INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1274 xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
1275 (uint)(sizeof(leaf->hdr) - 1));
1276}
1277
1278/*
1279 * Log the tail of the leaf1 block.
1280 */
1281void
1282xfs_dir2_leaf_log_tail(
1283 xfs_trans_t *tp, /* transaction pointer */
1284 xfs_dabuf_t *bp) /* leaf buffer */
1285{
1286 xfs_dir2_leaf_t *leaf; /* leaf structure */
1287 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1288 xfs_mount_t *mp; /* filesystem mount point */
1289
1290 mp = tp->t_mountp;
1291 leaf = bp->data;
1292 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
1293 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1294 xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
1295 (uint)(mp->m_dirblksize - 1));
1296}
1297
1298/*
1299 * Look up the entry referred to by args in the leaf format directory.
1300 * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
1301 * is also used by the node-format code.
1302 */
1303int
1304xfs_dir2_leaf_lookup(
1305 xfs_da_args_t *args) /* operation arguments */
1306{
1307 xfs_dabuf_t *dbp; /* data block buffer */
1308 xfs_dir2_data_entry_t *dep; /* data block entry */
1309 xfs_inode_t *dp; /* incore directory inode */
1310 int error; /* error return code */
1311 int index; /* found entry index */
1312 xfs_dabuf_t *lbp; /* leaf buffer */
1313 xfs_dir2_leaf_t *leaf; /* leaf structure */
1314 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1315 xfs_trans_t *tp; /* transaction pointer */
1316
1317 xfs_dir2_trace_args("leaf_lookup", args);
1318 /*
1319 * Look up name in the leaf block, returning both buffers and index.
1320 */
1321 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1322 return error;
1323 }
1324 tp = args->trans;
1325 dp = args->dp;
1326 xfs_dir2_leaf_check(dp, lbp);
1327 leaf = lbp->data;
1328 /*
1329 * Get to the leaf entry and contained data entry address.
1330 */
1331 lep = &leaf->ents[index];
1332 /*
1333 * Point to the data entry.
1334 */
1335 dep = (xfs_dir2_data_entry_t *)
1336 ((char *)dbp->data +
1337 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
1338 /*
1339 * Return the found inode number.
1340 */
1341 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
1342 xfs_da_brelse(tp, dbp);
1343 xfs_da_brelse(tp, lbp);
1344 return XFS_ERROR(EEXIST);
1345}
1346
1347/*
1348 * Look up name/hash in the leaf block.
1349 * Fill in indexp with the found index, and dbpp with the data buffer.
1350 * If not found dbpp will be NULL, and ENOENT comes back.
1351 * lbpp will always be filled in with the leaf buffer unless there's an error.
1352 */
1353static int /* error */
1354xfs_dir2_leaf_lookup_int(
1355 xfs_da_args_t *args, /* operation arguments */
1356 xfs_dabuf_t **lbpp, /* out: leaf buffer */
1357 int *indexp, /* out: index in leaf block */
1358 xfs_dabuf_t **dbpp) /* out: data buffer */
1359{
1360 xfs_dir2_db_t curdb; /* current data block number */
1361 xfs_dabuf_t *dbp; /* data buffer */
1362 xfs_dir2_data_entry_t *dep; /* data entry */
1363 xfs_inode_t *dp; /* incore directory inode */
1364 int error; /* error return code */
1365 int index; /* index in leaf block */
1366 xfs_dabuf_t *lbp; /* leaf buffer */
1367 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1368 xfs_dir2_leaf_t *leaf; /* leaf structure */
1369 xfs_mount_t *mp; /* filesystem mount point */
1370 xfs_dir2_db_t newdb; /* new data block number */
1371 xfs_trans_t *tp; /* transaction pointer */
1372
1373 dp = args->dp;
1374 tp = args->trans;
1375 mp = dp->i_mount;
1376 /*
1377 * Read the leaf block into the buffer.
1378 */
1379 if ((error =
1380 xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
1381 XFS_DATA_FORK))) {
1382 return error;
1383 }
1384 *lbpp = lbp;
1385 leaf = lbp->data;
1386 xfs_dir2_leaf_check(dp, lbp);
1387 /*
1388 * Look for the first leaf entry with our hash value.
1389 */
1390 index = xfs_dir2_leaf_search_hash(args, lbp);
1391 /*
1392 * Loop over all the entries with the right hash value
1393 * looking to match the name.
1394 */
1395 for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
1396 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
1397 lep++, index++) {
1398 /*
1399 * Skip over stale leaf entries.
1400 */
1401 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
1402 continue;
1403 /*
1404 * Get the new data block number.
1405 */
1406 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
1407 /*
1408 * If it's not the same as the old data block number,
1409 * need to pitch the old one and read the new one.
1410 */
1411 if (newdb != curdb) {
1412 if (dbp)
1413 xfs_da_brelse(tp, dbp);
1414 if ((error =
1415 xfs_da_read_buf(tp, dp,
1416 XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp,
1417 XFS_DATA_FORK))) {
1418 xfs_da_brelse(tp, lbp);
1419 return error;
1420 }
1421 xfs_dir2_data_check(dp, dbp);
1422 curdb = newdb;
1423 }
1424 /*
1425 * Point to the data entry.
1426 */
1427 dep = (xfs_dir2_data_entry_t *)
1428 ((char *)dbp->data +
1429 XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
1430 /*
1431 * If it matches then return it.
1432 */
1433 if (dep->namelen == args->namelen &&
1434 dep->name[0] == args->name[0] &&
1435 memcmp(dep->name, args->name, args->namelen) == 0) {
1436 *dbpp = dbp;
1437 *indexp = index;
1438 return 0;
1439 }
1440 }
1441 /*
1442 * No match found, return ENOENT.
1443 */
1444 ASSERT(args->oknoent);
1445 if (dbp)
1446 xfs_da_brelse(tp, dbp);
1447 xfs_da_brelse(tp, lbp);
1448 return XFS_ERROR(ENOENT);
1449}
1450
1451/*
1452 * Remove an entry from a leaf format directory.
1453 */
1454int /* error */
1455xfs_dir2_leaf_removename(
1456 xfs_da_args_t *args) /* operation arguments */
1457{
1458 xfs_dir2_data_off_t *bestsp; /* leaf block best freespace */
1459 xfs_dir2_data_t *data; /* data block structure */
1460 xfs_dir2_db_t db; /* data block number */
1461 xfs_dabuf_t *dbp; /* data block buffer */
1462 xfs_dir2_data_entry_t *dep; /* data entry structure */
1463 xfs_inode_t *dp; /* incore directory inode */
1464 int error; /* error return code */
1465 xfs_dir2_db_t i; /* temporary data block # */
1466 int index; /* index into leaf entries */
1467 xfs_dabuf_t *lbp; /* leaf buffer */
1468 xfs_dir2_leaf_t *leaf; /* leaf structure */
1469 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1470 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1471 xfs_mount_t *mp; /* filesystem mount point */
1472 int needlog; /* need to log data header */
1473 int needscan; /* need to rescan data frees */
1474 xfs_dir2_data_off_t oldbest; /* old value of best free */
1475 xfs_trans_t *tp; /* transaction pointer */
1476
1477 xfs_dir2_trace_args("leaf_removename", args);
1478 /*
1479 * Lookup the leaf entry, get the leaf and data blocks read in.
1480 */
1481 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1482 return error;
1483 }
1484 dp = args->dp;
1485 tp = args->trans;
1486 mp = dp->i_mount;
1487 leaf = lbp->data;
1488 data = dbp->data;
1489 xfs_dir2_data_check(dp, dbp);
1490 /*
1491 * Point to the leaf entry, use that to point to the data entry.
1492 */
1493 lep = &leaf->ents[index];
1494 db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
1495 dep = (xfs_dir2_data_entry_t *)
1496 ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
1497 needscan = needlog = 0;
1498 oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
1499 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1500 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
1501 ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest);
1502 /*
1503 * Mark the former data entry unused.
1504 */
1505 xfs_dir2_data_make_free(tp, dbp,
1506 (xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
1507 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
1508 /*
1509 * We just mark the leaf entry stale by putting a null in it.
1510 */
1511 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
1512 xfs_dir2_leaf_log_header(tp, lbp);
1513 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
1514 xfs_dir2_leaf_log_ents(tp, lbp, index, index);
1515 /*
1516 * Scan the freespace in the data block again if necessary,
1517 * log the data block header if necessary.
1518 */
1519 if (needscan)
1520 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
1521 if (needlog)
1522 xfs_dir2_data_log_header(tp, dbp);
1523 /*
1524 * If the longest freespace in the data block has changed,
1525 * put the new value in the bests table and log that.
1526 */
1527 if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) {
1528 INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT);
1529 xfs_dir2_leaf_log_bests(tp, lbp, db, db);
1530 }
1531 xfs_dir2_data_check(dp, dbp);
1532 /*
1533 * If the data block is now empty then get rid of the data block.
1534 */
1535 if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
1536 mp->m_dirblksize - (uint)sizeof(data->hdr)) {
1537 ASSERT(db != mp->m_dirdatablk);
1538 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1539 /*
1540			 * Nope, we can't get rid of it: doing so would have
1541			 * required allocating a bmap btree block.
1542 * Just go on, returning success, leaving the
1543 * empty block in place.
1544 */
1545 if (error == ENOSPC && args->total == 0) {
1546 xfs_da_buf_done(dbp);
1547 error = 0;
1548 }
1549 xfs_dir2_leaf_check(dp, lbp);
1550 xfs_da_buf_done(lbp);
1551 return error;
1552 }
1553 dbp = NULL;
1554 /*
1555 * If this is the last data block then compact the
1556 * bests table by getting rid of entries.
1557 */
1558 if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) {
1559 /*
1560 * Look for the last active entry (i).
1561 */
1562 for (i = db - 1; i > 0; i--) {
1563 if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF)
1564 break;
1565 }
1566 /*
1567 * Copy the table down so inactive entries at the
1568 * end are removed.
1569 */
1570 memmove(&bestsp[db - i], bestsp,
1571 (INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp));
1572 INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i));
1573 xfs_dir2_leaf_log_tail(tp, lbp);
1574 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1575 } else
1576 INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF);
1577 }
1578 /*
1579 * If the data block was not the first one, drop it.
1580 */
1581 else if (db != mp->m_dirdatablk && dbp != NULL) {
1582 xfs_da_buf_done(dbp);
1583 dbp = NULL;
1584 }
1585 xfs_dir2_leaf_check(dp, lbp);
1586 /*
1587 * See if we can convert to block form.
1588 */
1589 return xfs_dir2_leaf_to_block(args, lbp, dbp);
1590}
1591
1592/*
1593 * Replace the inode number in a leaf format directory entry.
1594 */
1595int /* error */
1596xfs_dir2_leaf_replace(
1597 xfs_da_args_t *args) /* operation arguments */
1598{
1599 xfs_dabuf_t *dbp; /* data block buffer */
1600 xfs_dir2_data_entry_t *dep; /* data block entry */
1601 xfs_inode_t *dp; /* incore directory inode */
1602 int error; /* error return code */
1603 int index; /* index of leaf entry */
1604 xfs_dabuf_t *lbp; /* leaf buffer */
1605 xfs_dir2_leaf_t *leaf; /* leaf structure */
1606 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1607 xfs_trans_t *tp; /* transaction pointer */
1608
1609 xfs_dir2_trace_args("leaf_replace", args);
1610 /*
1611 * Look up the entry.
1612 */
1613 if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
1614 return error;
1615 }
1616 dp = args->dp;
1617 leaf = lbp->data;
1618 /*
1619 * Point to the leaf entry, get data address from it.
1620 */
1621 lep = &leaf->ents[index];
1622 /*
1623 * Point to the data entry.
1624 */
1625 dep = (xfs_dir2_data_entry_t *)
1626 ((char *)dbp->data +
1627 XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
1628 ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
1629 /*
1630 * Put the new inode number in, log it.
1631 */
1632 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
1633 tp = args->trans;
1634 xfs_dir2_data_log_entry(tp, dbp, dep);
1635 xfs_da_buf_done(dbp);
1636 xfs_dir2_leaf_check(dp, lbp);
1637 xfs_da_brelse(tp, lbp);
1638 return 0;
1639}
1640
1641/*
1642 * Return index in the leaf block (lbp) which is either the first
1643 * one with this hash value, or if there are none, the insert point
1644 * for that hash value.
1645 */
1646int /* index value */
1647xfs_dir2_leaf_search_hash(
1648 xfs_da_args_t *args, /* operation arguments */
1649 xfs_dabuf_t *lbp) /* leaf buffer */
1650{
1651 xfs_dahash_t hash=0; /* hash from this entry */
1652 xfs_dahash_t hashwant; /* hash value looking for */
1653 int high; /* high leaf index */
1654 int low; /* low leaf index */
1655 xfs_dir2_leaf_t *leaf; /* leaf structure */
1656 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
1657 int mid=0; /* current leaf index */
1658
1659 leaf = lbp->data;
1660#ifndef __KERNEL__
1661 if (!leaf->hdr.count)
1662 return 0;
1663#endif
1664 /*
1665 * Note, the table cannot be empty, so we have to go through the loop.
1666 * Binary search the leaf entries looking for our hash value.
1667 */
1668 for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1,
1669 hashwant = args->hashval;
1670 low <= high; ) {
1671 mid = (low + high) >> 1;
1672 if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant)
1673 break;
1674 if (hash < hashwant)
1675 low = mid + 1;
1676 else
1677 high = mid - 1;
1678 }
1679 /*
1680 * Found one, back up through all the equal hash values.
1681 */
1682 if (hash == hashwant) {
1683 while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) {
1684 mid--;
1685 }
1686 }
1687 /*
1688 * Need to point to an entry higher than ours.
1689 */
1690 else if (hash < hashwant)
1691 mid++;
1692 return mid;
1693}
1694
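/*
 * Editorial sketch, not part of the original source: the search in
 * xfs_dir2_leaf_search_hash above as a standalone function over a
 * sorted array of hash values.  Find any match by binary search, back
 * up to the first of a run of equal values, or return the insertion
 * point if there is no match.  Like the original, it assumes
 * count > 0.  Names are hypothetical.
 */
static int				/* index value */
search_first_hash(
	const __uint32_t *hashes,	/* sorted hash values */
	int		count,		/* number of values, > 0 */
	__uint32_t	want)		/* hash value to find */
{
	int		low = 0;	/* low index */
	int		high = count - 1; /* high index */
	int		mid = 0;	/* probe index */
	__uint32_t	hash = 0;	/* probed value */

	while (low <= high) {
		mid = (low + high) >> 1;
		if ((hash = hashes[mid]) == want)
			break;
		if (hash < want)
			low = mid + 1;
		else
			high = mid - 1;
	}
	/*
	 * Found one: back up through all the equal hash values.
	 */
	if (hash == want) {
		while (mid > 0 && hashes[mid - 1] == want)
			mid--;
	}
	/*
	 * Not found: point past the last entry smaller than ours.
	 */
	else if (hash < want)
		mid++;
	return mid;
}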
1695/*
1696 * Trim off a trailing data block. We know it's empty since the leaf
1697 * freespace table says so.
1698 */
1699int /* error */
1700xfs_dir2_leaf_trim_data(
1701 xfs_da_args_t *args, /* operation arguments */
1702 xfs_dabuf_t *lbp, /* leaf buffer */
1703 xfs_dir2_db_t db) /* data block number */
1704{
1705 xfs_dir2_data_off_t *bestsp; /* leaf bests table */
1706#ifdef DEBUG
1707 xfs_dir2_data_t *data; /* data block structure */
1708#endif
1709 xfs_dabuf_t *dbp; /* data block buffer */
1710 xfs_inode_t *dp; /* incore directory inode */
1711 int error; /* error return value */
1712 xfs_dir2_leaf_t *leaf; /* leaf structure */
1713 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
1714 xfs_mount_t *mp; /* filesystem mount point */
1715 xfs_trans_t *tp; /* transaction pointer */
1716
1717 dp = args->dp;
1718 mp = dp->i_mount;
1719 tp = args->trans;
1720 /*
1721 * Read the offending data block. We need its buffer.
1722 */
1723 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp,
1724 XFS_DATA_FORK))) {
1725 return error;
1726 }
1727#ifdef DEBUG
1728 data = dbp->data;
1729 ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
1730#endif
1731	/*
1732	 * This seems to be an error: data is only valid if DEBUG
1733	 * is defined?  (It is referenced only in ASSERTs, which
1734	 * also compile away without DEBUG.)  RMC 09/08/1999
1735	 */
1736 leaf = lbp->data;
1737 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1738 ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
1739 mp->m_dirblksize - (uint)sizeof(data->hdr));
1740 ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1741 /*
1742 * Get rid of the data block.
1743 */
1744 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1745 ASSERT(error != ENOSPC);
1746 xfs_da_brelse(tp, dbp);
1747 return error;
1748 }
1749 /*
1750 * Eliminate the last bests entry from the table.
1751 */
1752 bestsp = XFS_DIR2_LEAF_BESTS_P(ltp);
1753 INT_MOD(ltp->bestcount, ARCH_CONVERT, -1);
1754 memmove(&bestsp[1], &bestsp[0], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp));
1755 xfs_dir2_leaf_log_tail(tp, lbp);
1756 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1757 return 0;
1758}
1759
1760/*
1761 * Convert node form directory to leaf form directory.
1762 * The root of the node form dir needs to already be a LEAFN block.
1763 * Just return if we can't do anything.
1764 */
1765int /* error */
1766xfs_dir2_node_to_leaf(
1767 xfs_da_state_t *state) /* directory operation state */
1768{
1769 xfs_da_args_t *args; /* operation arguments */
1770 xfs_inode_t *dp; /* incore directory inode */
1771 int error; /* error return code */
1772 xfs_dabuf_t *fbp; /* buffer for freespace block */
1773 xfs_fileoff_t fo; /* freespace file offset */
1774 xfs_dir2_free_t *free; /* freespace structure */
1775 xfs_dabuf_t *lbp; /* buffer for leaf block */
1776 xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */
1777 xfs_dir2_leaf_t *leaf; /* leaf structure */
1778 xfs_mount_t *mp; /* filesystem mount point */
1779 int rval; /* successful free trim? */
1780 xfs_trans_t *tp; /* transaction pointer */
1781
1782 /*
1783 * There's more than a leaf level in the btree, so there must
1784 * be multiple leafn blocks. Give up.
1785 */
1786 if (state->path.active > 1)
1787 return 0;
1788 args = state->args;
1789 xfs_dir2_trace_args("node_to_leaf", args);
1790 mp = state->mp;
1791 dp = args->dp;
1792 tp = args->trans;
1793 /*
1794 * Get the last offset in the file.
1795 */
1796 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
1797 return error;
1798 }
1799 fo -= mp->m_dirblkfsbs;
1800 /*
1801 * If there are freespace blocks other than the first one,
1802 * take this opportunity to remove trailing empty freespace blocks
1803 * that may have been left behind during no-space-reservation
1804 * operations.
1805 */
1806 while (fo > mp->m_dirfreeblk) {
1807 if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
1808 return error;
1809 }
1810 if (rval)
1811 fo -= mp->m_dirblkfsbs;
1812 else
1813 return 0;
1814 }
1815 /*
1816 * Now find the block just before the freespace block.
1817 */
1818 if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
1819 return error;
1820 }
1821 /*
1822 * If it's not the single leaf block, give up.
1823 */
1824 if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
1825 return 0;
1826 lbp = state->path.blk[0].bp;
1827 leaf = lbp->data;
1828 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1829 /*
1830 * Read the freespace block.
1831 */
1832 if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
1833 XFS_DATA_FORK))) {
1834 return error;
1835 }
1836 free = fbp->data;
1837 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1838 ASSERT(!free->hdr.firstdb);
1839 /*
1840 * Now see if the leafn and free data will fit in a leaf1.
1841 * If not, release the buffer and give up.
1842 */
1843 if ((uint)sizeof(leaf->hdr) +
1844 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) +
1845 INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) +
1846 (uint)sizeof(leaf->tail) >
1847 mp->m_dirblksize) {
1848 xfs_da_brelse(tp, fbp);
1849 return 0;
1850 }
1851 /*
1852 * If the leaf has any stale entries in it, compress them out.
1853 * The compact routine will log the header.
1854 */
1855 if (INT_GET(leaf->hdr.stale, ARCH_CONVERT))
1856 xfs_dir2_leaf_compact(args, lbp);
1857 else
1858 xfs_dir2_leaf_log_header(tp, lbp);
1859 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC);
1860 /*
1861 * Set up the leaf tail from the freespace block.
1862 */
1863 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
1864 INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT);
1865 /*
1866 * Set up the leaf bests table.
1867 */
1868 memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests,
1869 INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0]));
1870 xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
1871 xfs_dir2_leaf_log_tail(tp, lbp);
1872 xfs_dir2_leaf_check(dp, lbp);
1873 /*
1874 * Get rid of the freespace block.
1875 */
1876 error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
1877 if (error) {
1878 /*
1879		 * This can't fail with ENOSPC, because that can only happen
1880		 * when punching out the middle of an extent, and this is an
1881		 * isolated block.
1882 */
1883 ASSERT(error != ENOSPC);
1884 return error;
1885 }
1886 fbp = NULL;
1887 /*
1888 * Now see if we can convert the single-leaf directory
1889 * down to a block form directory.
1890 * This routine always kills the dabuf for the leaf, so
1891 * eliminate it from the path.
1892 */
1893 error = xfs_dir2_leaf_to_block(args, lbp, NULL);
1894 state->path.blk[0].bp = NULL;
1895 return error;
1896}
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
new file mode 100644
index 000000000000..7f20eee56a52
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.h
@@ -0,0 +1,360 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_LEAF_H__
33#define __XFS_DIR2_LEAF_H__
34
35/*
36 * Directory version 2, leaf block structures.
37 */
38
39struct uio;
40struct xfs_dabuf;
41struct xfs_da_args;
42struct xfs_inode;
43struct xfs_mount;
44struct xfs_trans;
45
46/*
47 * Constants.
48 */
49
50/*
51 * Offset of the leaf/node space. First block in this space
52 * is the btree root.
53 */
54#define XFS_DIR2_LEAF_SPACE 1
55#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
56#define XFS_DIR2_LEAF_FIRSTDB(mp) \
57 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET)
58
59/*
60 * Types.
61 */
62
63/*
64 * Offset in data space of a data entry.
65 */
66typedef __uint32_t xfs_dir2_dataptr_t;
67#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff)
68#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0)
69
70/*
71 * Structures.
72 */
73
74/*
75 * Leaf block header.
76 */
77typedef struct xfs_dir2_leaf_hdr {
78 xfs_da_blkinfo_t info; /* header for da routines */
79 __uint16_t count; /* count of entries */
80 __uint16_t stale; /* count of stale entries */
81} xfs_dir2_leaf_hdr_t;
82
83/*
84 * Leaf block entry.
85 */
86typedef struct xfs_dir2_leaf_entry {
87 xfs_dahash_t hashval; /* hash value of name */
88 xfs_dir2_dataptr_t address; /* address of data entry */
89} xfs_dir2_leaf_entry_t;
90
91/*
92 * Leaf block tail.
93 */
94typedef struct xfs_dir2_leaf_tail {
95 __uint32_t bestcount;
96} xfs_dir2_leaf_tail_t;
97
98/*
99 * Leaf block.
100 * bests and tail are at the end of the block for single-leaf only
101 * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
102 */
103typedef struct xfs_dir2_leaf {
104 xfs_dir2_leaf_hdr_t hdr; /* leaf header */
105 xfs_dir2_leaf_entry_t ents[1]; /* entries */
106 /* ... */
107 xfs_dir2_data_off_t bests[1]; /* best free counts */
108 xfs_dir2_leaf_tail_t tail; /* leaf tail */
109} xfs_dir2_leaf_t;
110
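/*
 * Editorial illustration, not part of the original source: layout of
 * a leaf1 block.  The ents array grows toward higher offsets from the
 * header, while bests and tail sit at the very end of the block, so
 * the free space between them shrinks from both sides.
 *
 *	+----------------------------------+  offset 0
 *	| hdr: info, count, stale           |
 *	+----------------------------------+
 *	| ents[0] .. ents[count - 1]        |  grows toward higher offsets
 *	+----------------------------------+
 *	| (free space)                      |
 *	+----------------------------------+
 *	| bests[0] .. bests[bestcount - 1]  |  base moves down as it grows
 *	+----------------------------------+
 *	| tail: bestcount                   |  ends at offset m_dirblksize
 *	+----------------------------------+
 */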
111/*
112 * Macros.
113 * The DB blocks are logical directory block numbers, not filesystem blocks.
114 */
115
116#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
117int
118xfs_dir2_max_leaf_ents(struct xfs_mount *mp);
119#define XFS_DIR2_MAX_LEAF_ENTS(mp) \
120 xfs_dir2_max_leaf_ents(mp)
121#else
122#define XFS_DIR2_MAX_LEAF_ENTS(mp) \
123 ((int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / \
124 (uint)sizeof(xfs_dir2_leaf_entry_t)))
125#endif
126
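/*
 * Editorial worked example, assuming a 4096-byte directory block,
 * sizeof(xfs_dir2_leaf_hdr_t) == 16 and sizeof(xfs_dir2_leaf_entry_t)
 * == 8: XFS_DIR2_MAX_LEAF_ENTS = (4096 - 16) / 8 = 510.  As the
 * consistency-check comment in xfs_dir2_leaf.c notes, this bound does
 * not account for the bests table and tail of a leaf1 block.
 */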
127/*
128 * Get address of the bestcount field in the single-leaf block.
129 */
130#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_TAIL_P)
131xfs_dir2_leaf_tail_t *
132xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp);
133#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \
134 xfs_dir2_leaf_tail_p(mp, lp)
135#else
136#define XFS_DIR2_LEAF_TAIL_P(mp,lp) \
137 ((xfs_dir2_leaf_tail_t *)\
138 ((char *)(lp) + (mp)->m_dirblksize - \
139 (uint)sizeof(xfs_dir2_leaf_tail_t)))
140#endif
141
142/*
143 * Get address of the bests array in the single-leaf block.
144 */
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_BESTS_P)
146xfs_dir2_data_off_t *
147xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp);
148#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp)
149#else
150#define XFS_DIR2_LEAF_BESTS_P(ltp) \
151 ((xfs_dir2_data_off_t *)(ltp) - INT_GET((ltp)->bestcount, ARCH_CONVERT))
152#endif
153
154/*
155 * Convert dataptr to byte in file space
156 */
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
158xfs_dir2_off_t
159xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
160#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
161#else
162#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) \
163 ((xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG)
164#endif
165
166/*
167 * Convert byte in file space to dataptr. It had better be aligned.
168 */
169#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
170xfs_dir2_dataptr_t
171xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by);
172#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
173#else
174#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) \
175 ((xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG))
176#endif
177
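/*
 * Editorial worked example, assuming XFS_DIR2_DATA_ALIGN_LOG == 3
 * (8-byte alignment): byte offset 0x20158 packs to dataptr
 * 0x20158 >> 3 == 0x402b, and 0x402b << 3 == 0x20158 recovers it.
 * An unaligned byte offset would lose its low bits in the shift,
 * which is why dataptr-addressed entries must be 8-byte aligned.
 */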
178/*
179 * Convert dataptr to a block number
180 */
181#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
182xfs_dir2_db_t
183xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
184#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp)
185#else
186#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) \
187 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
188#endif
189
190/*
191 * Convert dataptr to a byte offset in a block
192 */
193#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
194xfs_dir2_data_aoff_t
195xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
196#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp)
197#else
198#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) \
199 XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
200#endif
201
202/*
203 * Convert block and offset to byte in space
204 */
205#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
206xfs_dir2_off_t
207xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
208 xfs_dir2_data_aoff_t o);
209#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
210 xfs_dir2_db_off_to_byte(mp, db, o)
211#else
212#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \
213 (((xfs_dir2_off_t)(db) << \
214 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o))
215#endif
216
217/*
218 * Convert byte in space to (DB) block
219 */
220#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DB)
221xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by);
222#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by)
223#else
224#define XFS_DIR2_BYTE_TO_DB(mp,by) \
225 ((xfs_dir2_db_t)((by) >> \
226 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)))
227#endif
228
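/*
 * Editorial worked example, assuming sb_blocklog == 12 and
 * sb_dirblklog == 0 (4096-byte directory blocks): byte offset 0x20158
 * splits into db = 0x20158 >> 12 == 0x20, with the in-block offset
 * 0x20158 & 0xfff == 0x158 (see XFS_DIR2_BYTE_TO_OFF below);
 * XFS_DIR2_DB_OFF_TO_BYTE reassembles (0x20 << 12) + 0x158 == 0x20158.
 */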
229/*
230 * Convert byte in space to (DA) block
231 */
232#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DA)
233xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by);
234#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by)
235#else
236#define XFS_DIR2_BYTE_TO_DA(mp,by) \
237 XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by))
238#endif
239
240/*
241 * Convert byte in space to offset in a block
242 */
243#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_OFF)
244xfs_dir2_data_aoff_t
245xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by);
246#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by)
247#else
248#define XFS_DIR2_BYTE_TO_OFF(mp,by) \
249 ((xfs_dir2_data_aoff_t)((by) & \
250 ((1 << ((mp)->m_sb.sb_blocklog + \
251 (mp)->m_sb.sb_dirblklog)) - 1)))
252#endif
253
254/*
255 * Convert block and offset to dataptr
256 */
257#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
258xfs_dir2_dataptr_t
259xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
260 xfs_dir2_data_aoff_t o);
261#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
262 xfs_dir2_db_off_to_dataptr(mp, db, o)
263#else
264#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \
265 XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o))
266#endif
267
268/*
269 * Convert block (DB) to block (dablk)
270 */
271#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_DA)
272xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db);
273#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db)
274#else
275#define XFS_DIR2_DB_TO_DA(mp,db) \
276 ((xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog))
277#endif
278
279/*
280 * Convert block (dablk) to block (DB)
281 */
282#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_DB)
283xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da);
284#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da)
285#else
286#define XFS_DIR2_DA_TO_DB(mp,da) \
287 ((xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog))
288#endif
289
290/*
291 * Convert block (dablk) to byte offset in space
292 */
293#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_BYTE)
294xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da);
295#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da)
296#else
297#define XFS_DIR2_DA_TO_BYTE(mp,da) \
298 XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0)
299#endif
300
301/*
302 * Function declarations.
303 */
304
305extern int
306 xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_dabuf *dbp);
307
308extern int
309 xfs_dir2_leaf_addname(struct xfs_da_args *args);
310
311extern void
312 xfs_dir2_leaf_compact(struct xfs_da_args *args, struct xfs_dabuf *bp);
313
314extern void
315 xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
316 int *lowstalep, int *highstalep, int *lowlogp,
317 int *highlogp);
318
319extern int
320 xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
321 struct uio *uio, int *eofp, struct xfs_dirent *dbp,
322 xfs_dir2_put_t put);
323
324extern int
325 xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
326 struct xfs_dabuf **bpp, int magic);
327
328extern void
329 xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
330 int first, int last);
331
332extern void
333 xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
334 int first, int last);
335
336extern void
337 xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
338
339extern void
340 xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
341
342extern int
343 xfs_dir2_leaf_lookup(struct xfs_da_args *args);
344
345extern int
346 xfs_dir2_leaf_removename(struct xfs_da_args *args);
347
348extern int
349 xfs_dir2_leaf_replace(struct xfs_da_args *args);
350
351extern int
352 xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
353 struct xfs_dabuf *lbp);
354extern int
355 xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_dabuf *lbp, xfs_dir2_db_t db);
356
357extern int
358 xfs_dir2_node_to_leaf(struct xfs_da_state *state);
359
360#endif /* __XFS_DIR2_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..a7615d86bfb7
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.c
@@ -0,0 +1,2020 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_node.c
35 * XFS directory implementation, version 2, node form files
36 * See data structures in xfs_dir2_node.h and xfs_da_btree.h.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_bmap_btree.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_da_btree.h"
59#include "xfs_dir2_data.h"
60#include "xfs_dir2_leaf.h"
61#include "xfs_dir2_block.h"
62#include "xfs_dir2_node.h"
63#include "xfs_dir2_trace.h"
64#include "xfs_error.h"
65
66/*
67 * Function declarations.
68 */
69static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
70static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
71#ifdef DEBUG
72static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
73#else
74#define xfs_dir2_leafn_check(dp, bp)
75#endif
76static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
77 int start_s, xfs_dabuf_t *bp_d, int start_d,
78 int count);
79static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
80 xfs_da_state_blk_t *blk1,
81 xfs_da_state_blk_t *blk2);
82static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
83 int index, xfs_da_state_blk_t *dblk,
84 int *rval);
85static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
86 xfs_da_state_blk_t *fblk);
87
88/*
89 * Log entries from a freespace block.
90 */
91void
92xfs_dir2_free_log_bests(
93 xfs_trans_t *tp, /* transaction pointer */
94 xfs_dabuf_t *bp, /* freespace buffer */
95 int first, /* first entry to log */
96 int last) /* last entry to log */
97{
98 xfs_dir2_free_t *free; /* freespace structure */
99
100 free = bp->data;
101 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
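	/*
	 * Log the byte range from the start of bests[first] through
	 * the last byte of bests[last], as offsets within the block.
	 */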
102 xfs_da_log_buf(tp, bp,
103 (uint)((char *)&free->bests[first] - (char *)free),
104 (uint)((char *)&free->bests[last] - (char *)free +
105 sizeof(free->bests[0]) - 1));
106}
107
108/*
109 * Log header from a freespace block.
110 */
111static void
112xfs_dir2_free_log_header(
113 xfs_trans_t *tp, /* transaction pointer */
114 xfs_dabuf_t *bp) /* freespace buffer */
115{
116 xfs_dir2_free_t *free; /* freespace structure */
117
118 free = bp->data;
119 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
120 xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
121 (uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
122}
123
124/*
125 * Convert a leaf-format directory to a node-format directory.
126 * We need to change the magic number of the leaf block, and copy
127 * the freespace table out of the leaf block into its own block.
128 */
129int /* error */
130xfs_dir2_leaf_to_node(
131 xfs_da_args_t *args, /* operation arguments */
132 xfs_dabuf_t *lbp) /* leaf buffer */
133{
134 xfs_inode_t *dp; /* incore directory inode */
135 int error; /* error return value */
136 xfs_dabuf_t *fbp; /* freespace buffer */
137 xfs_dir2_db_t fdb; /* freespace block number */
138 xfs_dir2_free_t *free; /* freespace structure */
139 xfs_dir2_data_off_t *from; /* pointer to freespace entry */
140 int i; /* leaf freespace index */
141 xfs_dir2_leaf_t *leaf; /* leaf structure */
142 xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
143 xfs_mount_t *mp; /* filesystem mount point */
144 int n; /* count of live freespc ents */
145 xfs_dir2_data_off_t off; /* freespace entry value */
146 xfs_dir2_data_off_t *to; /* pointer to freespace entry */
147 xfs_trans_t *tp; /* transaction pointer */
148
149 xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
150 dp = args->dp;
151 mp = dp->i_mount;
152 tp = args->trans;
153 /*
154 * Add a freespace block to the directory.
155 */
156 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
157 return error;
158 }
159 ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
160 /*
161 * Get the buffer for the new freespace block.
162 */
163 if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp,
164 XFS_DATA_FORK))) {
165 return error;
166 }
167 ASSERT(fbp != NULL);
168 free = fbp->data;
169 leaf = lbp->data;
170 ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
171 /*
172 * Initialize the freespace block header.
173 */
174 INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
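	/* Zero is the same in either byte order, so no INT_SET needed. */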
175 free->hdr.firstdb = 0;
176 ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
177 INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT);
178 /*
179 * Copy freespace entries from the leaf block to the new block.
180 * Count active entries.
181 */
182 for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests;
183 i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) {
184 if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF)
185 n++;
186 INT_SET(*to, ARCH_CONVERT, off);
187 }
188 INT_SET(free->hdr.nused, ARCH_CONVERT, n);
189 INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC);
190 /*
191 * Log everything.
192 */
193 xfs_dir2_leaf_log_header(tp, lbp);
194 xfs_dir2_free_log_header(tp, fbp);
195 xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1);
196 xfs_da_buf_done(fbp);
197 xfs_dir2_leafn_check(dp, lbp);
198 return 0;
199}
200
201/*
202 * Add a leaf entry to a leaf block in a node-form directory.
203 * The other work necessary is done by the caller.
204 */
205static int /* error */
206xfs_dir2_leafn_add(
207 xfs_dabuf_t *bp, /* leaf buffer */
208 xfs_da_args_t *args, /* operation arguments */
209 int index) /* insertion pt for new entry */
210{
211 int compact; /* compacting stale leaves */
212 xfs_inode_t *dp; /* incore directory inode */
213 int highstale; /* next stale entry */
214 xfs_dir2_leaf_t *leaf; /* leaf structure */
215 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
216 int lfloghigh; /* high leaf entry logging */
217 int lfloglow; /* low leaf entry logging */
218 int lowstale; /* previous stale entry */
219 xfs_mount_t *mp; /* filesystem mount point */
220 xfs_trans_t *tp; /* transaction pointer */
221
222 xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
223 dp = args->dp;
224 mp = dp->i_mount;
225 tp = args->trans;
226 leaf = bp->data;
227
228 /*
229	 * Quick check just to make sure we are not going to index
230	 * into other people's memory.
231 */
232 if (index < 0)
233 return XFS_ERROR(EFSCORRUPTED);
234
235 /*
236	 * If the block already holds the maximum number of leaf entries
237	 * and none of them are stale, the new entry won't fit and the
238	 * caller will do a split.  If there are stale entries we'll do
239	 * a compact.
240 */
241
242 if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) {
243 if (!leaf->hdr.stale)
244 return XFS_ERROR(ENOSPC);
245 compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1;
246 } else
247 compact = 0;
248 ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval);
249 ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
250 INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval);
251
252 if (args->justcheck)
253 return 0;
254
255 /*
256 * Compact out all but one stale leaf entry. Leaves behind
257 * the entry closest to index.
258 */
259 if (compact) {
260 xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
261 &lfloglow, &lfloghigh);
262 }
263 /*
264	 * Set impossible logging indices; the MIN/MAX updates below fix them.
265 */
266 else if (leaf->hdr.stale) {
267 lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
268 lfloghigh = -1;
269 }
270 /*
271 * No stale entries, just insert a space for the new entry.
272 */
273 if (!leaf->hdr.stale) {
274 lep = &leaf->ents[index];
275 if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
276 memmove(lep + 1, lep,
277 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
278 lfloglow = index;
279 lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
280 INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
281 }
282 /*
283 * There are stale entries. We'll use one for the new entry.
284 */
285 else {
286 /*
287 * If we didn't do a compact then we need to figure out
288 * which stale entry will be used.
289 */
290 if (compact == 0) {
291 /*
292 * Find first stale entry before our insertion point.
293 */
294 for (lowstale = index - 1;
295 lowstale >= 0 &&
296 INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
297 XFS_DIR2_NULL_DATAPTR;
298 lowstale--)
299 continue;
300 /*
301 * Find next stale entry after insertion point.
302			 * Stop looking if the result would be farther from
303			 * index than the lowstale entry already found.
304 */
305 for (highstale = index;
306 highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
307 INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
308 XFS_DIR2_NULL_DATAPTR &&
309 (lowstale < 0 ||
310 index - lowstale - 1 >= highstale - index);
311 highstale++)
312 continue;
313 }
314 /*
315 * Using the low stale entry.
316 * Shift entries up toward the stale slot.
317 */
318 if (lowstale >= 0 &&
319 (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
320 index - lowstale - 1 < highstale - index)) {
321 ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
322 XFS_DIR2_NULL_DATAPTR);
323 ASSERT(index - lowstale - 1 >= 0);
324 if (index - lowstale - 1 > 0)
325 memmove(&leaf->ents[lowstale],
326 &leaf->ents[lowstale + 1],
327 (index - lowstale - 1) * sizeof(*lep));
328 lep = &leaf->ents[index - 1];
329 lfloglow = MIN(lowstale, lfloglow);
330 lfloghigh = MAX(index - 1, lfloghigh);
331 }
332 /*
333 * Using the high stale entry.
334 * Shift entries down toward the stale slot.
335 */
336 else {
337 ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
338 XFS_DIR2_NULL_DATAPTR);
339 ASSERT(highstale - index >= 0);
340 if (highstale - index > 0)
341 memmove(&leaf->ents[index + 1],
342 &leaf->ents[index],
343 (highstale - index) * sizeof(*lep));
344 lep = &leaf->ents[index];
345 lfloglow = MIN(index, lfloglow);
346 lfloghigh = MAX(highstale, lfloghigh);
347 }
348 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
349 }
350 /*
351 * Insert the new entry, log everything.
352 */
353 INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
354 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index));
355 xfs_dir2_leaf_log_header(tp, bp);
356 xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
357 xfs_dir2_leafn_check(dp, bp);
358 return 0;
359}
360
361#ifdef DEBUG
362/*
363 * Check internal consistency of a leafn block.
364 */
365void
366xfs_dir2_leafn_check(
367 xfs_inode_t *dp, /* incore directory inode */
368 xfs_dabuf_t *bp) /* leaf buffer */
369{
370 int i; /* leaf index */
371 xfs_dir2_leaf_t *leaf; /* leaf structure */
372 xfs_mount_t *mp; /* filesystem mount point */
373 int stale; /* count of stale leaves */
374
375 leaf = bp->data;
376 mp = dp->i_mount;
377 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
378 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
379 for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
380 if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
381 ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
382 INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
383 }
384 if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
385 stale++;
386 }
387 ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
388}
389#endif /* DEBUG */
390
391/*
392 * Return the last hash value in the leaf.
393 * Stale entries are ok.
394 */
395xfs_dahash_t /* hash value */
396xfs_dir2_leafn_lasthash(
397 xfs_dabuf_t *bp, /* leaf buffer */
398 int *count) /* count of entries in leaf */
399{
400 xfs_dir2_leaf_t *leaf; /* leaf structure */
401
402 leaf = bp->data;
403 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
404 if (count)
405 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
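	/*
	 * A raw test of the on-disk count against zero is endian-safe.
	 */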
406 if (!leaf->hdr.count)
407 return 0;
408 return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
409}
410
411/*
412 * Look up a leaf entry in a node-format leaf block.
413 * If this is an addname then the extrablk in state is a freespace block,
414 * otherwise it's a data block.
415 */
416int
417xfs_dir2_leafn_lookup_int(
418 xfs_dabuf_t *bp, /* leaf buffer */
419 xfs_da_args_t *args, /* operation arguments */
420 int *indexp, /* out: leaf entry index */
421 xfs_da_state_t *state) /* state to fill in */
422{
423 xfs_dabuf_t *curbp; /* current data/free buffer */
424 xfs_dir2_db_t curdb; /* current data block number */
425 xfs_dir2_db_t curfdb; /* current free block number */
426 xfs_dir2_data_entry_t *dep; /* data block entry */
427 xfs_inode_t *dp; /* incore directory inode */
428 int error; /* error return value */
429 int fi; /* free entry index */
430 xfs_dir2_free_t *free=NULL; /* free block structure */
431 int index; /* leaf entry index */
432 xfs_dir2_leaf_t *leaf; /* leaf structure */
433 int length=0; /* length of new data entry */
434 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
435 xfs_mount_t *mp; /* filesystem mount point */
436 xfs_dir2_db_t newdb; /* new data block number */
437 xfs_dir2_db_t newfdb; /* new free block number */
438 xfs_trans_t *tp; /* transaction pointer */
439
440 dp = args->dp;
441 tp = args->trans;
442 mp = dp->i_mount;
443 leaf = bp->data;
444 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
445#ifdef __KERNEL__
446 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0);
447#endif
448 xfs_dir2_leafn_check(dp, bp);
449 /*
450 * Look up the hash value in the leaf entries.
451 */
452 index = xfs_dir2_leaf_search_hash(args, bp);
453 /*
454 * Do we have a buffer coming in?
455 */
456 if (state->extravalid)
457 curbp = state->extrablk.bp;
458 else
459 curbp = NULL;
460 /*
461 * For addname, it's a free block buffer, get the block number.
462 */
463 if (args->addname) {
464 curfdb = curbp ? state->extrablk.blkno : -1;
465 curdb = -1;
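		/*
		 * Size of the data entry we would add, used for the
		 * freespace checks below.
		 */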
466 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
467 if ((free = (curbp ? curbp->data : NULL)))
468 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
469 }
470 /*
471 * For others, it's a data block buffer, get the block number.
472 */
473 else {
474 curfdb = -1;
475 curdb = curbp ? state->extrablk.blkno : -1;
476 }
477 /*
478 * Loop over leaf entries with the right hash value.
479 */
480 for (lep = &leaf->ents[index];
481 index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
482 lep++, index++) {
483 /*
484 * Skip stale leaf entries.
485 */
486 if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
487 continue;
488 /*
489 * Pull the data block number from the entry.
490 */
491 newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
492 /*
493 * For addname, we're looking for a place to put the new entry.
494 * We want to use a data block with an entry of equal
495 * hash value to ours if there is one with room.
496 */
497 if (args->addname) {
498 /*
499 * If this block isn't the data block we already have
500 * in hand, take a look at it.
501 */
502 if (newdb != curdb) {
503 curdb = newdb;
504 /*
505 * Convert the data block to the free block
506 * holding its freespace information.
507 */
508 newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb);
509 /*
510 * If it's not the one we have in hand,
511 * read it in.
512 */
513 if (newfdb != curfdb) {
514 /*
515 * If we had one before, drop it.
516 */
517 if (curbp)
518 xfs_da_brelse(tp, curbp);
519 /*
520 * Read the free block.
521 */
522 if ((error = xfs_da_read_buf(tp, dp,
523 XFS_DIR2_DB_TO_DA(mp,
524 newfdb),
525 -1, &curbp,
526 XFS_DATA_FORK))) {
527 return error;
528 }
529 curfdb = newfdb;
530 free = curbp->data;
531 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) ==
532 XFS_DIR2_FREE_MAGIC);
533 ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) %
534 XFS_DIR2_MAX_FREE_BESTS(mp)) ==
535 0);
536 ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb);
537 ASSERT(curdb <
538 INT_GET(free->hdr.firstdb, ARCH_CONVERT) +
539 INT_GET(free->hdr.nvalid, ARCH_CONVERT));
540 }
541 /*
542 * Get the index for our entry.
543 */
544 fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb);
545 /*
546			 * A NULLDATAOFF entry here means on-disk corruption;
			 * otherwise, if the block has room, return it.
547 */
548 if (unlikely(INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF)) {
549 XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
550 XFS_ERRLEVEL_LOW, mp);
551 return XFS_ERROR(EFSCORRUPTED);
552 }
553 if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) {
554 *indexp = index;
555 state->extravalid = 1;
556 state->extrablk.bp = curbp;
557 state->extrablk.blkno = curfdb;
558 state->extrablk.index = fi;
559 state->extrablk.magic =
560 XFS_DIR2_FREE_MAGIC;
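					/*
					 * ENOENT plus the saved free
					 * block is the expected result
					 * for an addname lookup.
					 */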
561 ASSERT(args->oknoent);
562 return XFS_ERROR(ENOENT);
563 }
564 }
565 }
566 /*
567 * Not adding a new entry, so we really want to find
568 * the name given to us.
569 */
570 else {
571 /*
572 * If it's a different data block, go get it.
573 */
574 if (newdb != curdb) {
575 /*
576 * If we had a block before, drop it.
577 */
578 if (curbp)
579 xfs_da_brelse(tp, curbp);
580 /*
581 * Read the data block.
582 */
583 if ((error =
584 xfs_da_read_buf(tp, dp,
585 XFS_DIR2_DB_TO_DA(mp, newdb), -1,
586 &curbp, XFS_DATA_FORK))) {
587 return error;
588 }
589 xfs_dir2_data_check(dp, curbp);
590 curdb = newdb;
591 }
592 /*
593 * Point to the data entry.
594 */
595 dep = (xfs_dir2_data_entry_t *)
596 ((char *)curbp->data +
597 XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
598 /*
599 * Compare the entry, return it if it matches.
600 */
601 if (dep->namelen == args->namelen &&
602 dep->name[0] == args->name[0] &&
603 memcmp(dep->name, args->name, args->namelen) == 0) {
604 args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
605 *indexp = index;
606 state->extravalid = 1;
607 state->extrablk.bp = curbp;
608 state->extrablk.blkno = curdb;
609 state->extrablk.index =
610 (int)((char *)dep -
611 (char *)curbp->data);
612 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
613 return XFS_ERROR(EEXIST);
614 }
615 }
616 }
617 /*
618 * Didn't find a match.
619 * If we are holding a buffer, give it back in case our caller
620 * finds it useful.
621 */
622 if ((state->extravalid = (curbp != NULL))) {
623 state->extrablk.bp = curbp;
624 state->extrablk.index = -1;
625 /*
626 * For addname, giving back a free block.
627 */
628 if (args->addname) {
629 state->extrablk.blkno = curfdb;
630 state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
631 }
632 /*
633 * For other callers, giving back a data block.
634 */
635 else {
636 state->extrablk.blkno = curdb;
637 state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
638 }
639 }
640 /*
641 * Return the final index, that will be the insertion point.
642 */
643 *indexp = index;
644 ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
645 return XFS_ERROR(ENOENT);
646}
647
648/*
649 * Move count leaf entries from source to destination leaf.
650 * Log entries and headers. Stale entries are preserved.
651 */
652static void
653xfs_dir2_leafn_moveents(
654 xfs_da_args_t *args, /* operation arguments */
655 xfs_dabuf_t *bp_s, /* source leaf buffer */
656 int start_s, /* source leaf index */
657 xfs_dabuf_t *bp_d, /* destination leaf buffer */
658 int start_d, /* destination leaf index */
659 int count) /* count of leaves to copy */
660{
661 xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */
662 xfs_dir2_leaf_t *leaf_s; /* source leaf structure */
663 int stale; /* count stale leaves copied */
664 xfs_trans_t *tp; /* transaction pointer */
665
666 xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
667 start_d, count);
668 /*
669 * Silently return if nothing to do.
670 */
671 if (count == 0) {
672 return;
673 }
674 tp = args->trans;
675 leaf_s = bp_s->data;
676 leaf_d = bp_d->data;
677 /*
678 * If the destination index is not the end of the current
679 * destination leaf entries, open up a hole in the destination
680 * to hold the new entries.
681 */
682 if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) {
683 memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d],
684 (INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) *
685 sizeof(xfs_dir2_leaf_entry_t));
686 xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
687 count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1);
688 }
689 /*
690 * If the source has stale leaves, count the ones in the copy range
691 * so we can update the header correctly.
692 */
693 if (leaf_s->hdr.stale) {
694 int i; /* temp leaf index */
695
696 for (i = start_s, stale = 0; i < start_s + count; i++) {
697 if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
698 stale++;
699 }
700 } else
701 stale = 0;
702 /*
703 * Copy the leaf entries from source to destination.
704 */
705 memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s],
706 count * sizeof(xfs_dir2_leaf_entry_t));
707 xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
708 /*
709 * If there are source entries after the ones we copied,
710 * delete the ones we copied by sliding the next ones down.
711 */
712 if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) {
713 memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count],
714 count * sizeof(xfs_dir2_leaf_entry_t));
715 xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
716 }
717 /*
718 * Update the headers and log them.
719 */
720 INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count));
721 INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale));
722 INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count);
723 INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale);
724 xfs_dir2_leaf_log_header(tp, bp_s);
725 xfs_dir2_leaf_log_header(tp, bp_d);
726 xfs_dir2_leafn_check(args->dp, bp_s);
727 xfs_dir2_leafn_check(args->dp, bp_d);
728}
729
730/*
731 * Determine the sort order of two leaf blocks.
732 * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
733 */
734int /* sort order */
735xfs_dir2_leafn_order(
736 xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */
737 xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */
738{
739 xfs_dir2_leaf_t *leaf1; /* leaf1 structure */
740 xfs_dir2_leaf_t *leaf2; /* leaf2 structure */
741
742 leaf1 = leaf1_bp->data;
743 leaf2 = leaf2_bp->data;
744 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
745 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
746 if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 &&
747 INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 &&
748 (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) ||
749 INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) <
750 INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)))
751 return 1;
752 return 0;
753}
754
755/*
756 * Rebalance leaf entries between two leaf blocks.
757 * This is actually only called when the second block is new,
758 * though the code deals with the general case.
759 * A new entry will be inserted in one of the blocks, and that
760 * entry is taken into account when balancing.
761 */
762static void
763xfs_dir2_leafn_rebalance(
764 xfs_da_state_t *state, /* btree cursor */
765 xfs_da_state_blk_t *blk1, /* first btree block */
766 xfs_da_state_blk_t *blk2) /* second btree block */
767{
768 xfs_da_args_t *args; /* operation arguments */
769 int count; /* count (& direction) leaves */
770 int isleft; /* new goes in left leaf */
771 xfs_dir2_leaf_t *leaf1; /* first leaf structure */
772 xfs_dir2_leaf_t *leaf2; /* second leaf structure */
773 int mid; /* midpoint leaf index */
774#ifdef DEBUG
775 int oldstale; /* old count of stale leaves */
776#endif
777 int oldsum; /* old total leaf count */
778 int swap; /* swapped leaf blocks */
779
780 args = state->args;
781 /*
782 * If the block order is wrong, swap the arguments.
783 */
784 if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
785 xfs_da_state_blk_t *tmp; /* temp for block swap */
786
787 tmp = blk1;
788 blk1 = blk2;
789 blk2 = tmp;
790 }
791 leaf1 = blk1->bp->data;
792 leaf2 = blk2->bp->data;
793 oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT);
794#ifdef DEBUG
795 oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT);
796#endif
797 mid = oldsum >> 1;
798 /*
799	 * If the old total count was odd, the count including the new
800	 * entry is even; use the middle entry's hash to pick its side.
801 */
802 if (oldsum & 1) {
803 xfs_dahash_t midhash; /* middle entry hash value */
804
805 if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT))
806 midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT);
807 else
808 midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT);
809 isleft = args->hashval <= midhash;
810 }
811 /*
812	 * If the old count is even, the total including the new entry
813	 * is odd, so there's no preferred side for it.
814 * Pick the left one.
815 */
816 else
817 isleft = 1;
818 /*
819 * Calculate moved entry count. Positive means left-to-right,
820 * negative means right-to-left. Then move the entries.
821 */
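	/*
	 * The (isleft == 0) term adjusts the split by one entry
	 * according to the side the new entry will be inserted on.
	 */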
822 count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0);
823 if (count > 0)
824 xfs_dir2_leafn_moveents(args, blk1->bp,
825 INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count);
826 else if (count < 0)
827 xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
828			INT_GET(leaf1->hdr.count, ARCH_CONVERT), -count);
829 ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum);
830 ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale);
831 /*
832 * Mark whether we're inserting into the old or new leaf.
833 */
834 if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT))
835 state->inleaf = swap;
836 else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT))
837 state->inleaf = !swap;
838 else
839 state->inleaf =
840 swap ^ (blk1->index <= INT_GET(leaf1->hdr.count, ARCH_CONVERT));
841 /*
842 * Adjust the expected index for insertion.
843 */
844 if (!state->inleaf)
845 blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
846
847 /*
848	 * Finally, sanity check that we are not returning a negative index.
849	 */
850	if (blk2->index < 0) {
851		state->inleaf = 1;
852		blk2->index = 0;
853		cmn_err(CE_ALERT,
854			"xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: "
855 "blk1->index %d\n",
856 blk1->index);
857 }
858}
859
860/*
861 * Remove an entry from a node directory.
862 * This removes the leaf entry and the data entry,
863 * and updates the free block if necessary.
864 */
865static int /* error */
866xfs_dir2_leafn_remove(
867 xfs_da_args_t *args, /* operation arguments */
868 xfs_dabuf_t *bp, /* leaf buffer */
869 int index, /* leaf entry index */
870 xfs_da_state_blk_t *dblk, /* data block */
871 int *rval) /* resulting block needs join */
872{
873 xfs_dir2_data_t *data; /* data block structure */
874 xfs_dir2_db_t db; /* data block number */
875 xfs_dabuf_t *dbp; /* data block buffer */
876 xfs_dir2_data_entry_t *dep; /* data block entry */
877 xfs_inode_t *dp; /* incore directory inode */
878 xfs_dir2_leaf_t *leaf; /* leaf structure */
879 xfs_dir2_leaf_entry_t *lep; /* leaf entry */
880 int longest; /* longest data free entry */
881 int off; /* data block entry offset */
882 xfs_mount_t *mp; /* filesystem mount point */
883 int needlog; /* need to log data header */
884 int needscan; /* need to rescan data frees */
885 xfs_trans_t *tp; /* transaction pointer */
886
887 xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
888 dp = args->dp;
889 tp = args->trans;
890 mp = dp->i_mount;
891 leaf = bp->data;
892 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
893 /*
894 * Point to the entry we're removing.
895 */
896 lep = &leaf->ents[index];
897 /*
898 * Extract the data block and offset from the entry.
899 */
900 db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
901 ASSERT(dblk->blkno == db);
902 off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT));
903 ASSERT(dblk->index == off);
904 /*
905 * Kill the leaf entry by marking it stale.
906 * Log the leaf block changes.
907 */
908 INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
909 xfs_dir2_leaf_log_header(tp, bp);
910 INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
911 xfs_dir2_leaf_log_ents(tp, bp, index, index);
912 /*
913 * Make the data entry free. Keep track of the longest freespace
914 * in the data block in case it changes.
915 */
916 dbp = dblk->bp;
917 data = dbp->data;
918 dep = (xfs_dir2_data_entry_t *)((char *)data + off);
919 longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
920 needlog = needscan = 0;
921 xfs_dir2_data_make_free(tp, dbp, off,
922 XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
923 /*
924 * Rescan the data block freespaces for bestfree.
925 * Log the data block header if needed.
926 */
927 if (needscan)
928 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
929 if (needlog)
930 xfs_dir2_data_log_header(tp, dbp);
931 xfs_dir2_data_check(dp, dbp);
932 /*
933 * If the longest data block freespace changes, need to update
934 * the corresponding freeblock entry.
935 */
936 if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
937 int error; /* error return value */
938 xfs_dabuf_t *fbp; /* freeblock buffer */
939 xfs_dir2_db_t fdb; /* freeblock block number */
940 int findex; /* index in freeblock entries */
941 xfs_dir2_free_t *free; /* freeblock structure */
942 int logfree; /* need to log free entry */
943
944 /*
945		 * Convert the data block number to the freespace block
946		 * that covers it, and read that block in.
947 */
948 fdb = XFS_DIR2_DB_TO_FDB(mp, db);
949 if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb),
950 -1, &fbp, XFS_DATA_FORK))) {
951 return error;
952 }
953 free = fbp->data;
954 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
955 ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) ==
956 XFS_DIR2_MAX_FREE_BESTS(mp) *
957 (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
958 /*
959 * Calculate which entry we need to fix.
960 */
961 findex = XFS_DIR2_DB_TO_FDINDEX(mp, db);
962 longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
963 /*
964		 * If the data block is now empty (nothing left but the
965		 * header), we can get rid of it (usually).
966 */
967 if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
968 /*
969 * Try to punch out the data block.
970 */
971 error = xfs_dir2_shrink_inode(args, db, dbp);
972 if (error == 0) {
973 dblk->bp = NULL;
974 data = NULL;
975 }
976 /*
977 * We can get ENOSPC if there's no space reservation.
978		 * In this case just drop the buffer and someone else
979 * will eventually get rid of the empty block.
980 */
981 else if (error == ENOSPC && args->total == 0)
982 xfs_da_buf_done(dbp);
983 else
984 return error;
985 }
986 /*
987 * If we got rid of the data block, we can eliminate that entry
988 * in the free block.
989 */
990 if (data == NULL) {
991 /*
992 * One less used entry in the free table.
993 */
994 INT_MOD(free->hdr.nused, ARCH_CONVERT, -1);
995 xfs_dir2_free_log_header(tp, fbp);
996 /*
997 * If this was the last entry in the table, we can
998 * trim the table size back. There might be other
999 * entries at the end referring to non-existent
1000 * data blocks, get those too.
1001 */
1002 if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) {
1003 int i; /* free entry index */
1004
1005 for (i = findex - 1;
1006 i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF;
1007 i--)
1008 continue;
1009 INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1);
1010 logfree = 0;
1011 }
1012 /*
1013 * Not the last entry, just punch it out.
1014 */
1015 else {
1016 INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
1017 logfree = 1;
1018 }
1019 /*
1020 * If there are no useful entries left in the block,
1021 * get rid of the block if we can.
1022 */
1023 if (!free->hdr.nused) {
1024 error = xfs_dir2_shrink_inode(args, fdb, fbp);
1025 if (error == 0) {
1026 fbp = NULL;
1027 logfree = 0;
1028 } else if (error != ENOSPC || args->total != 0)
1029 return error;
1030 /*
1031 * It's possible to get ENOSPC if there is no
1032				 * space reservation. In this case someone
1033 * else will eventually get rid of this block.
1034 */
1035 }
1036 }
1037 /*
1038 * Data block is not empty, just set the free entry to
1039 * the new value.
1040 */
1041 else {
1042 INT_SET(free->bests[findex], ARCH_CONVERT, longest);
1043 logfree = 1;
1044 }
1045 /*
1046 * Log the free entry that changed, unless we got rid of it.
1047 */
1048 if (logfree)
1049 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1050 /*
1051 * Drop the buffer if we still have it.
1052 */
1053 if (fbp)
1054 xfs_da_buf_done(fbp);
1055 }
1056 xfs_dir2_leafn_check(dp, bp);
1057 /*
1058	 * Return an indication of whether this leaf block is empty enough
1059 * to justify trying to join it with a neighbor.
1060 */
1061 *rval =
1062 ((uint)sizeof(leaf->hdr) +
1063 (uint)sizeof(leaf->ents[0]) *
1064 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) <
1065 mp->m_dir_magicpct;
1066 return 0;
1067}
1068
1069/*
1070 * Split the leaf entries in the old block into old and new blocks.
1071 */
1072int /* error */
1073xfs_dir2_leafn_split(
1074 xfs_da_state_t *state, /* btree cursor */
1075 xfs_da_state_blk_t *oldblk, /* original block */
1076 xfs_da_state_blk_t *newblk) /* newly created block */
1077{
1078 xfs_da_args_t *args; /* operation arguments */
1079 xfs_dablk_t blkno; /* new leaf block number */
1080 int error; /* error return value */
1081 xfs_mount_t *mp; /* filesystem mount point */
1082
1083 /*
1084 * Allocate space for a new leaf node.
1085 */
1086	args = state->args;
1087	ASSERT(args != NULL);
1088	mp = args->dp->i_mount;
1089 ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
1090 error = xfs_da_grow_inode(args, &blkno);
1091 if (error) {
1092 return error;
1093 }
1094 /*
1095 * Initialize the new leaf block.
1096 */
1097 error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno),
1098 &newblk->bp, XFS_DIR2_LEAFN_MAGIC);
1099 if (error) {
1100 return error;
1101 }
1102 newblk->blkno = blkno;
1103 newblk->magic = XFS_DIR2_LEAFN_MAGIC;
1104 /*
1105	 * Rebalance the entries across the two leaves, then link the
1106	 * new block into the leaf sibling chain.
1107 */
1108 xfs_dir2_leafn_rebalance(state, oldblk, newblk);
1109 error = xfs_da_blk_link(state, oldblk, newblk);
1110 if (error) {
1111 return error;
1112 }
1113 /*
1114 * Insert the new entry in the correct block.
1115 */
1116 if (state->inleaf)
1117 error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
1118 else
1119 error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
1120 /*
1121 * Update last hashval in each block since we added the name.
1122 */
1123 oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
1124 newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
1125 xfs_dir2_leafn_check(args->dp, oldblk->bp);
1126 xfs_dir2_leafn_check(args->dp, newblk->bp);
1127 return error;
1128}
1129
1130/*
1131 * Check a leaf block and its neighbors to see if the block should be
1132 * collapsed into one or the other neighbor. Always keep the block
1133 * with the smaller block number.
1134 * If the current block is over 50% full, don't try to join it, return 0.
1135 * If the block is empty, fill in the state structure and return 2.
1136 * If it can be collapsed, fill in the state structure and return 1.
1137 * If nothing can be done, return 0.
1138 */
1139int /* error */
1140xfs_dir2_leafn_toosmall(
1141 xfs_da_state_t *state, /* btree cursor */
1142 int *action) /* resulting action to take */
1143{
1144 xfs_da_state_blk_t *blk; /* leaf block */
1145 xfs_dablk_t blkno; /* leaf block number */
1146 xfs_dabuf_t *bp; /* leaf buffer */
1147 int bytes; /* bytes in use */
1148 int count; /* leaf live entry count */
1149 int error; /* error return value */
1150 int forward; /* sibling block direction */
1151 int i; /* sibling counter */
1152 xfs_da_blkinfo_t *info; /* leaf block header */
1153 xfs_dir2_leaf_t *leaf; /* leaf structure */
1154 int rval; /* result from path_shift */
1155
1156 /*
1157 * Check for the degenerate case of the block being over 50% full.
1158 * If so, it's not worth even looking to see if we might be able
1159 * to coalesce with a sibling.
1160 */
1161 blk = &state->path.blk[state->path.active - 1];
1162 info = blk->bp->data;
1163 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1164 leaf = (xfs_dir2_leaf_t *)info;
1165 count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
1166 bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
1167 if (bytes > (state->blocksize >> 1)) {
1168 /*
1169 * Blk over 50%, don't try to join.
1170 */
1171 *action = 0;
1172 return 0;
1173 }
1174 /*
1175 * Check for the degenerate case of the block being empty.
1176 * If the block is empty, we'll simply delete it, no need to
1177 * coalesce it with a sibling block. We choose (arbitrarily)
1178 * to merge with the forward block unless it is NULL.
1179 */
1180 if (count == 0) {
1181 /*
1182 * Make altpath point to the block we want to keep and
1183 * path point to the block we want to drop (this one).
1184 */
1185 forward = info->forw;
1186 memcpy(&state->altpath, &state->path, sizeof(state->path));
1187 error = xfs_da_path_shift(state, &state->altpath, forward, 0,
1188 &rval);
1189 if (error)
1190 return error;
1191 *action = rval ? 2 : 0;
1192 return 0;
1193 }
1194 /*
1195 * Examine each sibling block to see if we can coalesce with
1196 * at least 25% free space to spare. We need to figure out
1197 * whether to merge with the forward or the backward block.
1198 * We prefer coalescing with the lower numbered sibling so as
1199 * to shrink a directory over time.
1200 */
1201 forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT);
1202 for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
1203		blkno = forward ? INT_GET(info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT);
1204 if (blkno == 0)
1205 continue;
1206 /*
1207 * Read the sibling leaf block.
1208 */
1209 if ((error =
1210 xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
1211 -1, &bp, XFS_DATA_FORK))) {
1212 return error;
1213 }
1214 ASSERT(bp != NULL);
1215 /*
1216 * Count bytes in the two blocks combined.
1217 */
1218 leaf = (xfs_dir2_leaf_t *)info;
1219 count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
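		/*
		 * Start from 75% of the block size, so a non-negative
		 * result below means at least 25% would be left over.
		 */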
1220 bytes = state->blocksize - (state->blocksize >> 2);
1221 leaf = bp->data;
1222 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1223 count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
1224 bytes -= count * (uint)sizeof(leaf->ents[0]);
1225 /*
1226 * Fits with at least 25% to spare.
1227 */
1228 if (bytes >= 0)
1229 break;
1230 xfs_da_brelse(state->args->trans, bp);
1231 }
1232 /*
1233 * Didn't like either block, give up.
1234 */
1235 if (i >= 2) {
1236 *action = 0;
1237 return 0;
1238 }
1239 /*
1240 * Done with the sibling leaf block here, drop the dabuf
1241 * so path_shift can get it.
1242 */
1243 xfs_da_buf_done(bp);
1244 /*
1245 * Make altpath point to the block we want to keep (the lower
1246 * numbered block) and path point to the block we want to drop.
1247 */
1248 memcpy(&state->altpath, &state->path, sizeof(state->path));
1249 if (blkno < blk->blkno)
1250 error = xfs_da_path_shift(state, &state->altpath, forward, 0,
1251 &rval);
1252 else
1253 error = xfs_da_path_shift(state, &state->path, forward, 0,
1254 &rval);
1255 if (error) {
1256 return error;
1257 }
1258 *action = rval ? 0 : 1;
1259 return 0;
1260}
1261
1262/*
1263 * Move all the leaf entries from drop_blk to save_blk.
1264 * This is done as part of a join operation.
1265 */
1266void
1267xfs_dir2_leafn_unbalance(
1268 xfs_da_state_t *state, /* cursor */
1269 xfs_da_state_blk_t *drop_blk, /* dead block */
1270 xfs_da_state_blk_t *save_blk) /* surviving block */
1271{
1272 xfs_da_args_t *args; /* operation arguments */
1273 xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */
1274 xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */
1275
1276 args = state->args;
1277 ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1278 ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
1279 drop_leaf = drop_blk->bp->data;
1280 save_leaf = save_blk->bp->data;
1281 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1282 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
1283 /*
1284 * If there are any stale leaf entries, take this opportunity
1285 * to purge them.
1286 */
1287 if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT))
1288 xfs_dir2_leaf_compact(args, drop_blk->bp);
1289 if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT))
1290 xfs_dir2_leaf_compact(args, save_blk->bp);
1291 /*
1292 * Move the entries from drop to the appropriate end of save.
1293 */
1294 drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1295 if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
1296 xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
1297 INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
1298 else
1299 xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
1300 INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
1301 save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
1302 xfs_dir2_leafn_check(args->dp, save_blk->bp);
1303}
1304
1305/*
1306 * Top-level node form directory addname routine.
1307 */
1308int /* error */
1309xfs_dir2_node_addname(
1310 xfs_da_args_t *args) /* operation arguments */
1311{
1312 xfs_da_state_blk_t *blk; /* leaf block for insert */
1313 int error; /* error return value */
1314 int rval; /* sub-return value */
1315 xfs_da_state_t *state; /* btree cursor */
1316
1317 xfs_dir2_trace_args("node_addname", args);
1318 /*
1319 * Allocate and initialize the state (btree cursor).
1320 */
1321 state = xfs_da_state_alloc();
1322 state->args = args;
1323 state->mp = args->dp->i_mount;
1324 state->blocksize = state->mp->m_dirblksize;
1325 state->node_ents = state->mp->m_dir_node_ents;
1326 /*
1327 * Look up the name. We're not supposed to find it, but
1328 * this gives us the insertion point.
1329 */
1330 error = xfs_da_node_lookup_int(state, &rval);
1331 if (error)
1332 rval = error;
1333 if (rval != ENOENT) {
1334 goto done;
1335 }
1336 /*
1337 * Add the data entry to a data block.
1338	 * Extravalid is set if lookup found a freespace block for us.
1339 */
1340 rval = xfs_dir2_node_addname_int(args,
1341 state->extravalid ? &state->extrablk : NULL);
1342 if (rval) {
1343 goto done;
1344 }
1345 blk = &state->path.blk[state->path.active - 1];
1346 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1347 /*
1348 * Add the new leaf entry.
1349 */
1350 rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
1351 if (rval == 0) {
1352 /*
1353 * It worked, fix the hash values up the btree.
1354 */
1355 if (!args->justcheck)
1356 xfs_da_fixhashpath(state, &state->path);
1357 } else {
1358 /*
1359 * It didn't work, we need to split the leaf block.
1360 */
1361 if (args->total == 0) {
1362 ASSERT(rval == ENOSPC);
1363 goto done;
1364 }
1365 /*
1366 * Split the leaf block and insert the new entry.
1367 */
1368 rval = xfs_da_split(state);
1369 }
1370done:
1371 xfs_da_state_free(state);
1372 return rval;
1373}
1374
1375/*
1376 * Add the data entry for a node-format directory name addition.
1377 * The leaf entry is added in xfs_dir2_leafn_add.
1378 * We may enter with a freespace block that the lookup found.
1379 */
1380static int /* error */
1381xfs_dir2_node_addname_int(
1382 xfs_da_args_t *args, /* operation arguments */
1383 xfs_da_state_blk_t *fblk) /* optional freespace block */
1384{
1385 xfs_dir2_data_t *data; /* data block structure */
1386 xfs_dir2_db_t dbno; /* data block number */
1387 xfs_dabuf_t *dbp; /* data block buffer */
1388 xfs_dir2_data_entry_t *dep; /* data entry pointer */
1389 xfs_inode_t *dp; /* incore directory inode */
1390 xfs_dir2_data_unused_t *dup; /* data unused entry pointer */
1391 int error; /* error return value */
1392 xfs_dir2_db_t fbno; /* freespace block number */
1393 xfs_dabuf_t *fbp; /* freespace buffer */
1394 int findex; /* freespace entry index */
1395 xfs_dir2_free_t *free=NULL; /* freespace block structure */
1396 xfs_dir2_db_t ifbno; /* initial freespace block no */
1397 xfs_dir2_db_t lastfbno=0; /* highest freespace block no */
1398 int length; /* length of the new entry */
1399 int logfree; /* need to log free entry */
1400 xfs_mount_t *mp; /* filesystem mount point */
1401 int needlog; /* need to log data header */
1402 int needscan; /* need to rescan data frees */
1403 xfs_dir2_data_off_t *tagp; /* data entry tag pointer */
1404 xfs_trans_t *tp; /* transaction pointer */
1405
1406 dp = args->dp;
1407 mp = dp->i_mount;
1408 tp = args->trans;
1409 length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
1410 /*
1411	 * If we came in with a freespace block, it means that lookup
1412 * found an entry with our hash value. This is the freespace
1413 * block for that data entry.
1414 */
1415 if (fblk) {
1416 fbp = fblk->bp;
1417 /*
1418 * Remember initial freespace block number.
1419 */
1420 ifbno = fblk->blkno;
1421 free = fbp->data;
1422 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1423 findex = fblk->index;
1424 /*
1425 * This means the free entry showed that the data block had
1426 * space for our entry, so we remembered it.
1427 * Use that data block.
1428 */
1429 if (findex >= 0) {
1430 ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT));
1431 ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF);
1432 ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length);
1433 dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
1434 }
1435 /*
1436		 * The data block we looked at didn't have enough room.
1437 * We'll start at the beginning of the freespace entries.
1438 */
1439 else {
1440 dbno = -1;
1441 findex = 0;
1442 }
1443 }
1444 /*
1445 * Didn't come in with a freespace block, so don't have a data block.
1446 */
1447 else {
1448 ifbno = dbno = -1;
1449 fbp = NULL;
1450 findex = 0;
1451 }
1452 /*
1453 * If we don't have a data block yet, we're going to scan the
1454 * freespace blocks looking for one. Figure out what the
1455 * highest freespace block number is.
1456 */
1457 if (dbno == -1) {
1458 xfs_fileoff_t fo; /* freespace block number */
1459
1460 if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
1461 return error;
1462 lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo);
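		/*
		 * Start the scan at ifbno; with no initial block this
		 * is -1, and the loop's pre-increment then starts us
		 * at the first freespace block.
		 */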
1463 fbno = ifbno;
1464 }
1465 /*
1466	 * While we haven't identified a data block, search the freeblock
1467	 * data for a good data block.  Null freeblock entries, which
1468	 * indicate holes in the data blocks, are skipped.
1469 */
1470 while (dbno == -1) {
1471 /*
1472 * If we don't have a freeblock in hand, get the next one.
1473 */
1474 if (fbp == NULL) {
1475 /*
1476 * Happens the first time through unless lookup gave
1477 * us a freespace block to start with.
1478 */
1479 if (++fbno == 0)
1480 fbno = XFS_DIR2_FREE_FIRSTDB(mp);
1481 /*
1482 * If it's ifbno we already looked at it.
1483 */
1484 if (fbno == ifbno)
1485 fbno++;
1486 /*
1487 * If it's off the end we're done.
1488 */
1489 if (fbno >= lastfbno)
1490 break;
1491 /*
1492 * Read the block. There can be holes in the
1493 * freespace blocks, so this might not succeed.
1494			 * This should be really rare.  The -2 mappedbno arg makes
1495			 * the read return a NULL buffer for a hole, not an error.
1496 */
1497 if ((error = xfs_da_read_buf(tp, dp,
1498 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
1499 XFS_DATA_FORK))) {
1500 return error;
1501 }
1502 if (unlikely(fbp == NULL)) {
1503 continue;
1504 }
1505 free = fbp->data;
1506 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1507 findex = 0;
1508 }
1509 /*
1510 * Look at the current free entry. Is it good enough?
1511 */
1512 if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF &&
1513 INT_GET(free->bests[findex], ARCH_CONVERT) >= length)
1514 dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
1515 else {
1516 /*
1517 * Are we done with the freeblock?
1518 */
1519 if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
1520 /*
1521 * Drop the block.
1522 */
1523 xfs_da_brelse(tp, fbp);
1524 fbp = NULL;
1525 if (fblk && fblk->bp)
1526 fblk->bp = NULL;
1527 }
1528 }
1529 }
1530 /*
1531 * If we don't have a data block, we need to allocate one and make
1532 * the freespace entries refer to it.
1533 */
1534 if (unlikely(dbno == -1)) {
1535 /*
1536 * Not allowed to allocate, return failure.
1537 */
1538 if (args->justcheck || args->total == 0) {
1539 /*
1540 * Drop the freespace buffer unless it came from our
1541 * caller.
1542 */
1543 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1544 xfs_da_buf_done(fbp);
1545 return XFS_ERROR(ENOSPC);
1546 }
1547 /*
1548 * Allocate and initialize the new data block.
1549 */
1550 if (unlikely((error = xfs_dir2_grow_inode(args,
1551 XFS_DIR2_DATA_SPACE,
1552 &dbno)) ||
1553 (error = xfs_dir2_data_init(args, dbno, &dbp)))) {
1554 /*
1555 * Drop the freespace buffer unless it came from our
1556 * caller.
1557 */
1558 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1559 xfs_da_buf_done(fbp);
1560 return error;
1561 }
1562 /*
1563 * If (somehow) we have a freespace block, get rid of it.
1564 */
1565 if (fbp)
1566 xfs_da_brelse(tp, fbp);
1567 if (fblk && fblk->bp)
1568 fblk->bp = NULL;
1569
1570 /*
1571 * Get the freespace block corresponding to the data block
1572 * that was just allocated.
1573 */
1574 fbno = XFS_DIR2_DB_TO_FDB(mp, dbno);
1575 if (unlikely(error = xfs_da_read_buf(tp, dp,
1576 XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp,
1577 XFS_DATA_FORK))) {
1578 xfs_da_buf_done(dbp);
1579 return error;
1580 }
1581 /*
1582 * If there wasn't a freespace block, the read will
1583 * return a NULL fbp. Allocate and initialize a new one.
1584 */
1585		if (fbp == NULL) {
1586 if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
1587 &fbno))) {
				xfs_da_buf_done(dbp);
1588				return error;
1589 }
1590
1591 if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) {
1592 cmn_err(CE_ALERT,
1593 "xfs_dir2_node_addname_int: dir ino "
1594 "%llu needed freesp block %lld for\n"
1595 " data block %lld, got %lld\n"
1596 " ifbno %llu lastfbno %d\n",
1597 (unsigned long long)dp->i_ino,
1598 (long long)XFS_DIR2_DB_TO_FDB(mp, dbno),
1599 (long long)dbno, (long long)fbno,
1600 (unsigned long long)ifbno, lastfbno);
1601 if (fblk) {
1602 cmn_err(CE_ALERT,
1603 " fblk 0x%p blkno %llu "
1604 "index %d magic 0x%x\n",
1605 fblk,
1606 (unsigned long long)fblk->blkno,
1607 fblk->index,
1608 fblk->magic);
1609 } else {
1610 cmn_err(CE_ALERT,
1611 " ... fblk is NULL\n");
1612 }
1613 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1614 XFS_ERRLEVEL_LOW, mp);
1615 return XFS_ERROR(EFSCORRUPTED);
1616 }
1617
1618 /*
1619 * Get a buffer for the new block.
1620 */
1621 if ((error = xfs_da_get_buf(tp, dp,
1622 XFS_DIR2_DB_TO_DA(mp, fbno),
1623 -1, &fbp, XFS_DATA_FORK))) {
1624 return error;
1625 }
1626 ASSERT(fbp != NULL);
1627
1628 /*
1629 * Initialize the new block to be empty, and remember
1630 * its first slot as our empty slot.
1631 */
1632 free = fbp->data;
1633 INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
1634 INT_SET(free->hdr.firstdb, ARCH_CONVERT,
1635 (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
1636 XFS_DIR2_MAX_FREE_BESTS(mp));
1637 free->hdr.nvalid = 0;
1638 free->hdr.nused = 0;
1639 } else {
1640 free = fbp->data;
1641 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1642 }
1643
1644 /*
1645 * Set the freespace block index from the data block number.
1646 */
1647 findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno);
1648 /*
1649 * If it's after the end of the current entries in the
1650 * freespace block, extend that table.
1651 */
1652 if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
1653 ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
1654 INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1);
1655 /*
1656			 * Tag the new entry as unused so the nused bump below fires.
1657 */
1658 INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
1659 }
1660 /*
1661 * If this entry was for an empty data block
1662 * (this should always be true) then update the header.
1663 */
1664 if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
1665 INT_MOD(free->hdr.nused, ARCH_CONVERT, +1);
1666 xfs_dir2_free_log_header(tp, fbp);
1667 }
1668 /*
1669 * Update the real value in the table.
1670 * We haven't allocated the data entry yet so this will
1671 * change again.
1672 */
1673 data = dbp->data;
1674 INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
1675 logfree = 1;
1676 }
1677 /*
1678 * We had a data block so we don't have to make a new one.
1679 */
1680 else {
1681 /*
1682 * If just checking, we succeeded.
1683 */
1684 if (args->justcheck) {
1685 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1686 xfs_da_buf_done(fbp);
1687 return 0;
1688 }
1689 /*
1690 * Read the data block in.
1691 */
1692 if (unlikely(
1693 error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno),
1694 -1, &dbp, XFS_DATA_FORK))) {
1695 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1696 xfs_da_buf_done(fbp);
1697 return error;
1698 }
1699 data = dbp->data;
1700 logfree = 0;
1701 }
1702 ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length);
1703 /*
1704 * Point to the existing unused space.
1705 */
1706 dup = (xfs_dir2_data_unused_t *)
1707 ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
1708 needscan = needlog = 0;
1709 /*
1710	 * Mark the first part of the unused space as in use by us.
1711 */
1712 xfs_dir2_data_use_free(tp, dbp, dup,
1713 (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
1714 &needlog, &needscan);
1715 /*
1716 * Fill in the new entry and log it.
1717 */
1718 dep = (xfs_dir2_data_entry_t *)dup;
1719 INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
1720 dep->namelen = args->namelen;
1721 memcpy(dep->name, args->name, dep->namelen);
1722 tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1723 INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
1724 xfs_dir2_data_log_entry(tp, dbp, dep);
1725 /*
1726 * Rescan the block for bestfree if needed.
1727 */
1728 if (needscan)
1729 xfs_dir2_data_freescan(mp, data, &needlog, NULL);
1730 /*
1731 * Log the data block header if needed.
1732 */
1733 if (needlog)
1734 xfs_dir2_data_log_header(tp, dbp);
1735 /*
1736 * If the freespace entry is now wrong, update it.
1737 */
1738 if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
1739 INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
1740 logfree = 1;
1741 }
1742 /*
1743 * Log the freespace entry if needed.
1744 */
1745 if (logfree)
1746 xfs_dir2_free_log_bests(tp, fbp, findex, findex);
1747 /*
1748 * If the caller didn't hand us the freespace block, drop it.
1749 */
1750 if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
1751 xfs_da_buf_done(fbp);
1752 /*
1753 * Return the data block and offset in args, then drop the data block.
1754 */
1755 args->blkno = (xfs_dablk_t)dbno;
1756 args->index = INT_GET(*tagp, ARCH_CONVERT);
1757 xfs_da_buf_done(dbp);
1758 return 0;
1759}
1760
1761/*
1762 * Look up an entry in a node-format directory.
1763 * All the real work happens in xfs_da_node_lookup_int.
1764 * The only real output is the inode number of the entry.
1765 */
1766int /* error */
1767xfs_dir2_node_lookup(
1768 xfs_da_args_t *args) /* operation arguments */
1769{
1770 int error; /* error return value */
1771 int i; /* btree level */
1772 int rval; /* operation return value */
1773 xfs_da_state_t *state; /* btree cursor */
1774
1775 xfs_dir2_trace_args("node_lookup", args);
1776 /*
1777 * Allocate and initialize the btree cursor.
1778 */
1779 state = xfs_da_state_alloc();
1780 state->args = args;
1781 state->mp = args->dp->i_mount;
1782 state->blocksize = state->mp->m_dirblksize;
1783 state->node_ents = state->mp->m_dir_node_ents;
1784 /*
1785 * Fill in the path to the entry in the cursor.
1786 */
1787 error = xfs_da_node_lookup_int(state, &rval);
1788 if (error)
1789 rval = error;
1790 /*
1791 * Release the btree blocks and leaf block.
1792 */
1793 for (i = 0; i < state->path.active; i++) {
1794 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1795 state->path.blk[i].bp = NULL;
1796 }
1797 /*
1798 * Release the data block if we have it.
1799 */
1800 if (state->extravalid && state->extrablk.bp) {
1801 xfs_da_brelse(args->trans, state->extrablk.bp);
1802 state->extrablk.bp = NULL;
1803 }
1804 xfs_da_state_free(state);
1805 return rval;
1806}
1807
1808/*
1809 * Remove an entry from a node-format directory.
1810 */
1811int /* error */
1812xfs_dir2_node_removename(
1813 xfs_da_args_t *args) /* operation arguments */
1814{
1815 xfs_da_state_blk_t *blk; /* leaf block */
1816 int error; /* error return value */
1817 int rval; /* operation return value */
1818 xfs_da_state_t *state; /* btree cursor */
1819
1820 xfs_dir2_trace_args("node_removename", args);
1821 /*
1822 * Allocate and initialize the btree cursor.
1823 */
1824 state = xfs_da_state_alloc();
1825 state->args = args;
1826 state->mp = args->dp->i_mount;
1827 state->blocksize = state->mp->m_dirblksize;
1828 state->node_ents = state->mp->m_dir_node_ents;
1829 /*
1830 * Look up the entry we're deleting, set up the cursor.
1831 */
1832 error = xfs_da_node_lookup_int(state, &rval);
1833 if (error) {
1834 rval = error;
1835 }
1836 /*
1837 * Didn't find it, upper layer screwed up.
1838 */
1839 if (rval != EEXIST) {
1840 xfs_da_state_free(state);
1841 return rval;
1842 }
1843 blk = &state->path.blk[state->path.active - 1];
1844 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1845 ASSERT(state->extravalid);
1846 /*
1847 * Remove the leaf and data entries.
1848 * Extrablk refers to the data block.
1849 */
1850 error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
1851 &state->extrablk, &rval);
1852	if (error) {
		xfs_da_state_free(state);
1853		return error;
1854	}
1855 /*
1856 * Fix the hash values up the btree.
1857 */
1858 xfs_da_fixhashpath(state, &state->path);
1859 /*
1860 * If we need to join leaf blocks, do it.
1861 */
1862 if (rval && state->path.active > 1)
1863 error = xfs_da_join(state);
1864 /*
1865 * If no errors so far, try conversion to leaf format.
1866 */
1867 if (!error)
1868 error = xfs_dir2_node_to_leaf(state);
1869 xfs_da_state_free(state);
1870 return error;
1871}
1872
1873/*
1874 * Replace an entry's inode number in a node-format directory.
1875 */
1876int /* error */
1877xfs_dir2_node_replace(
1878 xfs_da_args_t *args) /* operation arguments */
1879{
1880 xfs_da_state_blk_t *blk; /* leaf block */
1881 xfs_dir2_data_t *data; /* data block structure */
1882 xfs_dir2_data_entry_t *dep; /* data entry changed */
1883 int error; /* error return value */
1884 int i; /* btree level */
1885 xfs_ino_t inum; /* new inode number */
1886 xfs_dir2_leaf_t *leaf; /* leaf structure */
1887 xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
1888 int rval; /* internal return value */
1889 xfs_da_state_t *state; /* btree cursor */
1890
1891 xfs_dir2_trace_args("node_replace", args);
1892 /*
1893 * Allocate and initialize the btree cursor.
1894 */
1895 state = xfs_da_state_alloc();
1896 state->args = args;
1897 state->mp = args->dp->i_mount;
1898 state->blocksize = state->mp->m_dirblksize;
1899 state->node_ents = state->mp->m_dir_node_ents;
1900 inum = args->inumber;
1901 /*
1902 * Lookup the entry to change in the btree.
1903 */
1904 error = xfs_da_node_lookup_int(state, &rval);
1905 if (error) {
1906 rval = error;
1907 }
1908 /*
1909 * It should be found, since the vnodeops layer has looked it up
1910 * and locked it. But paranoia is good.
1911 */
1912 if (rval == EEXIST) {
1913 /*
1914 * Find the leaf entry.
1915 */
1916 blk = &state->path.blk[state->path.active - 1];
1917 ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
1918 leaf = blk->bp->data;
1919 lep = &leaf->ents[blk->index];
1920 ASSERT(state->extravalid);
1921 /*
1922 * Point to the data entry.
1923 */
1924 data = state->extrablk.bp->data;
1925 ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
1926 dep = (xfs_dir2_data_entry_t *)
1927 ((char *)data +
1928 XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT)));
1929 ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
1930 /*
1931 * Fill in the new inode number and log the entry.
1932 */
1933 INT_SET(dep->inumber, ARCH_CONVERT, inum);
1934 xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
1935 rval = 0;
1936 }
1937 /*
1938 * Didn't find it, and we're holding a data block. Drop it.
1939 */
1940 else if (state->extravalid) {
1941 xfs_da_brelse(args->trans, state->extrablk.bp);
1942 state->extrablk.bp = NULL;
1943 }
1944 /*
1945 * Release all the buffers in the cursor.
1946 */
1947 for (i = 0; i < state->path.active; i++) {
1948 xfs_da_brelse(args->trans, state->path.blk[i].bp);
1949 state->path.blk[i].bp = NULL;
1950 }
1951 xfs_da_state_free(state);
1952 return rval;
1953}
1954
1955/*
1956 * Trim off a trailing empty freespace block.
1957 * Return (in rvalp) 1 if we did it, 0 if not.
1958 */
1959int /* error */
1960xfs_dir2_node_trim_free(
1961 xfs_da_args_t *args, /* operation arguments */
1962 xfs_fileoff_t fo, /* free block number */
1963 int *rvalp) /* out: did something */
1964{
1965 xfs_dabuf_t *bp; /* freespace buffer */
1966 xfs_inode_t *dp; /* incore directory inode */
1967 int error; /* error return code */
1968 xfs_dir2_free_t *free; /* freespace structure */
1969 xfs_mount_t *mp; /* filesystem mount point */
1970 xfs_trans_t *tp; /* transaction pointer */
1971
1972 dp = args->dp;
1973 mp = dp->i_mount;
1974 tp = args->trans;
1975 /*
1976 * Read the freespace block.
1977 */
1978 if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
1979 XFS_DATA_FORK))) {
1980 return error;
1981 }
1982
1983 /*
1984 * There can be holes in freespace. If fo is a hole, there's
1985 * nothing to do.
1986 */
1987 if (bp == NULL) {
1988 *rvalp = 0; /* nothing done */
1989 return 0;
1990 }
1990 free = bp->data;
1991 ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
1992 /*
1993 * If there are used entries, there's nothing to do.
1994 */
1995 if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) {
1996 xfs_da_brelse(tp, bp);
1997 *rvalp = 0;
1998 return 0;
1999 }
2000 /*
2001 * Blow the block away.
2002 */
2003 if ((error =
2004 xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo),
2005 bp))) {
2006 /*
2007 * Can't fail with ENOSPC since that only happens with no
2008 * space reservation, when breaking up an extent into two
2009 * pieces. This is the last block of an extent.
2010 */
2011 ASSERT(error != ENOSPC);
2012 xfs_da_brelse(tp, bp);
2013 return error;
2014 }
2015 /*
2016 * Return that we succeeded.
2017 */
2018 *rvalp = 1;
2019 return 0;
2020}
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
new file mode 100644
index 000000000000..96db420c7c5c
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.h
@@ -0,0 +1,159 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_NODE_H__
33#define __XFS_DIR2_NODE_H__
34
35/*
36 * Directory version 2, btree node format structures
37 */
38
39struct uio;
40struct xfs_dabuf;
41struct xfs_da_args;
42struct xfs_da_state;
43struct xfs_da_state_blk;
44struct xfs_inode;
45struct xfs_trans;
46
47/*
48 * Constants.
49 */
50
51/*
52 * Offset of the freespace index.
53 */
54#define XFS_DIR2_FREE_SPACE 2
55#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
56#define XFS_DIR2_FREE_FIRSTDB(mp) \
57 XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET)
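/*
 * Worked example (assumed values): dir2 splits a directory's logical byte
 * space into XFS_DIR2_SPACE_SIZE regions -- data in region 0, leaf in
 * region 1, freespace in region 2, hence XFS_DIR2_FREE_SPACE == 2.
 * Assuming the usual 32GB region size (1ULL << 35) and 4096-byte
 * directory blocks:
 *
 *	XFS_DIR2_FREE_OFFSET      = 2 * 2^35 = 2^36 bytes
 *	XFS_DIR2_FREE_FIRSTDB(mp) = 2^36 / 4096 = 2^24
 *
 * so freespace block numbers can never collide with data block numbers.
 */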
58
59#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */
60
61/*
62 * Structures.
63 */
64typedef struct xfs_dir2_free_hdr {
65 __uint32_t magic; /* XFS_DIR2_FREE_MAGIC */
66 __int32_t firstdb; /* db of first entry */
67 __int32_t nvalid; /* count of valid entries */
68 __int32_t nused; /* count of used entries */
69} xfs_dir2_free_hdr_t;
70
71typedef struct xfs_dir2_free {
72 xfs_dir2_free_hdr_t hdr; /* block header */
73 xfs_dir2_data_off_t bests[1]; /* best free counts */
74 /* unused entries are -1 */
75} xfs_dir2_free_t;
76#define XFS_DIR2_MAX_FREE_BESTS(mp) \
77 (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
78 (uint)sizeof(xfs_dir2_data_off_t))
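/*
 * Worked example: with an (assumed) 4096-byte directory block, the header
 * above is 16 bytes (four 32-bit fields) and each best-free slot is a
 * 2-byte xfs_dir2_data_off_t, so
 *
 *	XFS_DIR2_MAX_FREE_BESTS = (4096 - 16) / 2 = 2040
 *
 * i.e. one freespace block indexes the best free space of up to 2040
 * data blocks.
 */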
79
80/*
81 * Macros.
82 */
83
84/*
85 * Convert data space db to the corresponding free db.
86 */
87#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDB)
88xfs_dir2_db_t
89xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db);
90#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db)
91#else
92#define XFS_DIR2_DB_TO_FDB(mp,db) \
93 (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp))
94#endif
95
96/*
97 * Convert data space db to the corresponding index in a free db.
98 */
99#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
100int
101xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db);
102#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db)
103#else
104#define XFS_DIR2_DB_TO_FDINDEX(mp,db) ((db) % XFS_DIR2_MAX_FREE_BESTS(mp))
105#endif
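/*
 * Worked example, continuing the 2040-bests-per-block assumption above:
 * data block db = 5000 maps to
 *
 *	XFS_DIR2_DB_TO_FDB(mp, 5000)     = FIRSTDB + 5000 / 2040 = FIRSTDB + 2
 *	XFS_DIR2_DB_TO_FDINDEX(mp, 5000) = 5000 % 2040           = 920
 *
 * so its best-free value lives in slot 920 of the third freespace block.
 */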
106
107/*
108 * Functions.
109 */
110
111extern void
112 xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
113 int first, int last);
114
115extern int
116 xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp);
117
118extern xfs_dahash_t
119 xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
120
121extern int
122 xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
123 struct xfs_da_args *args, int *indexp,
124 struct xfs_da_state *state);
125
126extern int
127 xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
128 struct xfs_dabuf *leaf2_bp);
129
130extern int
131 xfs_dir2_leafn_split(struct xfs_da_state *state,
132 struct xfs_da_state_blk *oldblk,
133 struct xfs_da_state_blk *newblk);
134
135extern int
136 xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
137
138extern void
139 xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
140 struct xfs_da_state_blk *drop_blk,
141 struct xfs_da_state_blk *save_blk);
142
143extern int
144 xfs_dir2_node_addname(struct xfs_da_args *args);
145
146extern int
147 xfs_dir2_node_lookup(struct xfs_da_args *args);
148
149extern int
150 xfs_dir2_node_removename(struct xfs_da_args *args);
151
152extern int
153 xfs_dir2_node_replace(struct xfs_da_args *args);
154
155extern int
156 xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
157 int *rvalp);
158
159#endif /* __XFS_DIR2_NODE_H__ */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..6bbc61674411
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -0,0 +1,1317 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_sf.c
35 * Shortform directory implementation for v2 directories.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_sb.h"
46#include "xfs_dir.h"
47#include "xfs_dir2.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59#include "xfs_error.h"
60#include "xfs_dir2_data.h"
61#include "xfs_dir2_leaf.h"
62#include "xfs_dir2_block.h"
63#include "xfs_dir2_trace.h"
64
65/*
66 * Prototypes for internal functions.
67 */
68static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
69 xfs_dir2_sf_entry_t *sfep,
70 xfs_dir2_data_aoff_t offset,
71 int new_isize);
72static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
73 int new_isize);
74static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
75 xfs_dir2_sf_entry_t **sfepp,
76 xfs_dir2_data_aoff_t *offsetp);
77#ifdef DEBUG
78static void xfs_dir2_sf_check(xfs_da_args_t *args);
79#else
80#define xfs_dir2_sf_check(args)
81#endif /* DEBUG */
82#if XFS_BIG_INUMS
83static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
84static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
85#endif /* XFS_BIG_INUMS */
86
87/*
88 * Given a block directory (dp/block), calculate its size as a shortform (sf)
89 * directory and a header for the sf directory, if it will fit in the
90 * space currently present in the inode. If it won't fit, the output
91 * size is simply too big (not an accurate size).
92 */
93int /* size for sf form */
94xfs_dir2_block_sfsize(
95 xfs_inode_t *dp, /* incore inode pointer */
96 xfs_dir2_block_t *block, /* block directory data */
97 xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */
98{
99 xfs_dir2_dataptr_t addr; /* data entry address */
100 xfs_dir2_leaf_entry_t *blp; /* leaf area of the block */
101 xfs_dir2_block_tail_t *btp; /* tail area of the block */
102 int count; /* shortform entry count */
103 xfs_dir2_data_entry_t *dep; /* data entry in the block */
104 int i; /* block entry index */
105 int i8count; /* count of big-inode entries */
106 int isdot; /* entry is "." */
107 int isdotdot; /* entry is ".." */
108 xfs_mount_t *mp; /* mount structure pointer */
109 int namelen; /* total name bytes */
110 xfs_ino_t parent; /* parent inode number */
111 int size=0; /* total computed size */
112
113 mp = dp->i_mount;
114
115 count = i8count = namelen = 0;
116 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
117 blp = XFS_DIR2_BLOCK_LEAF_P(btp);
118
119 /*
120 * Iterate over the block's data entries by using the leaf pointers.
121 */
122 for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
123 if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
124 continue;
125 /*
126 * Calculate the pointer to the entry at hand.
127 */
128 dep = (xfs_dir2_data_entry_t *)
129 ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
130 /*
131 * Detect . and .., so we can special-case them.
132 * . is not included in sf directories.
133 * .. is included by just the parent inode number.
134 */
135 isdot = dep->namelen == 1 && dep->name[0] == '.';
136 isdotdot =
137 dep->namelen == 2 &&
138 dep->name[0] == '.' && dep->name[1] == '.';
139#if XFS_BIG_INUMS
140 if (!isdot)
141 i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
142#endif
143 if (!isdot && !isdotdot) {
144 count++;
145 namelen += dep->namelen;
146 } else if (isdotdot)
147 parent = INT_GET(dep->inumber, ARCH_CONVERT);
148 /*
149 * Calculate the new size, see if we should give up yet.
150 */
151 size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */
152 count + /* namelen */
153 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
154 namelen + /* name */
155 (i8count ? /* inumber */
156 (uint)sizeof(xfs_dir2_ino8_t) * count :
157 (uint)sizeof(xfs_dir2_ino4_t) * count);
158 if (size > XFS_IFORK_DSIZE(dp))
159 return size; /* size value is a failure */
160 }
161 /*
162 * Create the output header, if it worked.
163 */
164 sfhp->count = count;
165 sfhp->i8count = i8count;
166 XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent);
167 return size;
168}
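/*
 * Worked example of the size formula above: a block directory holding
 * ".", ".." and three plain entries whose names total 15 bytes, with all
 * inode numbers fitting in 32 bits (i8count == 0), prices out as
 *
 *	size = XFS_DIR2_SF_HDR_SIZE(0)			 6  (10 - 4)
 *	     + count					 3  (namelen bytes)
 *	     + count * sizeof(xfs_dir2_sf_off_t)	 6  (2-byte offsets)
 *	     + namelen					15
 *	     + count * sizeof(xfs_dir2_ino4_t)		12
 *	     = 42 bytes
 *
 * and converts to shortform only if 42 <= XFS_IFORK_DSIZE(dp).
 */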
169
170/*
171 * Convert a block format directory to shortform.
172 * Caller has already checked that it will fit, and built us a header.
173 */
174int /* error */
175xfs_dir2_block_to_sf(
176 xfs_da_args_t *args, /* operation arguments */
177 xfs_dabuf_t *bp, /* block buffer */
178 int size, /* shortform directory size */
179 xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */
180{
181 xfs_dir2_block_t *block; /* block structure */
182 xfs_dir2_block_tail_t *btp; /* block tail pointer */
183 xfs_dir2_data_entry_t *dep; /* data entry pointer */
184 xfs_inode_t *dp; /* incore directory inode */
185 xfs_dir2_data_unused_t *dup; /* unused data pointer */
186 char *endptr; /* end of data entries */
187 int error; /* error return value */
188 int logflags; /* inode logging flags */
189 xfs_mount_t *mp; /* filesystem mount point */
190 char *ptr; /* current data pointer */
191 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
192 xfs_dir2_sf_t *sfp; /* shortform structure */
193 xfs_ino_t temp;
194
195 xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
196 dp = args->dp;
197 mp = dp->i_mount;
198
199 /*
200 * Make a copy of the block data, so we can shrink the inode
201 * and add local data.
202 */
203 block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
204 memcpy(block, bp->data, mp->m_dirblksize);
205 logflags = XFS_ILOG_CORE;
206 if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
207 ASSERT(error != ENOSPC);
208 goto out;
209 }
210 /*
211 * The buffer is now unconditionally gone, whether
212 * xfs_dir2_shrink_inode worked or not.
213 *
214 * Convert the inode to local format.
215 */
216 dp->i_df.if_flags &= ~XFS_IFEXTENTS;
217 dp->i_df.if_flags |= XFS_IFINLINE;
218 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
219 ASSERT(dp->i_df.if_bytes == 0);
220 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
221 logflags |= XFS_ILOG_DDATA;
222 /*
223 * Copy the header into the newly allocated local space.
224 */
225 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
226 memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count));
227 dp->i_d.di_size = size;
228 /*
229 * Set up to loop over the block's entries.
230 */
231 btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
232 ptr = (char *)block->u;
233 endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp);
234 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
235 /*
236 * Loop over the active and unused entries.
237 * Stop when we reach the leaf/tail portion of the block.
238 */
239 while (ptr < endptr) {
240 /*
241 * If it's unused, just skip over it.
242 */
243 dup = (xfs_dir2_data_unused_t *)ptr;
244 if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
245 ptr += INT_GET(dup->length, ARCH_CONVERT);
246 continue;
247 }
248 dep = (xfs_dir2_data_entry_t *)ptr;
249 /*
250 * Skip .
251 */
252 if (dep->namelen == 1 && dep->name[0] == '.')
253 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
254 /*
255 * Skip .., but make sure the inode number is right.
256 */
257 else if (dep->namelen == 2 &&
258 dep->name[0] == '.' && dep->name[1] == '.')
259 ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
260 XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent));
261 /*
262 * Normal entry, copy it into shortform.
263 */
264 else {
265 sfep->namelen = dep->namelen;
266 XFS_DIR2_SF_PUT_OFFSET(sfep,
267 (xfs_dir2_data_aoff_t)
268 ((char *)dep - (char *)block));
269 memcpy(sfep->name, dep->name, dep->namelen);
270 temp=INT_GET(dep->inumber, ARCH_CONVERT);
271 XFS_DIR2_SF_PUT_INUMBER(sfp, &temp,
272 XFS_DIR2_SF_INUMBERP(sfep));
273 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
274 }
275 ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
276 }
277 ASSERT((char *)sfep - (char *)sfp == size);
278 xfs_dir2_sf_check(args);
279out:
280 xfs_trans_log_inode(args->trans, dp, logflags);
281 kmem_free(block, mp->m_dirblksize);
282 return error;
283}
284
285/*
286 * Add a name to a shortform directory.
287 * There are two algorithms, "easy" and "hard", which we decide on
288 * before changing anything.
289 * Convert to block form if necessary, i.e. if the new entry won't fit.
290 */
291int /* error */
292xfs_dir2_sf_addname(
293 xfs_da_args_t *args) /* operation arguments */
294{
295 int add_entsize; /* size of the new entry */
296 xfs_inode_t *dp; /* incore directory inode */
297 int error; /* error return value */
298 int incr_isize; /* total change in size */
299 int new_isize; /* di_size after adding name */
300 int objchange; /* changing to 8-byte inodes */
301 xfs_dir2_data_aoff_t offset; /* offset for new entry */
302 int old_isize; /* di_size before adding name */
303 int pick; /* which algorithm to use */
304 xfs_dir2_sf_t *sfp; /* shortform structure */
305 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
306
307 xfs_dir2_trace_args("sf_addname", args);
308 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
309 dp = args->dp;
310 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
311 /*
312 * Make sure the shortform value has some of its header.
313 */
314 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
315 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
316 return XFS_ERROR(EIO);
317 }
318 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
319 ASSERT(dp->i_df.if_u1.if_data != NULL);
320 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
321 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
322 /*
323 * Compute entry (and change in) size.
324 */
325 add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
326 incr_isize = add_entsize;
327 objchange = 0;
328#if XFS_BIG_INUMS
329 /*
330 * Do we have to change to 8 byte inodes?
331 */
332 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
333 /*
334 * Yes, adjust the entry size and the total size.
335 */
336 add_entsize +=
337 (uint)sizeof(xfs_dir2_ino8_t) -
338 (uint)sizeof(xfs_dir2_ino4_t);
339 incr_isize +=
340 (sfp->hdr.count + 2) *
341 ((uint)sizeof(xfs_dir2_ino8_t) -
342 (uint)sizeof(xfs_dir2_ino4_t));
343 objchange = 1;
344 }
345#endif
346 old_isize = (int)dp->i_d.di_size;
347 new_isize = old_isize + incr_isize;
348 /*
349 * Won't fit as shortform any more (due to size),
350 * or the pick routine says it won't (due to offset values).
351 */
352 if (new_isize > XFS_IFORK_DSIZE(dp) ||
353 (pick =
354 xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
355 /*
356 * Just checking or no space reservation, it doesn't fit.
357 */
358 if (args->justcheck || args->total == 0)
359 return XFS_ERROR(ENOSPC);
360 /*
361 * Convert to block form then add the name.
362 */
363 error = xfs_dir2_sf_to_block(args);
364 if (error)
365 return error;
366 return xfs_dir2_block_addname(args);
367 }
368 /*
369 * Just checking, it fits.
370 */
371 if (args->justcheck)
372 return 0;
373 /*
374 * Do it the easy way - just add it at the end.
375 */
376 if (pick == 1)
377 xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
378 /*
379 * Do it the hard way - look for a place to insert the new entry.
380 * Convert to 8 byte inode numbers first if necessary.
381 */
382 else {
383 ASSERT(pick == 2);
384#if XFS_BIG_INUMS
385 if (objchange)
386 xfs_dir2_sf_toino8(args);
387#endif
388 xfs_dir2_sf_addname_hard(args, objchange, new_isize);
389 }
390 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
391 return 0;
392}
393
394/*
395 * Add the new entry the "easy" way.
396 * This is copying the old directory and adding the new entry at the end.
397 * Since it's sorted by "offset" we need room after the last offset
398 * that's already there, and then room to convert to a block directory.
399 * This is already checked by the pick routine.
400 */
401static void
402xfs_dir2_sf_addname_easy(
403 xfs_da_args_t *args, /* operation arguments */
404 xfs_dir2_sf_entry_t *sfep, /* pointer to new entry */
405 xfs_dir2_data_aoff_t offset, /* offset to use for new ent */
406 int new_isize) /* new directory size */
407{
408 int byteoff; /* byte offset in sf dir */
409 xfs_inode_t *dp; /* incore directory inode */
410 xfs_dir2_sf_t *sfp; /* shortform structure */
411
412 dp = args->dp;
413
414 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
415 byteoff = (int)((char *)sfep - (char *)sfp);
416 /*
417 * Grow the in-inode space.
418 */
419 xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen),
420 XFS_DATA_FORK);
421 /*
422 * Need to set up again due to realloc of the inode data.
423 */
424 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
425 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
426 /*
427 * Fill in the new entry.
428 */
429 sfep->namelen = args->namelen;
430 XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
431 memcpy(sfep->name, args->name, sfep->namelen);
432 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
433 XFS_DIR2_SF_INUMBERP(sfep));
434 /*
435 * Update the header and inode.
436 */
437 sfp->hdr.count++;
438#if XFS_BIG_INUMS
439 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
440 sfp->hdr.i8count++;
441#endif
442 dp->i_d.di_size = new_isize;
443 xfs_dir2_sf_check(args);
444}
445
446/*
447 * Add the new entry the "hard" way.
448 * The caller has already converted to 8 byte inode numbers if necessary,
449 * in which case we need to leave the i8count at 1.
450 * Find a hole that the new entry will fit into, and copy
451 * the first part of the entries, the new entry, and the last part of
452 * the entries.
453 */
454/* ARGSUSED */
455static void
456xfs_dir2_sf_addname_hard(
457 xfs_da_args_t *args, /* operation arguments */
458 int objchange, /* changing inode number size */
459 int new_isize) /* new directory size */
460{
461 int add_datasize; /* data size need for new ent */
462 char *buf; /* buffer for old */
463 xfs_inode_t *dp; /* incore directory inode */
464 int eof; /* reached end of old dir */
465 int nbytes; /* temp for byte copies */
466 xfs_dir2_data_aoff_t new_offset; /* next offset value */
467 xfs_dir2_data_aoff_t offset; /* current offset value */
468 int old_isize; /* previous di_size */
469 xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */
470 xfs_dir2_sf_t *oldsfp; /* original shortform dir */
471 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
472 xfs_dir2_sf_t *sfp; /* new shortform dir */
473
474 /*
475 * Copy the old directory to a temporary (kmem_alloc'd) buffer.
476 */
477 dp = args->dp;
478
479 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
480 old_isize = (int)dp->i_d.di_size;
481 buf = kmem_alloc(old_isize, KM_SLEEP);
482 oldsfp = (xfs_dir2_sf_t *)buf;
483 memcpy(oldsfp, sfp, old_isize);
484 /*
485 * Loop over the old directory finding the place we're going
486 * to insert the new entry.
487 * If it's going to end up at the end then oldsfep will point there.
488 */
489 for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
490 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp),
491 add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen),
492 eof = (char *)oldsfep == &buf[old_isize];
493 !eof;
494 offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen),
495 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep),
496 eof = (char *)oldsfep == &buf[old_isize]) {
497 new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep);
498 if (offset + add_datasize <= new_offset)
499 break;
500 }
501 /*
502 * Get rid of the old directory, then allocate space for
503 * the new one. We do this so xfs_idata_realloc won't copy
504 * the data.
505 */
506 xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
507 xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
508 /*
509 * Reset the pointer since the buffer was reallocated.
510 */
511 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
512 /*
513 * Copy the first part of the directory, including the header.
514 */
515 nbytes = (int)((char *)oldsfep - (char *)oldsfp);
516 memcpy(sfp, oldsfp, nbytes);
517 sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
518 /*
519 * Fill in the new entry, and update the header counts.
520 */
521 sfep->namelen = args->namelen;
522 XFS_DIR2_SF_PUT_OFFSET(sfep, offset);
523 memcpy(sfep->name, args->name, sfep->namelen);
524 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
525 XFS_DIR2_SF_INUMBERP(sfep));
526 sfp->hdr.count++;
527#if XFS_BIG_INUMS
528 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
529 sfp->hdr.i8count++;
530#endif
531 /*
532 * If there's more left to copy, do that.
533 */
534 if (!eof) {
535 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
536 memcpy(sfep, oldsfep, old_isize - nbytes);
537 }
538 kmem_free(buf, old_isize);
539 dp->i_d.di_size = new_isize;
540 xfs_dir2_sf_check(args);
541}
542
543/*
544 * Decide if the new entry will fit at all.
545 * If it will fit, pick between adding the new entry to the end (easy)
546 * or somewhere else (hard).
547 * Return 0 (won't fit), 1 (easy), 2 (hard).
548 */
549/*ARGSUSED*/
550static int /* pick result */
551xfs_dir2_sf_addname_pick(
552 xfs_da_args_t *args, /* operation arguments */
553 int objchange, /* inode # size changes */
554 xfs_dir2_sf_entry_t **sfepp, /* out(1): new entry ptr */
555 xfs_dir2_data_aoff_t *offsetp) /* out(1): new offset */
556{
557 xfs_inode_t *dp; /* incore directory inode */
558 int holefit; /* found hole it will fit in */
559 int i; /* entry number */
560 xfs_mount_t *mp; /* filesystem mount point */
561 xfs_dir2_data_aoff_t offset; /* data block offset */
562 xfs_dir2_sf_entry_t *sfep; /* shortform entry */
563 xfs_dir2_sf_t *sfp; /* shortform structure */
564 int size; /* entry's data size */
565 int used; /* data bytes used */
566
567 dp = args->dp;
568 mp = dp->i_mount;
569
570 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
571 size = XFS_DIR2_DATA_ENTSIZE(args->namelen);
572 offset = XFS_DIR2_DATA_FIRST_OFFSET;
573 sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
574 holefit = 0;
575 /*
576 * Loop over sf entries.
577 * Keep track of data offset and whether we've seen a place
578 * to insert the new entry.
579 */
580 for (i = 0; i < sfp->hdr.count; i++) {
581 if (!holefit)
582 holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep);
583 offset = XFS_DIR2_SF_GET_OFFSET(sfep) +
584 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
585 sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
586 }
587 /*
588 * Calculate data bytes used excluding the new entry, if this
589 * was a data block (block form directory).
590 */
591 used = offset +
592 (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
593 (uint)sizeof(xfs_dir2_block_tail_t);
594 /*
595 * If it won't fit in block form then we can't insert it;
596 * we'll go back, convert to block, then try the insert and convert
597 * to leaf.
598 */
599 if (used + (holefit ? 0 : size) > mp->m_dirblksize)
600 return 0;
601 /*
602 * If changing the inode number size, do it the hard way.
603 */
604#if XFS_BIG_INUMS
605 if (objchange) {
606 return 2;
607 }
608#else
609 ASSERT(objchange == 0);
610#endif
611 /*
612 * If it won't fit at the end then do it the hard way (use the hole).
613 */
614 if (used + size > mp->m_dirblksize)
615 return 2;
616 /*
617 * Do it the easy way.
618 */
619 *sfepp = sfep;
620 *offsetp = offset;
621 return 1;
622}
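/*
 * Worked example of the "used" computation above, which prices the
 * directory as if it were a single data block (assuming 8-byte leaf
 * entries and an 8-byte block tail): a directory whose entries end at
 * data offset 200 with hdr.count == 10 gives
 *
 *	used = 200 + (10 + 3) * 8 + 8 = 312 bytes
 *
 * where the +3 covers ".", ".." and the entry being added. With a
 * 4096-byte directory block the add is possible; the routine then
 * returns 1 (append at the end) if the entry also fits at the end,
 * else 2 (use an interior hole, or rewrite for 8-byte inodes).
 */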
623
624#ifdef DEBUG
625/*
626 * Check consistency of shortform directory, assert if bad.
627 */
628static void
629xfs_dir2_sf_check(
630 xfs_da_args_t *args) /* operation arguments */
631{
632 xfs_inode_t *dp; /* incore directory inode */
633 int i; /* entry number */
634 int i8count; /* number of big inode#s */
635 xfs_ino_t ino; /* entry inode number */
636 int offset; /* data offset */
637 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
638 xfs_dir2_sf_t *sfp; /* shortform structure */
639
640 dp = args->dp;
641
642 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
643 offset = XFS_DIR2_DATA_FIRST_OFFSET;
644 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
645 i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
646
647 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
648 i < sfp->hdr.count;
649 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
650 ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset);
651 ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
652 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
653 offset =
654 XFS_DIR2_SF_GET_OFFSET(sfep) +
655 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
656 }
657 ASSERT(i8count == sfp->hdr.i8count);
658 ASSERT(XFS_BIG_INUMS || i8count == 0);
659 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
660 ASSERT(offset +
661 (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
662 (uint)sizeof(xfs_dir2_block_tail_t) <=
663 dp->i_mount->m_dirblksize);
664}
665#endif /* DEBUG */
666
667/*
668 * Create a new (shortform) directory.
669 */
670int /* error, always 0 */
671xfs_dir2_sf_create(
672 xfs_da_args_t *args, /* operation arguments */
673 xfs_ino_t pino) /* parent inode number */
674{
675 xfs_inode_t *dp; /* incore directory inode */
676 int i8count; /* parent inode is an 8-byte number */
677 xfs_dir2_sf_t *sfp; /* shortform structure */
678 int size; /* directory size */
679
680 xfs_dir2_trace_args_i("sf_create", args, pino);
681 dp = args->dp;
682
683 ASSERT(dp != NULL);
684 ASSERT(dp->i_d.di_size == 0);
685 /*
686 * If it's currently a zero-length extent file,
687 * convert it to local format.
688 */
689 if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
690 dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
691 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
692 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
693 dp->i_df.if_flags |= XFS_IFINLINE;
694 }
695 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
696 ASSERT(dp->i_df.if_bytes == 0);
697 i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
698 size = XFS_DIR2_SF_HDR_SIZE(i8count);
699 /*
700 * Make a buffer for the data.
701 */
702 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
703 /*
704 * Fill in the header.
705 */
706 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
707 sfp->hdr.i8count = i8count;
708 /*
709 * Now we can put in the inode number, since i8count is set.
710 */
711 XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent);
712 sfp->hdr.count = 0;
713 dp->i_d.di_size = size;
714 xfs_dir2_sf_check(args);
715 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
716 return 0;
717}
718
719int /* error */
720xfs_dir2_sf_getdents(
721 xfs_inode_t *dp, /* incore directory inode */
722 uio_t *uio, /* caller's buffer control */
723 int *eofp, /* eof reached? (out) */
724 xfs_dirent_t *dbp, /* caller's buffer */
725 xfs_dir2_put_t put) /* abi's formatting function */
726{
727 int error; /* error return value */
728 int i; /* shortform entry number */
729 xfs_mount_t *mp; /* filesystem mount point */
730 xfs_dir2_dataptr_t off; /* current entry's offset */
731 xfs_dir2_put_args_t p; /* arg package for put rtn */
732 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
733 xfs_dir2_sf_t *sfp; /* shortform structure */
734 xfs_off_t dir_offset;
735
736 mp = dp->i_mount;
737
738 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
739 /*
740 * Give up if the directory is way too short.
741 */
742 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
743 ASSERT(XFS_FORCED_SHUTDOWN(mp));
744 return XFS_ERROR(EIO);
745 }
746
747 dir_offset = uio->uio_offset;
748
749 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
750 ASSERT(dp->i_df.if_u1.if_data != NULL);
751
752 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
753
754 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
755
756 /*
757 * If the block number in the offset is out of range, we're done.
758 */
759 if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) {
760 *eofp = 1;
761 return 0;
762 }
763
764 /*
765 * Set up putargs structure.
766 */
767 p.dbp = dbp;
768 p.put = put;
769 p.uio = uio;
770 /*
771 * Put . entry unless we're starting past it.
772 */
773 if (dir_offset <=
774 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
775 XFS_DIR2_DATA_DOT_OFFSET)) {
776 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0,
777 XFS_DIR2_DATA_DOTDOT_OFFSET);
778 p.ino = dp->i_ino;
779#if XFS_BIG_INUMS
780 p.ino += mp->m_inoadd;
781#endif
782 p.name = ".";
783 p.namelen = 1;
784
785 error = p.put(&p);
786
787 if (!p.done) {
788 uio->uio_offset =
789 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
790 XFS_DIR2_DATA_DOT_OFFSET);
791 return error;
792 }
793 }
794
795 /*
796 * Put .. entry unless we're starting past it.
797 */
798 if (dir_offset <=
799 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
800 XFS_DIR2_DATA_DOTDOT_OFFSET)) {
801 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
802 XFS_DIR2_DATA_FIRST_OFFSET);
803 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
804#if XFS_BIG_INUMS
805 p.ino += mp->m_inoadd;
806#endif
807 p.name = "..";
808 p.namelen = 2;
809
810 error = p.put(&p);
811
812 if (!p.done) {
813 uio->uio_offset =
814 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
815 XFS_DIR2_DATA_DOTDOT_OFFSET);
816 return error;
817 }
818 }
819
820 /*
821 * Loop while there are more entries and put'ing works.
822 */
823 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
824 i < sfp->hdr.count;
825 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
826
827 off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
828 XFS_DIR2_SF_GET_OFFSET(sfep));
829
830 if (dir_offset > off)
831 continue;
832
833 p.namelen = sfep->namelen;
834
835 p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
836 XFS_DIR2_SF_GET_OFFSET(sfep) +
837 XFS_DIR2_DATA_ENTSIZE(p.namelen));
838
839 p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep));
840#if XFS_BIG_INUMS
841 p.ino += mp->m_inoadd;
842#endif
843 p.name = (char *)sfep->name;
844
845 error = p.put(&p);
846
847 if (!p.done) {
848 uio->uio_offset = off;
849 return error;
850 }
851 }
852
853 /*
854 * They all fit.
855 */
856 *eofp = 1;
857
858 uio->uio_offset =
859 XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
860
861 return 0;
862}
863
864/*
865 * Lookup an entry in a shortform directory.
866 * Returns EEXIST if found, ENOENT if not found.
867 */
868int /* error */
869xfs_dir2_sf_lookup(
870 xfs_da_args_t *args) /* operation arguments */
871{
872 xfs_inode_t *dp; /* incore directory inode */
873 int i; /* entry index */
874 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
875 xfs_dir2_sf_t *sfp; /* shortform structure */
876
877 xfs_dir2_trace_args("sf_lookup", args);
878 xfs_dir2_sf_check(args);
879 dp = args->dp;
880
881 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
882 /*
883 * Bail out if the directory is way too short.
884 */
885 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
886 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
887 return XFS_ERROR(EIO);
888 }
889 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
890 ASSERT(dp->i_df.if_u1.if_data != NULL);
891 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
892 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
893 /*
894 * Special case for .
895 */
896 if (args->namelen == 1 && args->name[0] == '.') {
897 args->inumber = dp->i_ino;
898 return XFS_ERROR(EEXIST);
899 }
900 /*
901 * Special case for ..
902 */
903 if (args->namelen == 2 &&
904 args->name[0] == '.' && args->name[1] == '.') {
905 args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
906 return XFS_ERROR(EEXIST);
907 }
908 /*
909 * Loop over all the entries trying to match ours.
910 */
911 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
912 i < sfp->hdr.count;
913 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
914 if (sfep->namelen == args->namelen &&
915 sfep->name[0] == args->name[0] &&
916 memcmp(args->name, sfep->name, args->namelen) == 0) {
917 args->inumber =
918 XFS_DIR2_SF_GET_INUMBER(sfp,
919 XFS_DIR2_SF_INUMBERP(sfep));
920 return XFS_ERROR(EEXIST);
921 }
922 }
923 /*
924 * Didn't find it.
925 */
926 ASSERT(args->oknoent);
927 return XFS_ERROR(ENOENT);
928}
929
930/*
931 * Remove an entry from a shortform directory.
932 */
933int /* error */
934xfs_dir2_sf_removename(
935 xfs_da_args_t *args)
936{
937 int byteoff; /* offset of removed entry */
938 xfs_inode_t *dp; /* incore directory inode */
939 int entsize; /* this entry's size */
940 int i; /* shortform entry index */
941 int newsize; /* new inode size */
942 int oldsize; /* old inode size */
943 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
944 xfs_dir2_sf_t *sfp; /* shortform structure */
945
946 xfs_dir2_trace_args("sf_removename", args);
947 dp = args->dp;
948
949 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
950 oldsize = (int)dp->i_d.di_size;
951 /*
952 * Bail out if the directory is way too short.
953 */
954 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
955 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
956 return XFS_ERROR(EIO);
957 }
958 ASSERT(dp->i_df.if_bytes == oldsize);
959 ASSERT(dp->i_df.if_u1.if_data != NULL);
960 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
961 ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
962 /*
963 * Loop over the old directory entries.
964 * Find the one we're deleting.
965 */
966 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
967 i < sfp->hdr.count;
968 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
969 if (sfep->namelen == args->namelen &&
970 sfep->name[0] == args->name[0] &&
971 memcmp(sfep->name, args->name, args->namelen) == 0) {
972 ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp,
973 XFS_DIR2_SF_INUMBERP(sfep)) ==
974 args->inumber);
975 break;
976 }
977 }
978 /*
979 * Didn't find it.
980 */
981 if (i == sfp->hdr.count) {
982 return XFS_ERROR(ENOENT);
983 }
984 /*
985 * Calculate sizes.
986 */
987 byteoff = (int)((char *)sfep - (char *)sfp);
988 entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
989 newsize = oldsize - entsize;
990 /*
991 * Copy the part, if any, after the removed entry, sliding it down.
992 */
993 if (byteoff + entsize < oldsize)
994 memmove((char *)sfp + byteoff, (char *)sfp + byteoff + entsize,
995 oldsize - (byteoff + entsize));
996 /*
997 * Fix up the header and file size.
998 */
999 sfp->hdr.count--;
1000 dp->i_d.di_size = newsize;
1001 /*
1002 * Reallocate, making it smaller.
1003 */
1004 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
1005 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1006#if XFS_BIG_INUMS
1007 /*
1008 * Are we changing inode number size?
1009 */
1010 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
1011 if (sfp->hdr.i8count == 1)
1012 xfs_dir2_sf_toino4(args);
1013 else
1014 sfp->hdr.i8count--;
1015 }
1016#endif
1017 xfs_dir2_sf_check(args);
1018 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1019 return 0;
1020}
1021
1022/*
1023 * Replace the inode number of an entry in a shortform directory.
1024 */
1025int /* error */
1026xfs_dir2_sf_replace(
1027 xfs_da_args_t *args) /* operation arguments */
1028{
1029 xfs_inode_t *dp; /* incore directory inode */
1030 int i; /* entry index */
1031#if XFS_BIG_INUMS || defined(DEBUG)
1032 xfs_ino_t ino=0; /* entry old inode number */
1033#endif
1034#if XFS_BIG_INUMS
1035 int i8elevated; /* sf_toino8 set i8count=1 */
1036#endif
1037 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
1038 xfs_dir2_sf_t *sfp; /* shortform structure */
1039
1040 xfs_dir2_trace_args("sf_replace", args);
1041 dp = args->dp;
1042
1043 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
1044 /*
1045 * Bail out if the shortform directory is way too small.
1046 */
1047 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1048 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
1049 return XFS_ERROR(EIO);
1050 }
1051 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
1052 ASSERT(dp->i_df.if_u1.if_data != NULL);
1053 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1054 ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
1055#if XFS_BIG_INUMS
1056 /*
1057 * The new inode number is large, and we need to convert to 8-byte inodes.
1058 */
1059 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
1060 int error; /* error return value */
1061 int newsize; /* new inode size */
1062
1063 newsize =
1064 dp->i_df.if_bytes +
1065 (sfp->hdr.count + 1) *
1066 ((uint)sizeof(xfs_dir2_ino8_t) -
1067 (uint)sizeof(xfs_dir2_ino4_t));
1068 /*
1069 * Won't fit as shortform, convert to block then do replace.
1070 */
1071 if (newsize > XFS_IFORK_DSIZE(dp)) {
1072 error = xfs_dir2_sf_to_block(args);
1073 if (error) {
1074 return error;
1075 }
1076 return xfs_dir2_block_replace(args);
1077 }
1078 /*
1079 * Still fits, convert to 8-byte now.
1080 */
1081 xfs_dir2_sf_toino8(args);
1082 i8elevated = 1;
1083 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1084 } else
1085 i8elevated = 0;
1086#endif
1087 ASSERT(args->namelen != 1 || args->name[0] != '.');
1088 /*
1089 * Replace ..'s entry.
1090 */
1091 if (args->namelen == 2 &&
1092 args->name[0] == '.' && args->name[1] == '.') {
1093#if XFS_BIG_INUMS || defined(DEBUG)
1094 ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent);
1095 ASSERT(args->inumber != ino);
1096#endif
1097 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent);
1098 }
1099 /*
1100 * Normal entry, look for the name.
1101 */
1102 else {
1103 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
1104 i < sfp->hdr.count;
1105 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
1106 if (sfep->namelen == args->namelen &&
1107 sfep->name[0] == args->name[0] &&
1108 memcmp(args->name, sfep->name, args->namelen) == 0) {
1109#if XFS_BIG_INUMS || defined(DEBUG)
1110 ino = XFS_DIR2_SF_GET_INUMBER(sfp,
1111 XFS_DIR2_SF_INUMBERP(sfep));
1112 ASSERT(args->inumber != ino);
1113#endif
1114 XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber,
1115 XFS_DIR2_SF_INUMBERP(sfep));
1116 break;
1117 }
1118 }
1119 /*
1120 * Didn't find it.
1121 */
1122 if (i == sfp->hdr.count) {
1123 ASSERT(args->oknoent);
1124#if XFS_BIG_INUMS
1125 if (i8elevated)
1126 xfs_dir2_sf_toino4(args);
1127#endif
1128 return XFS_ERROR(ENOENT);
1129 }
1130 }
1131#if XFS_BIG_INUMS
1132 /*
1133 * See if the old number was large, the new number is small.
1134 */
1135 if (ino > XFS_DIR2_MAX_SHORT_INUM &&
1136 args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
1137 /*
1138 * If the old count was one, we need to convert back to small (4-byte) form.
1139 */
1140 if (sfp->hdr.i8count == 1)
1141 xfs_dir2_sf_toino4(args);
1142 else
1143 sfp->hdr.i8count--;
1144 }
1145 /*
1146 * See if the old number was small, the new number is large.
1147 */
1148 if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
1149 args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
1150 /*
1151 * add to the i8count unless we just converted to 8-byte
1152 * inodes (which does an implied i8count = 1)
1153 */
1154 ASSERT(sfp->hdr.i8count != 0);
1155 if (!i8elevated)
1156 sfp->hdr.i8count++;
1157 }
1158#endif
1159 xfs_dir2_sf_check(args);
1160 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
1161 return 0;
1162}
1163
1164#if XFS_BIG_INUMS
1165/*
1166 * Convert from 8-byte inode numbers to 4-byte inode numbers.
1167 * The last 8-byte inode number is gone, but the count is still 1.
1168 */
1169static void
1170xfs_dir2_sf_toino4(
1171 xfs_da_args_t *args) /* operation arguments */
1172{
1173 char *buf; /* old dir's buffer */
1174 xfs_inode_t *dp; /* incore directory inode */
1175 int i; /* entry index */
1176 xfs_ino_t ino; /* entry inode number */
1177 int newsize; /* new inode size */
1178 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1179 xfs_dir2_sf_t *oldsfp; /* old sf directory */
1180 int oldsize; /* old inode size */
1181 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1182 xfs_dir2_sf_t *sfp; /* new sf directory */
1183
1184 xfs_dir2_trace_args("sf_toino4", args);
1185 dp = args->dp;
1186
1187 /*
1188 * Copy the old directory to the buffer.
1189 * Then nuke it from the inode, and add the new buffer to the inode.
1190 * Don't want xfs_idata_realloc copying the data here.
1191 */
1192 oldsize = dp->i_df.if_bytes;
1193 buf = kmem_alloc(oldsize, KM_SLEEP);
1194 oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1195 ASSERT(oldsfp->hdr.i8count == 1);
1196 memcpy(buf, oldsfp, oldsize);
1197 /*
1198 * Compute the new inode size.
1199 */
1200 newsize =
1201 oldsize -
1202 (oldsfp->hdr.count + 1) *
1203 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1204 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1205 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1206 /*
1207 * Reset our pointers, the data has moved.
1208 */
1209 oldsfp = (xfs_dir2_sf_t *)buf;
1210 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1211 /*
1212 * Fill in the new header.
1213 */
1214 sfp->hdr.count = oldsfp->hdr.count;
1215 sfp->hdr.i8count = 0;
1216 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
1217 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
1218 /*
1219 * Copy the entries field by field.
1220 */
1221 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
1222 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
1223 i < sfp->hdr.count;
1224 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
1225 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
1226 sfep->namelen = oldsfep->namelen;
1227 sfep->offset = oldsfep->offset;
1228 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1229 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
1230 XFS_DIR2_SF_INUMBERP(oldsfep));
1231 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
1232 }
1233 /*
1234 * Clean up the inode.
1235 */
1236 kmem_free(buf, oldsize);
1237 dp->i_d.di_size = newsize;
1238 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1239}
1240
1241/*
1242 * Convert from 4-byte inode numbers to 8-byte inode numbers.
1243 * The new 8-byte inode number is not there yet; we leave with the
1244 * count at 1, but no corresponding entry.
1245 */
1246static void
1247xfs_dir2_sf_toino8(
1248 xfs_da_args_t *args) /* operation arguments */
1249{
1250 char *buf; /* old dir's buffer */
1251 xfs_inode_t *dp; /* incore directory inode */
1252 int i; /* entry index */
1253 xfs_ino_t ino; /* entry inode number */
1254 int newsize; /* new inode size */
1255 xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
1256 xfs_dir2_sf_t *oldsfp; /* old sf directory */
1257 int oldsize; /* old inode size */
1258 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1259 xfs_dir2_sf_t *sfp; /* new sf directory */
1260
1261 xfs_dir2_trace_args("sf_toino8", args);
1262 dp = args->dp;
1263
1264 /*
1265 * Copy the old directory to the buffer.
1266 * Then nuke it from the inode, and add the new buffer to the inode.
1267 * Don't want xfs_idata_realloc copying the data here.
1268 */
1269 oldsize = dp->i_df.if_bytes;
1270 buf = kmem_alloc(oldsize, KM_SLEEP);
1271 oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1272 ASSERT(oldsfp->hdr.i8count == 0);
1273 memcpy(buf, oldsfp, oldsize);
1274 /*
1275 * Compute the new inode size.
1276 */
1277 newsize =
1278 oldsize +
1279 (oldsfp->hdr.count + 1) *
1280 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1281 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1282 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1283 /*
1284 * Reset our pointers, the data has moved.
1285 */
1286 oldsfp = (xfs_dir2_sf_t *)buf;
1287 sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
1288 /*
1289 * Fill in the new header.
1290 */
1291 sfp->hdr.count = oldsfp->hdr.count;
1292 sfp->hdr.i8count = 1;
1293 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent);
1294 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent);
1295 /*
1296 * Copy the entries field by field.
1297 */
1298 for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
1299 oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
1300 i < sfp->hdr.count;
1301 i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
1302 oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
1303 sfep->namelen = oldsfep->namelen;
1304 sfep->offset = oldsfep->offset;
1305 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1306 ino = XFS_DIR2_SF_GET_INUMBER(oldsfp,
1307 XFS_DIR2_SF_INUMBERP(oldsfep));
1308 XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep));
1309 }
1310 /*
1311 * Clean up the inode.
1312 */
1313 kmem_free(buf, oldsize);
1314 dp->i_d.di_size = newsize;
1315 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1316}
1317#endif /* XFS_BIG_INUMS */
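/*
 * Worked example for the two conversions above: both resize the inline
 * data by (count + 1) inode numbers (the +1 being the parent in the
 * header), each changing by sizeof(xfs_dir2_ino8_t) -
 * sizeof(xfs_dir2_ino4_t) = 4 bytes. Converting a 5-entry directory to
 * 8-byte inodes therefore grows it by (5 + 1) * 4 = 24 bytes; converting
 * back shrinks it by the same 24.
 */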
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
new file mode 100644
index 000000000000..bac6f5a2a312
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -0,0 +1,243 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_SF_H__
33#define __XFS_DIR2_SF_H__
34
35/*
36 * Directory layout when stored internal to an inode.
37 *
38 * Small directories are packed as tightly as possible so as to
39 * fit into the literal area of the inode.
40 */
41
42struct uio;
43struct xfs_dabuf;
44struct xfs_da_args;
45struct xfs_dir2_block;
46struct xfs_inode;
47struct xfs_mount;
48struct xfs_trans;
49
50/*
51 * Maximum size of a shortform directory.
52 */
53#define XFS_DIR2_SF_MAX_SIZE \
54 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
55 (uint)sizeof(xfs_agino_t))
56
57/*
58 * Inode number stored as 8 8-bit values.
59 */
60typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
61
62/*
63 * Inode number stored as 4 8-bit values.
64 * Works a lot of the time, when all the inode numbers in a directory
65 * fit in 32 bits.
66 */
67typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
68
69typedef union {
70 xfs_dir2_ino8_t i8;
71 xfs_dir2_ino4_t i4;
72} xfs_dir2_inou_t;
73#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
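/*
 * A minimal sketch (hypothetical helper, assuming the usual xfs type
 * headers are in scope): an inode number needs the 8-byte form exactly
 * when it exceeds XFS_DIR2_MAX_SHORT_INUM, i.e. does not fit in 32 bits.
 * This is the predicate behind all the i8count bookkeeping in
 * xfs_dir2_sf.c.
 */
static inline int example_ino_needs_i8(xfs_ino_t ino)
{
	return ino > XFS_DIR2_MAX_SHORT_INUM;	/* > 0xffffffff => 8 bytes */
}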
74
75/*
76 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
77 * Only 16 bits are needed; this is the byte offset into the single-block form.
78 */
79typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
80
81/*
82 * The parent directory has a dedicated field, and the self-pointer must
83 * be calculated on the fly.
84 *
85 * Entries are packed toward the top as tightly as possible. The header
86 * and the elements must be memcpy'd out into a work area to get correct
87 * alignment for the inode number fields.
88 */
89typedef struct xfs_dir2_sf_hdr {
90 __uint8_t count; /* count of entries */
91 __uint8_t i8count; /* count of 8-byte inode #s */
92 xfs_dir2_inou_t parent; /* parent dir inode number */
93} xfs_dir2_sf_hdr_t;
94
95typedef struct xfs_dir2_sf_entry {
96 __uint8_t namelen; /* actual name length */
97 xfs_dir2_sf_off_t offset; /* saved offset */
98 __uint8_t name[1]; /* name, variable size */
99 xfs_dir2_inou_t inumber; /* inode number, var. offset */
100} xfs_dir2_sf_entry_t;
101
102typedef struct xfs_dir2_sf {
103 xfs_dir2_sf_hdr_t hdr; /* shortform header */
104 xfs_dir2_sf_entry_t list[1]; /* shortform entries */
105} xfs_dir2_sf_t;
106
107#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_HDR_SIZE)
108int xfs_dir2_sf_hdr_size(int i8count);
109#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count)
110#else
111#define XFS_DIR2_SF_HDR_SIZE(i8count) \
112 ((uint)sizeof(xfs_dir2_sf_hdr_t) - \
113 ((i8count) == 0) * \
114 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
115#endif
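/*
 * Worked example: the header is count (1) + i8count (1) + parent
 * (xfs_dir2_inou_t, 8) = 10 bytes, but with i8count == 0 the parent is
 * stored as a 4-byte xfs_dir2_ino4_t, so
 *
 *	XFS_DIR2_SF_HDR_SIZE(0) = 10 - (8 - 4) = 6
 *	XFS_DIR2_SF_HDR_SIZE(1) = 10
 */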
116
117#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_INUMBERP)
118xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep);
119#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep)
120#else
121#define XFS_DIR2_SF_INUMBERP(sfep) \
122 ((xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen])
123#endif
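/*
 * Layout example: the name is variable length and the inode number is
 * packed immediately after it, so the inumber has no fixed offset; the
 * macro above finds it at &name[namelen]. For namelen == 3:
 *
 *	byte  0		namelen (3)
 *	bytes 1-2	offset
 *	bytes 3-5	name
 *	bytes 6..	inumber (4 or 8 bytes, per hdr.i8count)
 */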
124
125#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_INUMBER)
126xfs_intino_t xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from);
127#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
128 xfs_dir2_sf_get_inumber(sfp, from)
129
130#else
131#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \
132 ((sfp)->hdr.i8count == 0 ? \
133 (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \
134 (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8))
135#endif
136
137#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
138void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
139 xfs_dir2_inou_t *to);
140#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
141 xfs_dir2_sf_put_inumber(sfp,from,to)
142#else
143#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \
144 if ((sfp)->hdr.i8count == 0) { \
145 XFS_PUT_DIR_INO4(*(from), (to)->i4); \
146 } else { \
147 XFS_PUT_DIR_INO8(*(from), (to)->i8); \
148 }
149#endif
150
151#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_OFFSET)
152xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep);
153#define XFS_DIR2_SF_GET_OFFSET(sfep) \
154 xfs_dir2_sf_get_offset(sfep)
155#else
156#define XFS_DIR2_SF_GET_OFFSET(sfep) \
157 INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i)
158#endif
159
160#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
161void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep,
162 xfs_dir2_data_aoff_t off);
163#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
164 xfs_dir2_sf_put_offset(sfep,off)
165#else
166#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \
167 INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i,off)
168#endif
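
Both offset accessors come down to byte-at-a-time big-endian loads and
stores, so the packed 2-byte field never needs natural alignment. A plain-C
sketch of what the INT_*_UNALIGNED_16_BE macros amount to (get16_be and
put16_be are illustrative names, not the kernel macros):

#include <stdint.h>

/* Read a 16-bit big-endian value one byte at a time; p may be unaligned. */
static uint16_t get16_be(const uint8_t *p)
{
        return (uint16_t)((p[0] << 8) | p[1]);
}

/* Write a 16-bit big-endian value one byte at a time. */
static void put16_be(uint8_t *p, uint16_t v)
{
        p[0] = (uint8_t)(v >> 8);
        p[1] = (uint8_t)(v & 0xff);
}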
169
170#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
171int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len);
172#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \
173 xfs_dir2_sf_entsize_byname(sfp,len)
174#else
175#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) /* space a name uses */ \
176 ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
177 ((sfp)->hdr.i8count == 0) * \
178 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
179#endif
180
181#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
182int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep);
183#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \
184 xfs_dir2_sf_entsize_byentry(sfp,sfep)
185#else
186#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) /* space an entry uses */ \
187 ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
188 ((sfp)->hdr.i8count == 0) * \
189 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
190#endif
191
192#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
193xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp);
194#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp)
195#else
196#define XFS_DIR2_SF_FIRSTENTRY(sfp) /* first entry in struct */ \
197 ((xfs_dir2_sf_entry_t *) \
198 ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)))
199#endif
200
201#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_NEXTENTRY)
202xfs_dir2_sf_entry_t *xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp,
203 xfs_dir2_sf_entry_t *sfep);
204#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep)
205#else
206#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) /* next entry in struct */ \
207 ((xfs_dir2_sf_entry_t *) \
208 ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)))
209#endif
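
Taken together, the macros above describe a packed stream of variable-length
records: the header, then entries laid end to end, each sized by its name
length and by whether the directory stores 4- or 8-byte inode numbers. The
standalone model below mirrors the ENTSIZE_BYNAME/NEXTENTRY arithmetic
(sf_entsize, sf_next and the simplified structs are hypothetical, not the
kernel types):

#include <stdint.h>

/* Simplified shortform records; all fields byte-sized, so no padding. */
struct sf_hdr   { uint8_t count, i8count, parent[8]; };
struct sf_entry { uint8_t namelen, offset[2], name[1]; /* ino after name */ };

/*
 * Space an entry uses: the fixed bytes, the real name (minus the name[1]
 * placeholder), an 8-byte inode number, less 4 bytes when every inode
 * number in the directory fits in 32 bits (i8count == 0).
 */
static int sf_entsize(const struct sf_hdr *hdr, int namelen)
{
        return (int)sizeof(struct sf_entry) - 1 + namelen + 8
                - (hdr->i8count == 0 ? 8 - 4 : 0);
}

/* Step to the next packed entry, as XFS_DIR2_SF_NEXTENTRY does. */
static struct sf_entry *sf_next(const struct sf_hdr *hdr, struct sf_entry *ep)
{
        return (struct sf_entry *)((char *)ep + sf_entsize(hdr, ep->namelen));
}

Because the inode number sits at a variable offset after the name, callers
must go through the accessors; nothing past `offset` is naturally aligned.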
210
211/*
212 * Functions.
213 */
214
215extern int
216 xfs_dir2_block_sfsize(struct xfs_inode *dp,
217 struct xfs_dir2_block *block,
218 xfs_dir2_sf_hdr_t *sfhp);
219
220extern int
221 xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
222 int size, xfs_dir2_sf_hdr_t *sfhp);
223
224extern int
225 xfs_dir2_sf_addname(struct xfs_da_args *args);
226
227extern int
228 xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
229
230extern int
231 xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
232 struct xfs_dirent *dbp, xfs_dir2_put_t put);
233
234extern int
235 xfs_dir2_sf_lookup(struct xfs_da_args *args);
236
237extern int
238 xfs_dir2_sf_removename(struct xfs_da_args *args);
239
240extern int
241 xfs_dir2_sf_replace(struct xfs_da_args *args);
242
243#endif /* __XFS_DIR2_SF_H__ */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
new file mode 100644
index 000000000000..9d6417393140
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -0,0 +1,235 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir2_trace.c
35 * Tracing for xfs v2 directories.
36 */
37#include "xfs.h"
38
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_bmap_btree.h"
44#include "xfs_attr_sf.h"
45#include "xfs_dir_sf.h"
46#include "xfs_dir2_sf.h"
47#include "xfs_dinode.h"
48#include "xfs_inode.h"
49#include "xfs_da_btree.h"
50#include "xfs_dir2_trace.h"
51
52#ifdef XFS_DIR2_TRACE
53ktrace_t *xfs_dir2_trace_buf;
54
55/*
56 * Enter something in the trace buffers.
57 */
58static void
59xfs_dir2_trace_enter(
60 xfs_inode_t *dp,
61 int type,
62 char *where,
63 char *name,
64 int namelen,
65 void *a0,
66 void *a1,
67 void *a2,
68 void *a3,
69 void *a4,
70 void *a5,
71 void *a6,
72 void *a7)
73{
74 void *n[5];
75
76 ASSERT(xfs_dir2_trace_buf);
77 ASSERT(dp->i_dir_trace);
78 if (name)
79 memcpy(n, name, min((int)sizeof(n), namelen));
80 else
81 memset((char *)n, 0, sizeof(n));
82 ktrace_enter(xfs_dir2_trace_buf,
83 (void *)(long)type, (void *)where,
84 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
85 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
86 (void *)(long)namelen,
87 (void *)n[0], (void *)n[1], (void *)n[2],
88 (void *)n[3], (void *)n[4]);
89 ktrace_enter(dp->i_dir_trace,
90 (void *)(long)type, (void *)where,
91 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
92 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
93 (void *)(long)namelen,
94 (void *)n[0], (void *)n[1], (void *)n[2],
95 (void *)n[3], (void *)n[4]);
96}
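
Each ktrace record is fixed-size, so xfs_dir2_trace_enter() captures at most
sizeof(n) bytes of the entry name into five pointer-sized slots, 20 bytes on
a 32-bit build and 40 on a 64-bit one; longer names are simply truncated in
the trace. The capture pattern in isolation (capture_name is a hypothetical
name for this sketch):

#include <string.h>

/* Copy at most `cap` bytes of a possibly long, possibly NULL name into a
 * fixed-size scratch area, zero-filling when there is no name at all. */
static void capture_name(void *scratch, size_t cap, const char *name,
                         size_t len)
{
        if (name)
                memcpy(scratch, name, len < cap ? len : cap);
        else
                memset(scratch, 0, cap);
}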
97
98void
99xfs_dir2_trace_args(
100 char *where,
101 xfs_da_args_t *args)
102{
103 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
104 (char *)args->name, (int)args->namelen,
105 (void *)(unsigned long)args->hashval,
106 (void *)((unsigned long)(args->inumber >> 32)),
107 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
108 (void *)args->dp, (void *)args->trans,
109 (void *)(unsigned long)args->justcheck, NULL, NULL);
110}
111
112void
113xfs_dir2_trace_args_b(
114 char *where,
115 xfs_da_args_t *args,
116 xfs_dabuf_t *bp)
117{
118 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
119 (char *)args->name, (int)args->namelen,
120 (void *)(unsigned long)args->hashval,
121 (void *)((unsigned long)(args->inumber >> 32)),
122 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
123 (void *)args->dp, (void *)args->trans,
124 (void *)(unsigned long)args->justcheck,
125 (void *)(bp ? bp->bps[0] : NULL), NULL);
126}
127
128void
129xfs_dir2_trace_args_bb(
130 char *where,
131 xfs_da_args_t *args,
132 xfs_dabuf_t *lbp,
133 xfs_dabuf_t *dbp)
134{
135 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
136 (char *)args->name, (int)args->namelen,
137 (void *)(unsigned long)args->hashval,
138 (void *)((unsigned long)(args->inumber >> 32)),
139 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
140 (void *)args->dp, (void *)args->trans,
141 (void *)(unsigned long)args->justcheck,
142 (void *)(lbp ? lbp->bps[0] : NULL),
143 (void *)(dbp ? dbp->bps[0] : NULL));
144}
145
146void
147xfs_dir2_trace_args_bibii(
148 char *where,
149 xfs_da_args_t *args,
150 xfs_dabuf_t *bs,
151 int ss,
152 xfs_dabuf_t *bd,
153 int sd,
154 int c)
155{
156 xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL;
157 xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL;
158
159 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
160 (char *)args->name, (int)args->namelen,
161 (void *)args->dp, (void *)args->trans,
162 (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd,
163 (void *)(long)c, NULL);
164}
165
166void
167xfs_dir2_trace_args_db(
168 char *where,
169 xfs_da_args_t *args,
170 xfs_dir2_db_t db,
171 xfs_dabuf_t *bp)
172{
173 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
174
175 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
176 (char *)args->name, (int)args->namelen,
177 (void *)(unsigned long)args->hashval,
178 (void *)((unsigned long)(args->inumber >> 32)),
179 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
180 (void *)args->dp, (void *)args->trans,
181 (void *)(unsigned long)args->justcheck, (void *)(long)db,
182 (void *)dbp);
183}
184
185void
186xfs_dir2_trace_args_i(
187 char *where,
188 xfs_da_args_t *args,
189 xfs_ino_t i)
190{
191 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
192 (char *)args->name, (int)args->namelen,
193 (void *)(unsigned long)args->hashval,
194 (void *)((unsigned long)(args->inumber >> 32)),
195 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
196 (void *)args->dp, (void *)args->trans,
197 (void *)(unsigned long)args->justcheck,
198 (void *)((unsigned long)(i >> 32)),
199 (void *)((unsigned long)(i & 0xFFFFFFFF)));
200}
201
202void
203xfs_dir2_trace_args_s(
204 char *where,
205 xfs_da_args_t *args,
206 int s)
207{
208 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
209 (char *)args->name, (int)args->namelen,
210 (void *)(unsigned long)args->hashval,
211 (void *)((unsigned long)(args->inumber >> 32)),
212 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
213 (void *)args->dp, (void *)args->trans,
214 (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL);
215}
216
217void
218xfs_dir2_trace_args_sb(
219 char *where,
220 xfs_da_args_t *args,
221 int s,
222 xfs_dabuf_t *bp)
223{
224 xfs_buf_t *dbp = bp ? bp->bps[0] : NULL;
225
226 xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
227 (char *)args->name, (int)args->namelen,
228 (void *)(unsigned long)args->hashval,
229 (void *)((unsigned long)(args->inumber >> 32)),
230 (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
231 (void *)args->dp, (void *)args->trans,
232 (void *)(unsigned long)args->justcheck, (void *)(long)s,
233 (void *)dbp);
234}
235#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
new file mode 100644
index 000000000000..0a178bffa806
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.h
@@ -0,0 +1,86 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR2_TRACE_H__
33#define __XFS_DIR2_TRACE_H__
34
35/*
36 * Tracing for xfs v2 directories.
37 */
38
39#if defined(XFS_DIR2_TRACE)
40
41struct ktrace;
42struct xfs_dabuf;
43struct xfs_da_args;
44
45#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */
46#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */
47extern struct ktrace *xfs_dir2_trace_buf;
48
49#define XFS_DIR2_KTRACE_ARGS 1 /* args only */
50#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */
51#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */
52#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */
53#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */
54#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */
55#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */
56#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */
57
58void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
59void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
60 struct xfs_dabuf *bp);
61void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
62 struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
63void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
64 struct xfs_dabuf *bs, int ss,
65 struct xfs_dabuf *bd, int sd, int c);
66void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
67 xfs_dir2_db_t db, struct xfs_dabuf *bp);
68void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
69void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
70void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
71 struct xfs_dabuf *bp);
72
73#else /* XFS_DIR2_TRACE */
74
75#define xfs_dir2_trace_args(where, args)
76#define xfs_dir2_trace_args_b(where, args, bp)
77#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
78#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
79#define xfs_dir2_trace_args_db(where, args, db, bp)
80#define xfs_dir2_trace_args_i(where, args, i)
81#define xfs_dir2_trace_args_s(where, args, s)
82#define xfs_dir2_trace_args_sb(where, args, s, bp)
83
84#endif /* XFS_DIR2_TRACE */
85
86#endif /* __XFS_DIR2_TRACE_H__ */
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
new file mode 100644
index 000000000000..617018d6bbdc
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -0,0 +1,2231 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * xfs_dir_leaf.c
35 *
36 * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
37 */
38
39#include "xfs.h"
40
41#include "xfs_macros.h"
42#include "xfs_types.h"
43#include "xfs_inum.h"
44#include "xfs_log.h"
45#include "xfs_trans.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_bmap_btree.h"
53#include "xfs_ialloc_btree.h"
54#include "xfs_alloc.h"
55#include "xfs_btree.h"
56#include "xfs_attr_sf.h"
57#include "xfs_dir_sf.h"
58#include "xfs_dir2_sf.h"
59#include "xfs_dinode.h"
60#include "xfs_inode_item.h"
61#include "xfs_inode.h"
62#include "xfs_bmap.h"
63#include "xfs_da_btree.h"
64#include "xfs_dir_leaf.h"
65#include "xfs_error.h"
66
67/*
68 * xfs_dir_leaf.c
69 *
70 * Routines to implement leaf blocks of directories as Btrees of hashed names.
71 */
72
73/*========================================================================
74 * Function prototypes for the kernel.
75 *========================================================================*/
76
77/*
78 * Routines used for growing the Btree.
79 */
80STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
81 int insertion_index,
82 int freemap_index);
83STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
84 int musthave, int justcheck);
85STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
86 xfs_da_state_blk_t *blk1,
87 xfs_da_state_blk_t *blk2);
88STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
89 xfs_da_state_blk_t *leaf_blk_1,
90 xfs_da_state_blk_t *leaf_blk_2,
91 int *number_entries_in_blk1,
92 int *number_namebytes_in_blk1);
93
94/*
95 * Utility routines.
96 */
97STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
98 int src_start,
99 xfs_dir_leafblock_t *dst_leaf,
100 int dst_start, int move_count,
101 xfs_mount_t *mp);
102
103
104/*========================================================================
105 * External routines when dirsize < XFS_IFORK_DSIZE(dp).
106 *========================================================================*/
107
108
109/*
110 * Validate a given inode number.
111 */
112int
113xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
114{
115 xfs_agblock_t agblkno;
116 xfs_agino_t agino;
117 xfs_agnumber_t agno;
118 int ino_ok;
119 int ioff;
120
121 agno = XFS_INO_TO_AGNO(mp, ino);
122 agblkno = XFS_INO_TO_AGBNO(mp, ino);
123 ioff = XFS_INO_TO_OFFSET(mp, ino);
124 agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
125 ino_ok =
126 agno < mp->m_sb.sb_agcount &&
127 agblkno < mp->m_sb.sb_agblocks &&
128 agblkno != 0 &&
129 ioff < (1 << mp->m_sb.sb_inopblog) &&
130 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
131 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
132 XFS_RANDOM_DIR_INO_VALIDATE))) {
133 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx",
134 (unsigned long long) ino);
135 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
136 return XFS_ERROR(EFSCORRUPTED);
137 }
138 return 0;
139}
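
xfs_dir_ino_validate() decomposes the 64-bit inode number into AG number, AG
block, and offset within the block, then range-checks each piece against the
filesystem geometry, recomposing via the AG macros as a final consistency
check. A freestanding sketch of the meaningful range checks, with the bit
widths taken as parameters (in the kernel they come from the superblock,
e.g. sb_inopblog):

#include <stdint.h>

/* ino layout, low bits to high: offset-in-block, block-in-AG, AG number. */
static int ino_ok(uint64_t ino, unsigned inopblog, unsigned agblklog,
                  uint32_t agcount, uint32_t agblocks)
{
        uint64_t agbno = (ino >> inopblog) & ((1ull << agblklog) - 1);
        uint64_t agno  = ino >> (inopblog + agblklog);

        return agno < agcount &&        /* the AG exists */
               agbno != 0 &&            /* AG block 0 holds headers, no inodes */
               agbno < agblocks;        /* the block lies inside the AG */
}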
140
141/*
142 * Create the initial contents of a shortform directory.
143 */
144int
145xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
146{
147 xfs_dir_sf_hdr_t *hdr;
148 xfs_inode_t *dp;
149
150 dp = args->dp;
151 ASSERT(dp != NULL);
152 ASSERT(dp->i_d.di_size == 0);
153 if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
154 dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */
155 dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
156 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
157 dp->i_df.if_flags |= XFS_IFINLINE;
158 }
159 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
160 ASSERT(dp->i_df.if_bytes == 0);
161 xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
162 hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
163 XFS_DIR_SF_PUT_DIRINO(&parent, &hdr->parent);
164
165 hdr->count = 0;
166 dp->i_d.di_size = sizeof(*hdr);
167 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
168 return(0);
169}
170
171/*
172 * Add a name to the shortform directory structure.
173 * The caller has already checked that the new entry fits in the inode.
174 */
175int
176xfs_dir_shortform_addname(xfs_da_args_t *args)
177{
178 xfs_dir_shortform_t *sf;
179 xfs_dir_sf_entry_t *sfe;
180 int i, offset, size;
181 xfs_inode_t *dp;
182
183 dp = args->dp;
184 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
185 /*
186 * Catch the case where the conversion from shortform to leaf
187 * failed part way through.
188 */
189 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
190 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
191 return XFS_ERROR(EIO);
192 }
193 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
194 ASSERT(dp->i_df.if_u1.if_data != NULL);
195 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
196 sfe = &sf->list[0];
197 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
198 if (sfe->namelen == args->namelen &&
199 args->name[0] == sfe->name[0] &&
200 memcmp(args->name, sfe->name, args->namelen) == 0)
201 return(XFS_ERROR(EEXIST));
202 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
203 }
204
205 offset = (int)((char *)sfe - (char *)sf);
206 size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
207 xfs_idata_realloc(dp, size, XFS_DATA_FORK);
208 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
209 sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
210
211 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
212 sfe->namelen = args->namelen;
213 memcpy(sfe->name, args->name, sfe->namelen);
214 INT_MOD(sf->hdr.count, ARCH_CONVERT, +1);
215
216 dp->i_d.di_size += size;
217 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
218
219 return(0);
220}
221
222/*
223 * Remove a name from the shortform directory structure.
224 */
225int
226xfs_dir_shortform_removename(xfs_da_args_t *args)
227{
228 xfs_dir_shortform_t *sf;
229 xfs_dir_sf_entry_t *sfe;
230 int base, size = 0, i;
231 xfs_inode_t *dp;
232
233 dp = args->dp;
234 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
235 /*
236 * Catch the case where the conversion from shortform to leaf
237 * failed part way through.
238 */
239 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
240 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
241 return XFS_ERROR(EIO);
242 }
243 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
244 ASSERT(dp->i_df.if_u1.if_data != NULL);
245 base = sizeof(xfs_dir_sf_hdr_t);
246 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
247 sfe = &sf->list[0];
248 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
249 size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
250 if (sfe->namelen == args->namelen &&
251 sfe->name[0] == args->name[0] &&
252 memcmp(sfe->name, args->name, args->namelen) == 0)
253 break;
254 base += size;
255 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
256 }
257 if (i < 0) {
258 ASSERT(args->oknoent);
259 return(XFS_ERROR(ENOENT));
260 }
261
262 if ((base + size) != dp->i_d.di_size) {
263 memmove(&((char *)sf)[base], &((char *)sf)[base+size],
264 dp->i_d.di_size - (base+size));
265 }
266 INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
267
268 xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
269 dp->i_d.di_size -= size;
270 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
271
272 return(0);
273}
274
275/*
276 * Look up a name in a shortform directory structure.
277 */
278int
279xfs_dir_shortform_lookup(xfs_da_args_t *args)
280{
281 xfs_dir_shortform_t *sf;
282 xfs_dir_sf_entry_t *sfe;
283 int i;
284 xfs_inode_t *dp;
285
286 dp = args->dp;
287 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
288 /*
289 * Catch the case where the conversion from shortform to leaf
290 * failed part way through.
291 */
292 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
293 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
294 return XFS_ERROR(EIO);
295 }
296 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
297 ASSERT(dp->i_df.if_u1.if_data != NULL);
298 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
299 if (args->namelen == 2 &&
300 args->name[0] == '.' && args->name[1] == '.') {
301 XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &args->inumber);
302 return(XFS_ERROR(EEXIST));
303 }
304 if (args->namelen == 1 && args->name[0] == '.') {
305 args->inumber = dp->i_ino;
306 return(XFS_ERROR(EEXIST));
307 }
308 sfe = &sf->list[0];
309 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
310 if (sfe->namelen == args->namelen &&
311 sfe->name[0] == args->name[0] &&
312 memcmp(args->name, sfe->name, args->namelen) == 0) {
313 XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args->inumber);
314 return(XFS_ERROR(EEXIST));
315 }
316 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
317 }
318 ASSERT(args->oknoent);
319 return(XFS_ERROR(ENOENT));
320}
321
322/*
323 * Convert from using the shortform to the leaf.
324 */
325int
326xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
327{
328 xfs_inode_t *dp;
329 xfs_dir_shortform_t *sf;
330 xfs_dir_sf_entry_t *sfe;
331 xfs_da_args_t args;
332 xfs_ino_t inumber;
333 char *tmpbuffer;
334 int retval, i, size;
335 xfs_dablk_t blkno;
336 xfs_dabuf_t *bp;
337
338 dp = iargs->dp;
339 /*
340 * Catch the case where the conversion from shortform to leaf
341 * failed part way through.
342 */
343 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
344 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
345 return XFS_ERROR(EIO);
346 }
347 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
348 ASSERT(dp->i_df.if_u1.if_data != NULL);
349 size = dp->i_df.if_bytes;
350 tmpbuffer = kmem_alloc(size, KM_SLEEP);
351 ASSERT(tmpbuffer != NULL);
352
353 memcpy(tmpbuffer, dp->i_df.if_u1.if_data, size);
354
355 sf = (xfs_dir_shortform_t *)tmpbuffer;
356 XFS_DIR_SF_GET_DIRINO(&sf->hdr.parent, &inumber);
357
358 xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
359 dp->i_d.di_size = 0;
360 xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
361 retval = xfs_da_grow_inode(iargs, &blkno);
362 if (retval)
363 goto out;
364
365 ASSERT(blkno == 0);
366 retval = xfs_dir_leaf_create(iargs, blkno, &bp);
367 if (retval)
368 goto out;
369 xfs_da_buf_done(bp);
370
371 args.name = ".";
372 args.namelen = 1;
373 args.hashval = xfs_dir_hash_dot;
374 args.inumber = dp->i_ino;
375 args.dp = dp;
376 args.firstblock = iargs->firstblock;
377 args.flist = iargs->flist;
378 args.total = iargs->total;
379 args.whichfork = XFS_DATA_FORK;
380 args.trans = iargs->trans;
381 args.justcheck = 0;
382 args.addname = args.oknoent = 1;
383 retval = xfs_dir_leaf_addname(&args);
384 if (retval)
385 goto out;
386
387 args.name = "..";
388 args.namelen = 2;
389 args.hashval = xfs_dir_hash_dotdot;
390 args.inumber = inumber;
391 retval = xfs_dir_leaf_addname(&args);
392 if (retval)
393 goto out;
394
395 sfe = &sf->list[0];
396 for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
397 args.name = (char *)(sfe->name);
398 args.namelen = sfe->namelen;
399 args.hashval = xfs_da_hashname((char *)(sfe->name),
400 sfe->namelen);
401 XFS_DIR_SF_GET_DIRINO(&sfe->inumber, &args.inumber);
402 retval = xfs_dir_leaf_addname(&args);
403 if (retval)
404 goto out;
405 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
406 }
407 retval = 0;
408
409out:
410 kmem_free(tmpbuffer, size);
411 return(retval);
412}
413
414STATIC int
415xfs_dir_shortform_compare(const void *a, const void *b)
416{
417 xfs_dir_sf_sort_t *sa, *sb;
418
419 sa = (xfs_dir_sf_sort_t *)a;
420 sb = (xfs_dir_sf_sort_t *)b;
421 if (sa->hash < sb->hash)
422 return -1;
423 else if (sa->hash > sb->hash)
424 return 1;
425 else
426 return sa->entno - sb->entno;
427}
428
429/*
430 * Copy out directory entries for getdents(), for shortform directories.
431 */
432/*ARGSUSED*/
433int
434xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
435 xfs_dirent_t *dbp, xfs_dir_put_t put)
436{
437 xfs_dir_shortform_t *sf;
438 xfs_dir_sf_entry_t *sfe;
439 int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
440 xfs_mount_t *mp;
441 xfs_dahash_t cookhash, hash;
442 xfs_dir_put_args_t p;
443 xfs_dir_sf_sort_t *sbuf, *sbp;
444
445 mp = dp->i_mount;
446 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
447 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
448 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
449 nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2;
450 sbsize = (nsbuf + 1) * sizeof(*sbuf);
451 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
452
453 xfs_dir_trace_g_du("sf: start", dp, uio);
454
455 /*
456 * Collect all the entries into the buffer.
457 * Entry 0 is .
458 */
459 sbp->entno = 0;
460 sbp->seqno = 0;
461 sbp->hash = xfs_dir_hash_dot;
462 sbp->ino = dp->i_ino;
463 sbp->name = ".";
464 sbp->namelen = 1;
465 sbp++;
466
467 /*
468 * Entry 1 is ..
469 */
470 sbp->entno = 1;
471 sbp->seqno = 0;
472 sbp->hash = xfs_dir_hash_dotdot;
473 sbp->ino = XFS_GET_DIR_INO8(sf->hdr.parent);
474 sbp->name = "..";
475 sbp->namelen = 2;
476 sbp++;
477
478 /*
479 * Scan the directory data for the rest of the entries.
480 */
481 for (i = 0, sfe = &sf->list[0];
482 i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
483
484 if (unlikely(
485 ((char *)sfe < (char *)sf) ||
486 ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)))) {
487 xfs_dir_trace_g_du("sf: corrupted", dp, uio);
488 XFS_CORRUPTION_ERROR("xfs_dir_shortform_getdents",
489 XFS_ERRLEVEL_LOW, mp, sfe);
490 kmem_free(sbuf, sbsize);
491 return XFS_ERROR(EFSCORRUPTED);
492 }
493
494 sbp->entno = i + 2;
495 sbp->seqno = 0;
496 sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
497 sbp->ino = XFS_GET_DIR_INO8(sfe->inumber);
498 sbp->name = (char *)sfe->name;
499 sbp->namelen = sfe->namelen;
500 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
501 sbp++;
502 }
503
504 /*
505 * Sort the entries on hash then entno.
506 */
507 qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
508 /*
509 * Stuff in last entry.
510 */
511 sbp->entno = nsbuf;
512 sbp->hash = XFS_DA_MAXHASH;
513 sbp->seqno = 0;
514 /*
515 * Figure out the sequence numbers in case there's a hash duplicate.
516 */
517 for (hash = sbuf->hash, sbp = sbuf + 1;
518 sbp < &sbuf[nsbuf + 1]; sbp++) {
519 if (sbp->hash == hash)
520 sbp->seqno = sbp[-1].seqno + 1;
521 else
522 hash = sbp->hash;
523 }
524
525 /*
526 * Set up put routine.
527 */
528 p.dbp = dbp;
529 p.put = put;
530 p.uio = uio;
531
532 /*
533 * Find our place.
534 */
535 for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
536 if (sbp->hash > cookhash ||
537 (sbp->hash == cookhash && sbp->seqno >= want_entno))
538 break;
539 }
540
541 /*
542 * Did we fail to find anything? We stop at the last entry,
543 * the one we put maxhash into.
544 */
545 if (sbp == &sbuf[nsbuf]) {
546 kmem_free(sbuf, sbsize);
547 xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
548 uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
549 *eofp = 1;
550 return 0;
551 }
552
553 /*
554 * Loop putting entries into the user buffer.
555 */
556 while (sbp < &sbuf[nsbuf]) {
557 /*
558 * Save the first resid in a run of equal-hashval entries
559 * so that we can back them out if they don't all fit.
560 */
561 if (sbp->seqno == 0 || sbp == sbuf)
562 lastresid = uio->uio_resid;
563 XFS_PUT_COOKIE(p.cook, mp, 0, sbp[1].seqno, sbp[1].hash);
564 p.ino = sbp->ino;
565#if XFS_BIG_INUMS
566 p.ino += mp->m_inoadd;
567#endif
568 p.name = sbp->name;
569 p.namelen = sbp->namelen;
570 retval = p.put(&p);
571 if (!p.done) {
572 uio->uio_offset =
573 XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
574 kmem_free(sbuf, sbsize);
575 uio->uio_resid = lastresid;
576 xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
577 return retval;
578 }
579 sbp++;
580 }
581 kmem_free(sbuf, sbsize);
582 uio->uio_offset = p.cook.o;
583 *eofp = 1;
584 xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
585 return 0;
586}
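
A getdents cookie here is effectively (hash, seqno): the hash gives the sort
order, and the seqno pass above numbers runs of equal hash values so that a
later resume lands on the right duplicate rather than replaying or skipping
entries. The numbering rule in isolation (hypothetical struct; assumes
element 0 starts with seqno 0, as the code above arranges):

#include <stdint.h>

struct ent { uint32_t hash; int seqno; };

/* After sorting by hash, number duplicates 0, 1, 2, ... within each run. */
static void assign_seqnos(struct ent *e, int n)
{
        int i;

        for (i = 1; i < n; i++)
                e[i].seqno = (e[i].hash == e[i - 1].hash)
                        ? e[i - 1].seqno + 1 : 0;
}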
587
588/*
589 * Look up a name in a shortform directory structure, replace the inode number.
590 */
591int
592xfs_dir_shortform_replace(xfs_da_args_t *args)
593{
594 xfs_dir_shortform_t *sf;
595 xfs_dir_sf_entry_t *sfe;
596 xfs_inode_t *dp;
597 int i;
598
599 dp = args->dp;
600 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
601 /*
602 * Catch the case where the conversion from shortform to leaf
603 * failed part way through.
604 */
605 if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
606 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
607 return XFS_ERROR(EIO);
608 }
609 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
610 ASSERT(dp->i_df.if_u1.if_data != NULL);
611 sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
612 if (args->namelen == 2 &&
613 args->name[0] == '.' && args->name[1] == '.') {
614 /* XXX - replace assert? */
615 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sf->hdr.parent);
616 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
617 return(0);
618 }
619 ASSERT(args->namelen != 1 || args->name[0] != '.');
620 sfe = &sf->list[0];
621 for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
622 if (sfe->namelen == args->namelen &&
623 sfe->name[0] == args->name[0] &&
624 memcmp(args->name, sfe->name, args->namelen) == 0) {
625 ASSERT(memcmp((char *)&args->inumber,
626 (char *)&sfe->inumber, sizeof(xfs_ino_t)));
627 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &sfe->inumber);
628 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
629 return(0);
630 }
631 sfe = XFS_DIR_SF_NEXTENTRY(sfe);
632 }
633 ASSERT(args->oknoent);
634 return(XFS_ERROR(ENOENT));
635}
636
637/*
638 * Convert a leaf directory to shortform structure
639 */
640int
641xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
642{
643 xfs_dir_leafblock_t *leaf;
644 xfs_dir_leaf_hdr_t *hdr;
645 xfs_dir_leaf_entry_t *entry;
646 xfs_dir_leaf_name_t *namest;
647 xfs_da_args_t args;
648 xfs_inode_t *dp;
649 xfs_ino_t parent;
650 char *tmpbuffer;
651 int retval, i;
652 xfs_dabuf_t *bp;
653
654 dp = iargs->dp;
655 tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
656 ASSERT(tmpbuffer != NULL);
657
658 retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
659 XFS_DATA_FORK);
660 if (retval)
661 goto out;
662 ASSERT(bp != NULL);
663 memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
664 leaf = (xfs_dir_leafblock_t *)tmpbuffer;
665 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
666 memset(bp->data, 0, XFS_LBSIZE(dp->i_mount));
667
668 /*
669 * Find and special case the parent inode number
670 */
671 hdr = &leaf->hdr;
672 entry = &leaf->entries[0];
673 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
674 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
675 if ((entry->namelen == 2) &&
676 (namest->name[0] == '.') &&
677 (namest->name[1] == '.')) {
678 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &parent);
679 entry->nameidx = 0;
680 } else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
681 entry->nameidx = 0;
682 }
683 }
684 retval = xfs_da_shrink_inode(iargs, 0, bp);
685 if (retval)
686 goto out;
687 retval = xfs_dir_shortform_create(iargs, parent);
688 if (retval)
689 goto out;
690
691 /*
692 * Copy the rest of the filenames
693 */
694 entry = &leaf->entries[0];
695 args.dp = dp;
696 args.firstblock = iargs->firstblock;
697 args.flist = iargs->flist;
698 args.total = iargs->total;
699 args.whichfork = XFS_DATA_FORK;
700 args.trans = iargs->trans;
701 args.justcheck = 0;
702 args.addname = args.oknoent = 1;
703 for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
704 if (!entry->nameidx)
705 continue;
706 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
707 args.name = (char *)(namest->name);
708 args.namelen = entry->namelen;
709 args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
710 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args.inumber);
711 xfs_dir_shortform_addname(&args);
712 }
713
714out:
715 kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
716 return(retval);
717}
718
719/*
720 * Convert from using a single leaf to a root node and a leaf.
721 */
722int
723xfs_dir_leaf_to_node(xfs_da_args_t *args)
724{
725 xfs_dir_leafblock_t *leaf;
726 xfs_da_intnode_t *node;
727 xfs_inode_t *dp;
728 xfs_dabuf_t *bp1, *bp2;
729 xfs_dablk_t blkno;
730 int retval;
731
732 dp = args->dp;
733 retval = xfs_da_grow_inode(args, &blkno);
734 ASSERT(blkno == 1);
735 if (retval)
736 return(retval);
737 retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
738 XFS_DATA_FORK);
739 if (retval)
740 return(retval);
741 ASSERT(bp1 != NULL);
742 retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
743 XFS_DATA_FORK);
744 if (retval) {
745 xfs_da_buf_done(bp1);
746 return(retval);
747 }
748 ASSERT(bp2 != NULL);
749 memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount));
750 xfs_da_buf_done(bp1);
751 xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
752
753 /*
754 * Set up the new root node.
755 */
756 retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
757 if (retval) {
758 xfs_da_buf_done(bp2);
759 return(retval);
760 }
761 node = bp1->data;
762 leaf = bp2->data;
763 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
764 INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
765 xfs_da_buf_done(bp2);
766 INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
767 INT_SET(node->hdr.count, ARCH_CONVERT, 1);
768 xfs_da_log_buf(args->trans, bp1,
769 XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
770 xfs_da_buf_done(bp1);
771
772 return(retval);
773}
774
775
776/*========================================================================
777 * Routines used for growing the Btree.
778 *========================================================================*/
779
780/*
781 * Create the initial contents of a leaf directory
782 * or a leaf in a node directory.
783 */
784int
785xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
786{
787 xfs_dir_leafblock_t *leaf;
788 xfs_dir_leaf_hdr_t *hdr;
789 xfs_inode_t *dp;
790 xfs_dabuf_t *bp;
791 int retval;
792
793 dp = args->dp;
794 ASSERT(dp != NULL);
795 retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
796 if (retval)
797 return(retval);
798 ASSERT(bp != NULL);
799 leaf = bp->data;
800 memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
801 hdr = &leaf->hdr;
802 INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC);
803 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
804 if (!hdr->firstused)
805 INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
806 INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
807 INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
808
809 xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
810
811 *bpp = bp;
812 return(0);
813}
814
815/*
816 * Split the leaf node, rebalance, then add the new entry.
817 */
818int
819xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
820 xfs_da_state_blk_t *newblk)
821{
822 xfs_dablk_t blkno;
823 xfs_da_args_t *args;
824 int error;
825
826 /*
827 * Allocate space for a new leaf node.
828 */
829 args = state->args;
830 ASSERT(args != NULL);
831 ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
832 error = xfs_da_grow_inode(args, &blkno);
833 if (error)
834 return(error);
835 error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
836 if (error)
837 return(error);
838 newblk->blkno = blkno;
839 newblk->magic = XFS_DIR_LEAF_MAGIC;
840
841 /*
842 * Rebalance the entries across the two leaves.
843 */
844 xfs_dir_leaf_rebalance(state, oldblk, newblk);
845 error = xfs_da_blk_link(state, oldblk, newblk);
846 if (error)
847 return(error);
848
849 /*
850 * Insert the new entry in the correct block.
851 */
852 if (state->inleaf) {
853 error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
854 } else {
855 error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
856 }
857
858 /*
859 * Update last hashval in each block since we added the name.
860 */
861 oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
862 newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
863 return(error);
864}
865
866/*
867 * Add a name to the leaf directory structure.
868 *
869 * Must take into account fragmented leaves and leaves where the freemap has
870 * lost some freespace information (i.e., holes).
871 */
872int
873xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
874{
875 xfs_dir_leafblock_t *leaf;
876 xfs_dir_leaf_hdr_t *hdr;
877 xfs_dir_leaf_map_t *map;
878 int tablesize, entsize, sum, i, tmp, error;
879
880 leaf = bp->data;
881 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
882 ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
883 hdr = &leaf->hdr;
884 entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
885
886 /*
887 * Search through freemap for first-fit on new name length.
888 * (may need to figure in size of entry struct too)
889 */
890 tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
891 + (uint)sizeof(xfs_dir_leaf_hdr_t);
892 map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
893 for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
894 if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
895 sum += INT_GET(map->size, ARCH_CONVERT);
896 continue;
897 }
898 if (!map->size)
899 continue; /* no space in this map */
900 tmp = entsize;
901 if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
902 tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
903 if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
904 if (!args->justcheck)
905 xfs_dir_leaf_add_work(bp, args, index, i);
906 return(0);
907 }
908 sum += INT_GET(map->size, ARCH_CONVERT);
909 }
910
911 /*
912 * If there are no holes in the address space of the block,
913 * and we don't have enough freespace, then compaction will do us
914 * no good and we should just give up.
915 */
916 if (!hdr->holes && (sum < entsize))
917 return(XFS_ERROR(ENOSPC));
918
919 /*
920 * Compact the entries to coalesce free space.
921 * Pass the justcheck flag so the checking pass can return
922 * an error, without changing anything, if it won't fit.
923 */
924 error = xfs_dir_leaf_compact(args->trans, bp,
925 args->total == 0 ?
926 entsize +
927 (uint)sizeof(xfs_dir_leaf_entry_t) : 0,
928 args->justcheck);
929 if (error)
930 return(error);
931 /*
932 * After compaction, the block is guaranteed to have only one
933 * free region, in freemap[0]. If it is not big enough, give up.
934 */
935 if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
936 (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
937 return(XFS_ERROR(ENOSPC));
938
939 if (!args->justcheck)
940 xfs_dir_leaf_add_work(bp, args, index, 0);
941 return(0);
942}
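
So the add path is a first-fit search over the small leaf freemap, with one
compact-and-retry fallback when the map is fragmented or stale (hdr.holes).
Stripped of the table-growth and logging details, the search step reduces to
the sketch below (leaf_first_fit and MAPSIZE stand in for the kernel's
freemap walk and XFS_DIR_LEAF_MAPSIZE; the kernel scans from the high index
down, as above):

struct freemap { int base, size; };

#define MAPSIZE 3       /* stands in for XFS_DIR_LEAF_MAPSIZE */

/* Return the index of a freemap slot that fits `need` bytes, or -1 so the
 * caller can compact the block (coalescing all free space) and retry. */
static int leaf_first_fit(const struct freemap *map, int need)
{
        int i;

        for (i = MAPSIZE - 1; i >= 0; i--)
                if (map[i].size >= need)
                        return i;
        return -1;
}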
943
944/*
945 * Add a name to a leaf directory structure.
946 */
947STATIC void
948xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
949 int mapindex)
950{
951 xfs_dir_leafblock_t *leaf;
952 xfs_dir_leaf_hdr_t *hdr;
953 xfs_dir_leaf_entry_t *entry;
954 xfs_dir_leaf_name_t *namest;
955 xfs_dir_leaf_map_t *map;
956 /* REFERENCED */
957 xfs_mount_t *mp;
958 int tmp, i;
959
960 leaf = bp->data;
961 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
962 hdr = &leaf->hdr;
963 ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
964 ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
965
966 /*
967 * Force open some space in the entry array and fill it in.
968 */
969 entry = &leaf->entries[index];
970 if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
971 tmp = INT_GET(hdr->count, ARCH_CONVERT) - index;
972 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
973 memmove(entry + 1, entry, tmp);
974 xfs_da_log_buf(args->trans, bp,
975 XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
976 }
977 INT_MOD(hdr->count, ARCH_CONVERT, +1);
978
979 /*
980 * Allocate space for the new string (at the end of the run).
981 */
982 map = &hdr->freemap[mapindex];
983 mp = args->trans->t_mountp;
984 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
985 ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
986 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
987 INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
988 INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
989 INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
990 entry->namelen = args->namelen;
991 xfs_da_log_buf(args->trans, bp,
992 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
993
994 /*
995 * Copy the string and inode number into the new space.
996 */
997 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
998 XFS_DIR_SF_PUT_DIRINO(&args->inumber, &namest->inumber);
999 memcpy(namest->name, args->name, args->namelen);
1000 xfs_da_log_buf(args->trans, bp,
1001 XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
1002
1003 /*
1004 * Update the control info for this leaf node
1005 */
1006 if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
1007 INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
1008 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
1009 tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
1010 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1011 map = &hdr->freemap[0];
1012 for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
1013 if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
1014 INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
1015 INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
1016 }
1017 }
1018 INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
1019 xfs_da_log_buf(args->trans, bp,
1020 XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1021}
1022
1023/*
1024 * Garbage collect a leaf directory block by copying it to a new buffer.
1025 */
1026STATIC int
1027xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
1028 int justcheck)
1029{
1030 xfs_dir_leafblock_t *leaf_s, *leaf_d;
1031 xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
1032 xfs_mount_t *mp;
1033 char *tmpbuffer;
1034 char *tmpbuffer2=NULL;
1035 int rval;
1036 int lbsize;
1037
1038 mp = trans->t_mountp;
1039 lbsize = XFS_LBSIZE(mp);
1040 tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
1041 ASSERT(tmpbuffer != NULL);
1042 memcpy(tmpbuffer, bp->data, lbsize);
1043
1044 /*
1045 * Make a second copy in case xfs_dir_leaf_moveents()
1046 * below destroys the original.
1047 */
1048 if (musthave || justcheck) {
1049 tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
1050 memcpy(tmpbuffer2, bp->data, lbsize);
1051 }
1052 memset(bp->data, 0, lbsize);
1053
1054 /*
1055 * Copy basic information
1056 */
1057 leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
1058 leaf_d = bp->data;
1059 hdr_s = &leaf_s->hdr;
1060 hdr_d = &leaf_d->hdr;
1061 hdr_d->info = hdr_s->info; /* struct copy */
1062 INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
1063 if (!hdr_d->firstused)
1064 INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
1065 hdr_d->namebytes = 0;
1066 hdr_d->count = 0;
1067 hdr_d->holes = 0;
1068 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
1069 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1070
1071 /*
1072 * Copy all entry's in the same (sorted) order,
1073 * but allocate filenames packed and in sequence.
1074 * This changes the source (leaf_s) as well.
1075 */
1076 xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
1077
1078 if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
1079 rval = XFS_ERROR(ENOSPC);
1080 else
1081 rval = 0;
1082
1083 if (justcheck || rval == ENOSPC) {
1084 ASSERT(tmpbuffer2);
1085 memcpy(bp->data, tmpbuffer2, lbsize);
1086 } else {
1087 xfs_da_log_buf(trans, bp, 0, lbsize - 1);
1088 }
1089
1090 kmem_free(tmpbuffer, lbsize);
1091 if (musthave || justcheck)
1092 kmem_free(tmpbuffer2, lbsize);
1093 return(rval);
1094}
1095
1096/*
1097 * Redistribute the directory entries between two leaf nodes,
1098 * taking into account the size of the new entry.
1099 *
1100 * NOTE: if new block is empty, then it will get the upper half of old block.
1101 */
1102STATIC void
1103xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
1104 xfs_da_state_blk_t *blk2)
1105{
1106 xfs_da_state_blk_t *tmp_blk;
1107 xfs_dir_leafblock_t *leaf1, *leaf2;
1108 xfs_dir_leaf_hdr_t *hdr1, *hdr2;
1109 int count, totallen, max, space, swap;
1110
1111 /*
1112 * Set up environment.
1113 */
1114 ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
1115 ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
1116 leaf1 = blk1->bp->data;
1117 leaf2 = blk2->bp->data;
1118 ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1119 ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1120
1121 /*
1122 * Check ordering of blocks, reverse if it makes things simpler.
1123 */
1124 swap = 0;
1125 if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
1126 tmp_blk = blk1;
1127 blk1 = blk2;
1128 blk2 = tmp_blk;
1129 leaf1 = blk1->bp->data;
1130 leaf2 = blk2->bp->data;
1131 swap = 1;
1132 }
1133 hdr1 = &leaf1->hdr;
1134 hdr2 = &leaf2->hdr;
1135
1136 /*
1137 * Examine entries until we reduce the absolute difference in
1138 * byte usage between the two blocks to a minimum. Then get
1139 * the direction to copy and the number of elements to move.
1140 */
1141 state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
1142 &count, &totallen);
1143 if (swap)
1144 state->inleaf = !state->inleaf;
1145
1146 /*
1147 * Move any entries required from leaf to leaf:
1148 */
1149 if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
1150 /*
1151 * Figure the total bytes to be added to the destination leaf.
1152 */
1153 count = INT_GET(hdr1->count, ARCH_CONVERT) - count; /* number entries being moved */
1154 space = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
1155 space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
1156 space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
1157
1158 /*
1159 * leaf2 is the destination, compact it if it looks tight.
1160 */
1161 max = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
1162 max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1163 if (space > max) {
1164 xfs_dir_leaf_compact(state->args->trans, blk2->bp,
1165 0, 0);
1166 }
1167
1168 /*
1169 * Move high entries from leaf1 to low end of leaf2.
1170 */
1171 xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
1172 leaf2, 0, count, state->mp);
1173
1174 xfs_da_log_buf(state->args->trans, blk1->bp, 0,
1175 state->blocksize-1);
1176 xfs_da_log_buf(state->args->trans, blk2->bp, 0,
1177 state->blocksize-1);
1178
1179 } else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
1180 /*
1181 * Figure the total bytes to be added to the destination leaf.
1182 */
1183 count -= INT_GET(hdr1->count, ARCH_CONVERT); /* number entries being moved */
1184 space = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
1185 space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
1186 space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
1187
1188 /*
1189 * leaf1 is the destination, compact it if it looks tight.
1190 */
1191 max = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
1192 max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1193 if (space > max) {
1194 xfs_dir_leaf_compact(state->args->trans, blk1->bp,
1195 0, 0);
1196 }
1197
1198 /*
1199 * Move low entries from leaf2 to high end of leaf1.
1200 */
1201 xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
1202 count, state->mp);
1203
1204 xfs_da_log_buf(state->args->trans, blk1->bp, 0,
1205 state->blocksize-1);
1206 xfs_da_log_buf(state->args->trans, blk2->bp, 0,
1207 state->blocksize-1);
1208 }
1209
1210 /*
1211 * Copy out last hashval in each block for B-tree code.
1212 */
1213 blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1214 blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1215
1216 /*
1217 * Adjust the expected index for insertion.
1218 * GROT: this doesn't work unless blk2 was originally empty.
1219 */
1220 if (!state->inleaf) {
1221 blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
1222 }
1223}
1224
1225/*
1226 * Examine entries until we reduce the absolute difference in
1227 * byte usage between the two blocks to a minimum.
1228 * GROT: Is this really necessary? With other than a 512 byte blocksize,
1229 * GROT: there will always be enough room in either block for a new entry.
1230 * GROT: Do a double-split for this case?
1231 */
1232STATIC int
1233xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
1234 xfs_da_state_blk_t *blk1,
1235 xfs_da_state_blk_t *blk2,
1236 int *countarg, int *namebytesarg)
1237{
1238 xfs_dir_leafblock_t *leaf1, *leaf2;
1239 xfs_dir_leaf_hdr_t *hdr1, *hdr2;
1240 xfs_dir_leaf_entry_t *entry;
1241 int count, max, totallen, half;
1242 int lastdelta, foundit, tmp;
1243
1244 /*
1245 * Set up environment.
1246 */
1247 leaf1 = blk1->bp->data;
1248 leaf2 = blk2->bp->data;
1249 hdr1 = &leaf1->hdr;
1250 hdr2 = &leaf2->hdr;
1251 foundit = 0;
1252 totallen = 0;
1253
1254 /*
1255 * Examine entries until we reduce the absolute difference in
1256 * byte usage between the two blocks to a minimum.
1257 */
1258 max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
1259 half = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
1260 half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
1261 half /= 2;
1262 lastdelta = state->blocksize;
1263 entry = &leaf1->entries[0];
1264 for (count = 0; count < max; entry++, count++) {
1265
1266#define XFS_DIR_ABS(A) (((A) < 0) ? -(A) : (A))
1267 /*
1268 * The new entry is in the first block, account for it.
1269 */
1270 if (count == blk1->index) {
1271 tmp = totallen + (uint)sizeof(*entry)
1272 + XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
1273 if (XFS_DIR_ABS(half - tmp) > lastdelta)
1274 break;
1275 lastdelta = XFS_DIR_ABS(half - tmp);
1276 totallen = tmp;
1277 foundit = 1;
1278 }
1279
1280 /*
1281 * Wrap around into the second block if necessary.
1282 */
1283 if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
1284 leaf1 = leaf2;
1285 entry = &leaf1->entries[0];
1286 }
1287
1288 /*
1289 * Figure out if next leaf entry would be too much.
1290 */
1291 tmp = totallen + (uint)sizeof(*entry)
1292 + XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
1293 if (XFS_DIR_ABS(half - tmp) > lastdelta)
1294 break;
1295 lastdelta = XFS_DIR_ABS(half - tmp);
1296 totallen = tmp;
1297#undef XFS_DIR_ABS
1298 }
1299
1300 /*
1301 * Calculate the number of namebytes that will end up in lower block.
1302 * If the new entry was counted in, back it out: it is not stored yet.
1303 */
1304 totallen -=
1305 count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
1306 if (foundit) {
1307 totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
1308 state->args->namelen;
1309 }
1310
1311 *countarg = count;
1312 *namebytesarg = totallen;
1313 return(foundit);
1314}
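
The scan above is a one-dimensional minimization over a monotone prefix sum:
keep extending the lower block while |half - total| shrinks and stop as soon
as it grows, which is exactly the lastdelta test. The same control flow on a
plain array of entry sizes (balance_point is a hypothetical helper, without
the new-entry and block-wrap bookkeeping):

/* How many leading items go to block 1 for the most even byte split. */
static int balance_point(const int *size, int n)
{
        int i, half = 0, total = 0, lastdelta;

        for (i = 0; i < n; i++)
                half += size[i];
        half /= 2;

        lastdelta = half;       /* delta for the empty prefix */
        for (i = 0; i < n; i++) {
                int tmp = total + size[i];
                int delta = tmp > half ? tmp - half : half - tmp;

                if (delta > lastdelta)
                        break;  /* the split just got worse: stop */
                lastdelta = delta;
                total = tmp;
        }
        return i;               /* items kept in the lower block */
}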
1315
1316/*========================================================================
1317 * Routines used for shrinking the Btree.
1318 *========================================================================*/
1319
1320/*
1321 * Check a leaf block and its neighbors to see if the block should be
1322 * collapsed into one or the other neighbor. Always keep the block
1323 * with the smaller block number.
1324 * If the current block is over 50% full, don't try to join it; return 0.
1325 * If the block is empty, fill in the state structure and return 2.
1326 * If it can be collapsed, fill in the state structure and return 1.
1327 * If nothing can be done, return 0.
1328 */
1329int
1330xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
1331{
1332 xfs_dir_leafblock_t *leaf;
1333 xfs_da_state_blk_t *blk;
1334 xfs_da_blkinfo_t *info;
1335 int count, bytes, forward, error, retval, i;
1336 xfs_dablk_t blkno;
1337 xfs_dabuf_t *bp;
1338
1339 /*
1340 * Check for the degenerate case of the block being over 50% full.
1341 * If so, it's not worth even looking to see if we might be able
1342 * to coalesce with a sibling.
1343 */
1344 blk = &state->path.blk[ state->path.active-1 ];
1345 info = blk->bp->data;
1346 ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1347 leaf = (xfs_dir_leafblock_t *)info;
1348 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1349 bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
1350 count * (uint)sizeof(xfs_dir_leaf_entry_t) +
1351 count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
1352 INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1353 if (bytes > (state->blocksize >> 1)) {
1354 *action = 0; /* blk over 50%, don't try to join */
1355 return(0);
1356 }
1357
1358 /*
1359 * Check for the degenerate case of the block being empty.
1360 * If the block is empty, we'll simply delete it, no need to
1361 * coalesce it with a sibling block. We choose (arbitrarily)
1362 * to merge with the forward block unless it is NULL.
1363 */
1364 if (count == 0) {
1365 /*
1366 * Make altpath point to the block we want to keep and
1367 * path point to the block we want to drop (this one).
1368 */
1369 forward = info->forw;
1370 memcpy(&state->altpath, &state->path, sizeof(state->path));
1371 error = xfs_da_path_shift(state, &state->altpath, forward,
1372 0, &retval);
1373 if (error)
1374 return(error);
1375 if (retval) {
1376 *action = 0;
1377 } else {
1378 *action = 2;
1379 }
1380 return(0);
1381 }
1382
1383 /*
1384 * Examine each sibling block to see if we can coalesce with
1385 * at least 25% free space to spare. We need to figure out
1386 * whether to merge with the forward or the backward block.
1387 * We prefer coalescing with the lower numbered sibling so as
1388 * to shrink a directory over time.
1389 */
1390 forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT)); /* start with smaller blk num */
1391 for (i = 0; i < 2; forward = !forward, i++) {
1392 if (forward)
1393 blkno = INT_GET(info->forw, ARCH_CONVERT);
1394 else
1395 blkno = INT_GET(info->back, ARCH_CONVERT);
1396 if (blkno == 0)
1397 continue;
1398 error = xfs_da_read_buf(state->args->trans, state->args->dp,
1399 blkno, -1, &bp,
1400 XFS_DATA_FORK);
1401 if (error)
1402 return(error);
1403 ASSERT(bp != NULL);
1404
1405 leaf = (xfs_dir_leafblock_t *)info;
1406 count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1407 bytes = state->blocksize - (state->blocksize>>2);
1408 bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1409 leaf = bp->data;
1410 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1411 count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
1412 bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1413 bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
1414 bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
1415 bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
1416 if (bytes >= 0)
1417 break; /* fits with at least 25% to spare */
1418
1419 xfs_da_brelse(state->args->trans, bp);
1420 }
1421 if (i >= 2) {
1422 *action = 0;
1423 return(0);
1424 }
1425 xfs_da_buf_done(bp);
1426
1427 /*
1428 * Make altpath point to the block we want to keep (the lower
1429 * numbered block) and path point to the block we want to drop.
1430 */
1431 memcpy(&state->altpath, &state->path, sizeof(state->path));
1432 if (blkno < blk->blkno) {
1433 error = xfs_da_path_shift(state, &state->altpath, forward,
1434 0, &retval);
1435 } else {
1436 error = xfs_da_path_shift(state, &state->path, forward,
1437 0, &retval);
1438 }
1439 if (error)
1440 return(error);
1441 if (retval) {
1442 *action = 0;
1443 } else {
1444 *action = 1;
1445 }
1446 return(0);
1447}
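
/*
 * A minimal caller sketch (illustrative only; in this tree the generic
 * da-btree join code is the real caller), acting on the action code
 * filled in above:
 *
 *	error = xfs_dir_leaf_toosmall(state, &action);
 *	if (error)
 *		return error;
 *	if (action == 2)
 *		... block is empty: just delete it ...
 *	else if (action == 1)
 *		... join with the sibling recorded in state->altpath ...
 */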
1448
1449/*
1450 * Remove a name from the leaf directory structure.
1451 *
1452 * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
1453 * If two leaves are 37% full, when combined they will leave 25% free.
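 * (Arithmetic check: two leaves each 37% full combine to at most 74%
 * usage, leaving at least 25% free, which is the "25% to spare" margin
 * that xfs_dir_leaf_toosmall() requires before joining.)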
1454 */
1455int
1456xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
1457{
1458 xfs_dir_leafblock_t *leaf;
1459 xfs_dir_leaf_hdr_t *hdr;
1460 xfs_dir_leaf_map_t *map;
1461 xfs_dir_leaf_entry_t *entry;
1462 xfs_dir_leaf_name_t *namest;
1463 int before, after, smallest, entsize;
1464 int tablesize, tmp, i;
1465 xfs_mount_t *mp;
1466
1467 leaf = bp->data;
1468 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1469 hdr = &leaf->hdr;
1470 mp = trans->t_mountp;
1471 ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1472 ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
1473 ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
1474 entry = &leaf->entries[index];
1475 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
1476 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1477
1478 /*
1479 * Scan through free region table:
1480 * check for adjacency of free'd entry with an existing one,
1481 * find smallest free region in case we need to replace it,
1482 * adjust any map that borders the entry table,
1483 */
1484 tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
1485 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1486 map = &hdr->freemap[0];
1487 tmp = INT_GET(map->size, ARCH_CONVERT);
1488 before = after = -1;
1489 smallest = XFS_DIR_LEAF_MAPSIZE - 1;
1490 entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
1491 for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
1492 ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
1493 ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
1494 if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
1495 INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
1496 INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
1497 }
1498
1499 if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
1500 before = i;
1501 } else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
1502 after = i;
1503 } else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
1504 tmp = INT_GET(map->size, ARCH_CONVERT);
1505 smallest = i;
1506 }
1507 }
1508
1509 /*
1510 * Coalesce adjacent freemap regions,
1511 * or replace the smallest region.
1512 */
1513 if ((before >= 0) || (after >= 0)) {
1514 if ((before >= 0) && (after >= 0)) {
1515 map = &hdr->freemap[before];
1516 INT_MOD(map->size, ARCH_CONVERT, entsize);
1517 INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
1518 hdr->freemap[after].base = 0;
1519 hdr->freemap[after].size = 0;
1520 } else if (before >= 0) {
1521 map = &hdr->freemap[before];
1522 INT_MOD(map->size, ARCH_CONVERT, entsize);
1523 } else {
1524 map = &hdr->freemap[after];
1525 INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
1526 INT_MOD(map->size, ARCH_CONVERT, entsize);
1527 }
1528 } else {
1529 /*
1530 * Replace smallest region (if it is smaller than free'd entry)
1531 */
1532 map = &hdr->freemap[smallest];
1533 if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
1534 INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
1535 INT_SET(map->size, ARCH_CONVERT, entsize);
1536 }
1537 }
1538
1539 /*
1540 * Did we remove the first entry?
1541 */
1542 if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
1543 smallest = 1;
1544 else
1545 smallest = 0;
1546
1547 /*
1548 * Compress the remaining entries and zero out the removed stuff.
1549 */
1550 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1551 memset((char *)namest, 0, entsize);
1552 xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
1553
1554 INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
1555 tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
1556 memmove(entry, entry + 1, tmp);
1557 INT_MOD(hdr->count, ARCH_CONVERT, -1);
1558 xfs_da_log_buf(trans, bp,
1559 XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
1560 entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
1561 memset((char *)entry, 0, sizeof(xfs_dir_leaf_entry_t));
1562
1563 /*
1564 * If we removed the first entry, re-find the first used byte
1565 * in the name area. Note that if the entry was the "firstused",
1566 * then we don't have a "hole" in our block resulting from
1567 * removing the name.
1568 */
1569 if (smallest) {
1570 tmp = XFS_LBSIZE(mp);
1571 entry = &leaf->entries[0];
1572 for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
1573 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
1574 ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
1575 if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
1576 tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
1577 }
1578 INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
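		/*
		 * The raw test below is safe: zero is zero in either byte
		 * order. firstused == 0 is reserved, so when an empty 64KB
		 * block wraps the 16-bit field to zero, back off by one.
		 */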
1579 if (!hdr->firstused)
1580 INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
1581 } else {
1582 hdr->holes = 1; /* mark as needing compaction */
1583 }
1584
1585 xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
1586
1587 /*
1588	 * Check if leaf is less than 37% full (mp->m_dir_magicpct); if so,
1589	 * the caller may want to "join" the leaf with a sibling.
1590 */
1591 tmp = (uint)sizeof(xfs_dir_leaf_hdr_t);
1592 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
1593 tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
1594 tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
1595 if (tmp < mp->m_dir_magicpct)
1596 return(1); /* leaf is < 37% full */
1597 return(0);
1598}
1599
1600/*
1601 * Move all the directory entries from drop_leaf into save_leaf.
1602 */
1603void
1604xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
1605 xfs_da_state_blk_t *save_blk)
1606{
1607 xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
1608 xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
1609 xfs_mount_t *mp;
1610 char *tmpbuffer;
1611
1612 /*
1613 * Set up environment.
1614 */
1615 mp = state->mp;
1616 ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
1617 ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
1618 drop_leaf = drop_blk->bp->data;
1619 save_leaf = save_blk->bp->data;
1620 ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1621 ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1622 drop_hdr = &drop_leaf->hdr;
1623 save_hdr = &save_leaf->hdr;
1624
1625 /*
1626 * Save last hashval from dying block for later Btree fixup.
1627 */
1628	drop_blk->hashval = INT_GET(drop_leaf->entries[ INT_GET(drop_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1629
1630 /*
1631 * Check if we need a temp buffer, or can we do it in place.
1632 * Note that we don't check "leaf" for holes because we will
1633	 * always be dropping it; toosmall() decided that for us already.
1634 */
1635 if (save_hdr->holes == 0) {
1636 /*
1637 * dest leaf has no holes, so we add there. May need
1638 * to make some room in the entry array.
1639 */
1640 if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
1641 xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
1642 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1643 } else {
1644 xfs_dir_leaf_moveents(drop_leaf, 0,
1645 save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
1646 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1647 }
1648 } else {
1649 /*
1650 * Destination has holes, so we make a temporary copy
1651 * of the leaf and add them both to that.
1652 */
1653 tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
1654 ASSERT(tmpbuffer != NULL);
1655 memset(tmpbuffer, 0, state->blocksize);
1656 tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
1657 tmp_hdr = &tmp_leaf->hdr;
1658 tmp_hdr->info = save_hdr->info; /* struct copy */
1659 tmp_hdr->count = 0;
1660 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
1661 if (!tmp_hdr->firstused)
1662 INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
1663 tmp_hdr->namebytes = 0;
1664 if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
1665 xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
1666 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1667 xfs_dir_leaf_moveents(save_leaf, 0,
1668 tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1669 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
1670 } else {
1671 xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
1672 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
1673 xfs_dir_leaf_moveents(drop_leaf, 0,
1674 tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
1675 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
1676 }
1677 memcpy(save_leaf, tmp_leaf, state->blocksize);
1678 kmem_free(tmpbuffer, state->blocksize);
1679 }
1680
1681 xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
1682 state->blocksize - 1);
1683
1684 /*
1685 * Copy out last hashval in each block for B-tree code.
1686 */
1687 save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
1688}
1689
1690/*========================================================================
1691 * Routines used for finding things in the Btree.
1692 *========================================================================*/
1693
1694/*
1695 * Look up a name in a leaf directory structure.
1696 * This is the internal routine, it uses the caller's buffer.
1697 *
1698 * Note that duplicate keys are allowed, but only check within the
1699 * current leaf node. The Btree code must check in adjacent leaf nodes.
1700 *
1701 * Return in *index the index into the entry[] array of either the found
1702 * entry, or where the entry should have been (insert before that entry).
1703 *
1704 * Don't change the args->inumber unless we find the filename.
1705 */
1706int
1707xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
1708{
1709 xfs_dir_leafblock_t *leaf;
1710 xfs_dir_leaf_entry_t *entry;
1711 xfs_dir_leaf_name_t *namest;
1712 int probe, span;
1713 xfs_dahash_t hashval;
1714
1715 leaf = bp->data;
1716 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1717 ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
1718
1719 /*
1720 * Binary search. (note: small blocks will skip this loop)
1721 */
1722 hashval = args->hashval;
1723 probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
1724 for (entry = &leaf->entries[probe]; span > 4;
1725 entry = &leaf->entries[probe]) {
1726 span /= 2;
1727 if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
1728 probe += span;
1729 else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
1730 probe -= span;
1731 else
1732 break;
1733 }
1734 ASSERT((probe >= 0) && \
1735 ((!leaf->hdr.count) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
1736 ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
1737
1738 /*
1739 * Since we may have duplicate hashval's, find the first matching
1740 * hashval in the leaf.
1741 */
1742 while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
1743 entry--;
1744 probe--;
1745 }
1746 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
1747 entry++;
1748 probe++;
1749 }
1750 if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
1751 *index = probe;
1752 ASSERT(args->oknoent);
1753 return(XFS_ERROR(ENOENT));
1754 }
1755
1756 /*
1757 * Duplicate keys may be present, so search all of them for a match.
1758 */
1759 while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
1760 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
1761 if (entry->namelen == args->namelen &&
1762 namest->name[0] == args->name[0] &&
1763 memcmp(args->name, namest->name, args->namelen) == 0) {
1764 XFS_DIR_SF_GET_DIRINO(&namest->inumber, &args->inumber);
1765 *index = probe;
1766 return(XFS_ERROR(EEXIST));
1767 }
1768 entry++;
1769 probe++;
1770 }
1771 *index = probe;
1772 ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
1773 return(XFS_ERROR(ENOENT));
1774}
1775
1776/*========================================================================
1777 * Utility routines.
1778 *========================================================================*/
1779
1780/*
1781 * Move the indicated entries from one leaf to another.
1782 * NOTE: this routine modifies both source and destination leaves.
1783 */
1784/* ARGSUSED */
1785STATIC void
1786xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
1787 xfs_dir_leafblock_t *leaf_d, int start_d,
1788 int count, xfs_mount_t *mp)
1789{
1790 xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
1791 xfs_dir_leaf_entry_t *entry_s, *entry_d;
1792 int tmp, i;
1793
1794 /*
1795 * Check for nothing to do.
1796 */
1797 if (count == 0)
1798 return;
1799
1800 /*
1801 * Set up environment.
1802 */
1803 ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1804 ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1805 hdr_s = &leaf_s->hdr;
1806 hdr_d = &leaf_d->hdr;
1807 ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
1808 ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
1809 ((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
1810 ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
1811 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
1812 ((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
1813
1814 ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
1815 ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
1816 ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
1817
1818 /*
1819 * Move the entries in the destination leaf up to make a hole?
1820 */
1821 if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
1822 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
1823 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
1824 entry_s = &leaf_d->entries[start_d];
1825 entry_d = &leaf_d->entries[start_d + count];
1826		memmove(entry_d, entry_s, tmp);	/* regions may overlap */
1827 }
1828
1829 /*
1830	 * Copy all entries in the same (sorted) order,
1831 * but allocate filenames packed and in sequence.
1832 */
1833 entry_s = &leaf_s->entries[start_s];
1834 entry_d = &leaf_d->entries[start_d];
1835 for (i = 0; i < count; entry_s++, entry_d++, i++) {
1836 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
1837 tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
1838 INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
1839 entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
1840 INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
1841 entry_d->namelen = entry_s->namelen;
1842 ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
1843 memcpy(XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)),
1844 XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)), tmp);
1845 ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
1846 memset((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
1847 0, tmp);
1848 INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
1849 INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
1850 INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
1851 INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
1852 tmp = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
1853 + (uint)sizeof(xfs_dir_leaf_hdr_t);
1854 ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
1855
1856 }
1857
1858 /*
1859 * Zero out the entries we just copied.
1860 */
1861 if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
1862 tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
1863 entry_s = &leaf_s->entries[start_s];
1864 ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
1865 memset((char *)entry_s, 0, tmp);
1866 } else {
1867 /*
1868 * Move the remaining entries down to fill the hole,
1869 * then zero the entries at the top.
1870 */
1871 tmp = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
1872 tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
1873 entry_s = &leaf_s->entries[start_s + count];
1874 entry_d = &leaf_s->entries[start_s];
1875		memmove(entry_d, entry_s, tmp);	/* regions may overlap */
1876
1877 tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
1878 entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
1879 ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
1880 memset((char *)entry_s, 0, tmp);
1881 }
1882
1883 /*
1884 * Fill in the freemap information
1885 */
1886 INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
1887 INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
1888 INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
1889 INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, (hdr_d->freemap[2].base = 0));
1890 INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, (hdr_d->freemap[2].size = 0));
1891 hdr_s->holes = 1; /* leaf may not be compact */
1892}
1893
1894/*
1895 * Compare two leaf blocks' "order": return 1 if leaf2 should come before leaf1.
1896 */
1897int
1898xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
1899{
1900 xfs_dir_leafblock_t *leaf1, *leaf2;
1901
1902 leaf1 = leaf1_bp->data;
1903 leaf2 = leaf2_bp->data;
1904 ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) &&
1905 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC));
1906 if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
1907 ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
1908 INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
1909 (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
1910 INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
1911 return(1);
1912 }
1913 return(0);
1914}
1915
1916/*
1917 * Pick up the last hashvalue from a leaf block.
1918 */
1919xfs_dahash_t
1920xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
1921{
1922 xfs_dir_leafblock_t *leaf;
1923
1924 leaf = bp->data;
1925 ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
1926 if (count)
1927 *count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
1928 if (!leaf->hdr.count)
1929 return(0);
1930 return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
1931}
1932
1933/*
1934 * Copy out directory entries for getdents(), for leaf directories.
1935 */
1936int
1937xfs_dir_leaf_getdents_int(
1938 xfs_dabuf_t *bp,
1939 xfs_inode_t *dp,
1940 xfs_dablk_t bno,
1941 uio_t *uio,
1942 int *eobp,
1943 xfs_dirent_t *dbp,
1944 xfs_dir_put_t put,
1945 xfs_daddr_t nextda)
1946{
1947 xfs_dir_leafblock_t *leaf;
1948 xfs_dir_leaf_entry_t *entry;
1949 xfs_dir_leaf_name_t *namest;
1950 int entno, want_entno, i, nextentno;
1951 xfs_mount_t *mp;
1952 xfs_dahash_t cookhash;
1953 xfs_dahash_t nexthash = 0;
1954#if (BITS_PER_LONG == 32)
1955 xfs_dahash_t lasthash = XFS_DA_MAXHASH;
1956#endif
1957 xfs_dir_put_args_t p;
1958
1959 mp = dp->i_mount;
1960 leaf = bp->data;
1961 if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
1962 *eobp = 1;
1963 return(XFS_ERROR(ENOENT)); /* XXX wrong code */
1964 }
1965
1966 want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
1967
1968 cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
1969
1970 xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
1971
1972 /*
1973 * Re-find our place.
1974 */
1975 for (i = entno = 0, entry = &leaf->entries[0];
1976 i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
1977 entry++, i++) {
1978
1979 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
1980 INT_GET(entry->nameidx, ARCH_CONVERT));
1981
1982 if (unlikely(
1983 ((char *)namest < (char *)leaf) ||
1984 ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
1985 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(1)",
1986 XFS_ERRLEVEL_LOW, mp, leaf);
1987 xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
1988 return XFS_ERROR(EFSCORRUPTED);
1989 }
1990 if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
1991 if ( entno < want_entno
1992 && INT_GET(entry->hashval, ARCH_CONVERT)
1993 == cookhash) {
1994 /*
1995 * Trying to get to a particular offset in a
1996 * run of equal-hashval entries.
1997 */
1998 entno++;
1999 } else if ( want_entno > 0
2000 && entno == want_entno
2001 && INT_GET(entry->hashval, ARCH_CONVERT)
2002 == cookhash) {
2003 break;
2004 } else {
2005 entno = 0;
2006 break;
2007 }
2008 }
2009 }
2010
2011 if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
2012 xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
2013 if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))
2014 uio->uio_offset =
2015 XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
2016 /*
2017 * Don't set uio_offset if there's another block:
2018 * the node code will be setting uio_offset anyway.
2019 */
2020 *eobp = 0;
2021 return(0);
2022 }
2023 xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
2024
2025 p.dbp = dbp;
2026 p.put = put;
2027 p.uio = uio;
2028
2029 /*
2030 * We're synchronized, start copying entries out to the user.
2031 */
2032 for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
2033 entry++, i++, (entno = nextentno)) {
2034 int lastresid=0, retval;
2035 xfs_dircook_t lastoffset;
2036 xfs_dahash_t thishash;
2037
2038 /*
2039 * Check for a damaged directory leaf block and pick up
2040 * the inode number from this entry.
2041 */
2042 namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
2043 INT_GET(entry->nameidx, ARCH_CONVERT));
2044
2045 if (unlikely(
2046 ((char *)namest < (char *)leaf) ||
2047 ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)))) {
2048 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(2)",
2049 XFS_ERRLEVEL_LOW, mp, leaf);
2050 xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
2051 return XFS_ERROR(EFSCORRUPTED);
2052 }
2053
2054 xfs_dir_trace_g_duc("leaf: middle cookie ",
2055 dp, uio, p.cook.o);
2056
2057 if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
2058 nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
2059
2060 if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
2061 nextentno = entno + 1;
2062 else
2063 nextentno = 0;
2064 XFS_PUT_COOKIE(p.cook, mp, bno, nextentno, nexthash);
2065 xfs_dir_trace_g_duc("leaf: middle cookie ",
2066 dp, uio, p.cook.o);
2067
2068 } else if ((thishash = INT_GET(leaf->hdr.info.forw,
2069 ARCH_CONVERT))) {
2070 xfs_dabuf_t *bp2;
2071 xfs_dir_leafblock_t *leaf2;
2072
2073 ASSERT(nextda != -1);
2074
2075 retval = xfs_da_read_buf(dp->i_transp, dp, thishash,
2076 nextda, &bp2, XFS_DATA_FORK);
2077 if (retval)
2078 return(retval);
2079
2080 ASSERT(bp2 != NULL);
2081
2082 leaf2 = bp2->data;
2083
2084 if (unlikely(
2085 (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
2086 != XFS_DIR_LEAF_MAGIC)
2087 || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT)
2088 != bno))) { /* GROT */
2089 XFS_CORRUPTION_ERROR("xfs_dir_leaf_getdents_int(3)",
2090 XFS_ERRLEVEL_LOW, mp,
2091 leaf2);
2092 xfs_da_brelse(dp->i_transp, bp2);
2093
2094 return(XFS_ERROR(EFSCORRUPTED));
2095 }
2096
2097 nexthash = INT_GET(leaf2->entries[0].hashval,
2098 ARCH_CONVERT);
2099 nextentno = -1;
2100 XFS_PUT_COOKIE(p.cook, mp, thishash, 0, nexthash);
2101 xfs_da_brelse(dp->i_transp, bp2);
2102 xfs_dir_trace_g_duc("leaf: next blk cookie",
2103 dp, uio, p.cook.o);
2104 } else {
2105 nextentno = -1;
2106 XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
2107 }
2108
2109 /*
2110 * Save off the cookie so we can fall back should the
2111		 * 'put' into the outgoing buffer fail. To handle a run
2112 * of equal-hashvals, the off_t structure on 64bit
2113 * builds has entno built into the cookie to ID the
2114 * entry. On 32bit builds, we only have space for the
2115 * hashval so we can't ID specific entries within a group
2116 * of same hashval entries. For this, lastoffset is set
2117 * to the first in the run of equal hashvals so we don't
2118 * include any entries unless we can include all entries
2119 * that share the same hashval. Hopefully the buffer
2120 * provided is big enough to handle it (see pv763517).
2121 */
2122#if (BITS_PER_LONG == 32)
2123 if ((thishash = INT_GET(entry->hashval, ARCH_CONVERT))
2124 != lasthash) {
2125 XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
2126 lastresid = uio->uio_resid;
2127 lasthash = thishash;
2128 } else {
2129 xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
2130 dp, uio, p.cook.o);
2131 }
2132#else
2133 thishash = INT_GET(entry->hashval, ARCH_CONVERT);
2134 XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
2135 lastresid = uio->uio_resid;
2136#endif /* BITS_PER_LONG == 32 */
2137
2138 /*
2139 * Put the current entry into the outgoing buffer. If we fail
2140 * then restore the UIO to the first entry in the current
2141		 * run of equal-hashval entries (probably only 1 entry long).
2142 */
2143 p.ino = XFS_GET_DIR_INO8(namest->inumber);
2144#if XFS_BIG_INUMS
2145 p.ino += mp->m_inoadd;
2146#endif
2147 p.name = (char *)namest->name;
2148 p.namelen = entry->namelen;
2149
2150 retval = p.put(&p);
2151
2152 if (!p.done) {
2153 uio->uio_offset = lastoffset.o;
2154 uio->uio_resid = lastresid;
2155
2156 *eobp = 1;
2157
2158 xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
2159
2160 return(retval);
2161 }
2162 }
2163
2164 uio->uio_offset = p.cook.o;
2165
2166 *eobp = 0;
2167
2168 xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
2169
2170 return(0);
2171}
2172
2173/*
2174 * Format a dirent64 structure and copy it out to the user's buffer.
2175 */
2176int
2177xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
2178{
2179 iovec_t *iovp;
2180 int reclen, namelen;
2181 xfs_dirent_t *idbp;
2182 uio_t *uio;
2183
2184 namelen = pa->namelen;
2185 reclen = DIRENTSIZE(namelen);
2186 uio = pa->uio;
2187 if (reclen > uio->uio_resid) {
2188 pa->done = 0;
2189 return 0;
2190 }
2191 iovp = uio->uio_iov;
2192 idbp = (xfs_dirent_t *)iovp->iov_base;
2193 iovp->iov_base = (char *)idbp + reclen;
2194 iovp->iov_len -= reclen;
2195 uio->uio_resid -= reclen;
2196 idbp->d_reclen = reclen;
2197 idbp->d_ino = pa->ino;
2198 idbp->d_off = pa->cook.o;
2199 idbp->d_name[namelen] = '\0';
2200 pa->done = 1;
2201 memcpy(idbp->d_name, pa->name, namelen);
2202 return 0;
2203}
2204
2205/*
2206 * Format a dirent64 structure and copy it out to the user's buffer.
2207 */
2208int
2209xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
2210{
2211 int retval, reclen, namelen;
2212 xfs_dirent_t *idbp;
2213 uio_t *uio;
2214
2215 namelen = pa->namelen;
2216 reclen = DIRENTSIZE(namelen);
2217 uio = pa->uio;
2218 if (reclen > uio->uio_resid) {
2219 pa->done = 0;
2220 return 0;
2221 }
2222 idbp = pa->dbp;
2223 idbp->d_reclen = reclen;
2224 idbp->d_ino = pa->ino;
2225 idbp->d_off = pa->cook.o;
2226 idbp->d_name[namelen] = '\0';
2227 memcpy(idbp->d_name, pa->name, namelen);
2228 retval = uio_read((caddr_t)idbp, reclen, uio);
2229 pa->done = (retval == 0);
2230 return retval;
2231}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
new file mode 100644
index 000000000000..00d68d33cc7a
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -0,0 +1,248 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_LEAF_H__
33#define __XFS_DIR_LEAF_H__
34
35/*
36 * Directory layout, internal structure, access macros, etc.
37 *
38 * Large directories are structured around Btrees where all the data
39 * elements are in the leaf nodes. Filenames are hashed into an int,
40 * then that int is used as the index into the Btree. Since the hashval
41 * of a filename may not be unique, we may have duplicate keys. The
42 * internal links in the Btree are logical block offsets into the file.
43 */
44
45struct uio;
46struct xfs_bmap_free;
47struct xfs_dabuf;
48struct xfs_da_args;
49struct xfs_da_state;
50struct xfs_da_state_blk;
51struct xfs_dir_put_args;
52struct xfs_inode;
53struct xfs_mount;
54struct xfs_trans;
55
56/*========================================================================
57 * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
58 *========================================================================*/
59
60/*
61 * This is the structure of the leaf nodes in the Btree.
62 *
63 * Struct leaf_entry's are packed from the top. Names grow from the bottom
64 * but are not packed. The freemap contains run-length-encoded entries
65 * for the free bytes after the leaf_entry's, but only the N largest such;
66 * smaller runs are dropped. When the freemap doesn't show enough space
67 * for an allocation, we compact the namelist area and try again. If we
68 * still don't have enough space, then we have to split the block.
69 *
70 * Since we have duplicate hash keys, for each key that matches, compare
71 * the actual string. The root and intermediate node search always takes
72 * the first-in-the-block key match found, so we should only have to work
73 * "forw"ard. If none matches, continue with the "forw"ard leaf nodes
74 * until the hash key changes or the filename is found.
75 *
76 * The parent directory and the self-pointer are explicitly represented
77 * (ie: there are entries for "." and "..").
78 *
79 * Note that the count being a __uint16_t limits us to something like a
80 * blocksize of 1.3MB in the face of worst case (short) filenames.
81 */
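
/*
 * Rough picture of the layout described above (illustrative only, not
 * to scale); hdr.firstused marks the lowest byte of the name area:
 *
 *	+-----+---------------------+ ...free... +--------------------+
 *	| hdr | entries[] (grow ->) |            | namelist (<- grow) |
 *	+-----+---------------------+ ...........+--------------------+
 *	0                                        ^firstused  blocksize^
 */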
82#define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */
83
84typedef struct xfs_dir_leafblock {
85 struct xfs_dir_leaf_hdr { /* constant-structure header block */
86 xfs_da_blkinfo_t info; /* block type, links, etc. */
87 __uint16_t count; /* count of active leaf_entry's */
88 __uint16_t namebytes; /* num bytes of name strings stored */
89 __uint16_t firstused; /* first used byte in name area */
90 __uint8_t holes; /* != 0 if blk needs compaction */
91 __uint8_t pad1;
92 struct xfs_dir_leaf_map {/* RLE map of free bytes */
93 __uint16_t base; /* base of free region */
94 __uint16_t size; /* run length of free region */
95 } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */
96 } hdr;
97 struct xfs_dir_leaf_entry { /* sorted on key, not name */
98 xfs_dahash_t hashval; /* hash value of name */
99 __uint16_t nameidx; /* index into buffer of name */
100 __uint8_t namelen; /* length of name string */
101 __uint8_t pad2;
102 } entries[1]; /* var sized array */
103 struct xfs_dir_leaf_name {
104 xfs_dir_ino_t inumber; /* inode number for this key */
105 __uint8_t name[1]; /* name string itself */
106 } namelist[1]; /* grows from bottom of buf */
107} xfs_dir_leafblock_t;
108typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t;
109typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t;
110typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t;
111typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t;
112
113/*
114 * Length of name for which a 512-byte block filesystem
115 * can get a double split.
116 */
117#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN \
118 (512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
119 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
120 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
121
122typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
123
124typedef union {
125 xfs_off_t o; /* offset (cookie) */
126 /*
127 * Watch the order here (endian-ness dependent).
128 */
129 struct {
130#if __BYTE_ORDER == __LITTLE_ENDIAN
131 xfs_dahash_t h; /* hash value */
132 __uint32_t be; /* block and entry */
133#else /* __BYTE_ORDER == __BIG_ENDIAN */
134 __uint32_t be; /* block and entry */
135 xfs_dahash_t h; /* hash value */
136#endif /* __BYTE_ORDER == __BIG_ENDIAN */
137 } s;
138} xfs_dircook_t;
139
140#define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \
141 ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
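
/*
 * Illustrative use (the locals here are hypothetical, not from this
 * file): build a cookie for entry 2 of block 7 with hash 0x1234, then
 * hand the packed 64-bit form back through the uio offset:
 *
 *	xfs_dircook_t cook;
 *
 *	XFS_PUT_COOKIE(cook, mp, 7, 2, 0x1234);
 *	uio->uio_offset = cook.o;
 */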
142
143typedef struct xfs_dir_put_args
144{
145 xfs_dircook_t cook; /* cookie of (next) entry */
146 xfs_intino_t ino; /* inode number */
147 struct xfs_dirent *dbp; /* buffer pointer */
148 char *name; /* directory entry name */
149 int namelen; /* length of name */
150 int done; /* output: set if value was stored */
151 xfs_dir_put_t put; /* put function ptr (i/o) */
152 struct uio *uio; /* uio control structure */
153} xfs_dir_put_args_t;
154
155#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
156int xfs_dir_leaf_entsize_byname(int len);
157#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len)
158#else
159#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) /* space a name will use */ \
160 ((uint)sizeof(xfs_dir_leaf_name_t)-1 + len)
161#endif
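
/*
 * Worked example (assuming an 8-byte xfs_ino_t, which makes
 * sizeof(xfs_dir_leaf_name_t) == 9): a 5-byte name costs
 * XFS_DIR_LEAF_ENTSIZE_BYNAME(5) == (9 - 1) + 5 == 13 bytes of name
 * area, on top of its fixed xfs_dir_leaf_entry_t slot.
 */
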
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
163int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry);
164#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) \
165 xfs_dir_leaf_entsize_byentry(entry)
166#else
167#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry) /* space an entry will use */ \
168 ((uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen)
169#endif
170#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
171xfs_dir_leaf_name_t *
172xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset);
173#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) \
174 xfs_dir_leaf_namestruct(leafp,offset)
175#else
176#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset) /* point to name struct */ \
177 ((xfs_dir_leaf_name_t *)&((char *)(leafp))[offset])
178#endif
179
180/*========================================================================
181 * Function prototypes for the kernel.
182 *========================================================================*/
183
184/*
185 * Internal routines when dirsize < XFS_LITINO(mp).
186 */
187int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
188int xfs_dir_shortform_addname(struct xfs_da_args *args);
189int xfs_dir_shortform_lookup(struct xfs_da_args *args);
190int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
191int xfs_dir_shortform_removename(struct xfs_da_args *args);
192int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
193 struct xfs_dirent *dbp, xfs_dir_put_t put);
194int xfs_dir_shortform_replace(struct xfs_da_args *args);
195
196/*
197 * Internal routines when dirsize == XFS_LBSIZE(mp).
198 */
199int xfs_dir_leaf_to_node(struct xfs_da_args *args);
200int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
201
202/*
203 * Routines used for growing the Btree.
204 */
205int xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
206 struct xfs_dabuf **bpp);
207int xfs_dir_leaf_split(struct xfs_da_state *state,
208 struct xfs_da_state_blk *oldblk,
209 struct xfs_da_state_blk *newblk);
210int xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
211 struct xfs_da_args *args, int insertion_index);
212int xfs_dir_leaf_addname(struct xfs_da_args *args);
213int xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
214 struct xfs_da_args *args,
215 int *index_found_at);
216int xfs_dir_leaf_remove(struct xfs_trans *trans,
217 struct xfs_dabuf *leaf_buffer,
218 int index_to_remove);
219int xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
220 xfs_dablk_t bno, struct uio *uio,
221 int *eobp, struct xfs_dirent *dbp,
222 xfs_dir_put_t put, xfs_daddr_t nextda);
223
224/*
225 * Routines used for shrinking the Btree.
226 */
227int xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
228void xfs_dir_leaf_unbalance(struct xfs_da_state *state,
229 struct xfs_da_state_blk *drop_blk,
230 struct xfs_da_state_blk *save_blk);
231
232/*
233 * Utility routines.
234 */
235xfs_dahash_t xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
236int xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
237 struct xfs_dabuf *leaf2_bp);
238int xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
239int xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
240int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
241
242
243/*
244 * Global data.
245 */
246extern xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot;
247
248#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
new file mode 100644
index 000000000000..a61bcfc2a87d
--- /dev/null
+++ b/fs/xfs/xfs_dir_sf.h
@@ -0,0 +1,172 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DIR_SF_H__
33#define __XFS_DIR_SF_H__
34
35/*
36 * Directory layout when stored internal to an inode.
37 *
38 * Small directories are packed as tightly as possible so as to
39 * fit into the literal area of the inode.
40 */
41
42typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
43
44/*
45 * The parent directory has a dedicated field, and the self-pointer must
46 * be calculated on the fly.
47 *
48 * Entries are packed toward the top as tightly as possible. The header
49 * and the elements must be memcpy'd out into a work area to get correct
50 * alignment for the inode number fields.
51 */
52typedef struct xfs_dir_shortform {
53 struct xfs_dir_sf_hdr { /* constant-structure header block */
54 xfs_dir_ino_t parent; /* parent dir inode number */
55 __uint8_t count; /* count of active entries */
56 } hdr;
57 struct xfs_dir_sf_entry {
58 xfs_dir_ino_t inumber; /* referenced inode number */
59 __uint8_t namelen; /* actual length of name (no NULL) */
60 __uint8_t name[1]; /* name */
61 } list[1]; /* variable sized array */
62} xfs_dir_shortform_t;
63typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t;
64typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t;
65
66/*
67 * We generate this, then sort it, so that readdirs are returned in
68 * hash-order. Otherwise seekdir won't work.
69 */
70typedef struct xfs_dir_sf_sort {
71 __uint8_t entno; /* .=0, ..=1, else entry# + 2 */
72 __uint8_t seqno; /* sequence # with same hash value */
73 __uint8_t namelen; /* length of name value (no null) */
74 xfs_dahash_t hash; /* this entry's hash value */
75 xfs_intino_t ino; /* this entry's inode number */
76 char *name; /* name value, pointer into buffer */
77} xfs_dir_sf_sort_t;
78
79#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_GET_DIRINO)
80void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to);
81#define XFS_DIR_SF_GET_DIRINO(from,to) xfs_dir_sf_get_dirino(from, to)
82#else
83#define XFS_DIR_SF_GET_DIRINO(from,to) (*(to) = XFS_GET_DIR_INO8(*from))
84#endif
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_PUT_DIRINO)
86void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to);
87#define XFS_DIR_SF_PUT_DIRINO(from,to) xfs_dir_sf_put_dirino(from, to)
88#else
89#define XFS_DIR_SF_PUT_DIRINO(from,to) XFS_PUT_DIR_INO8(*(from), *(to))
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
92int xfs_dir_sf_entsize_byname(int len);
93#define XFS_DIR_SF_ENTSIZE_BYNAME(len) xfs_dir_sf_entsize_byname(len)
94#else
95#define XFS_DIR_SF_ENTSIZE_BYNAME(len) /* space a name uses */ \
96 ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (len))
97#endif
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
99int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep);
100#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) xfs_dir_sf_entsize_byentry(sfep)
101#else
102#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep) /* space an entry uses */ \
103 ((uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen)
104#endif
105#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_NEXTENTRY)
106xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep);
107#define XFS_DIR_SF_NEXTENTRY(sfep) xfs_dir_sf_nextentry(sfep)
108#else
109#define XFS_DIR_SF_NEXTENTRY(sfep) /* next entry in struct */ \
110 ((xfs_dir_sf_entry_t *) \
111 ((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)))
112#endif
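
/*
 * Typical walk over the packed entry list (a sketch; 'sfp' is a
 * hypothetical pointer to a valid shortform directory):
 *
 *	xfs_dir_sf_entry_t *sfep;
 *	int i;
 *
 *	sfep = &sfp->list[0];
 *	for (i = 0; i < sfp->hdr.count; i++)
 *		sfep = XFS_DIR_SF_NEXTENTRY(sfep);
 */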
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ALLFIT)
114int xfs_dir_sf_allfit(int count, int totallen);
115#define XFS_DIR_SF_ALLFIT(count,totallen) \
116 xfs_dir_sf_allfit(count,totallen)
117#else
118#define XFS_DIR_SF_ALLFIT(count,totallen) /* will all entries fit? */ \
119 ((uint)sizeof(xfs_dir_sf_hdr_t) + \
120 ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen))
121#endif
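
/*
 * Worked example (assuming an 8-byte xfs_ino_t): sizeof(hdr) == 9 and
 * sizeof(entry) - 1 == 9, so three entries whose names total 14 bytes
 * need XFS_DIR_SF_ALLFIT(3, 14) == 9 + 9*3 + 14 == 50 bytes of the
 * inode's literal area.
 */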
122
123#if defined(XFS_DIR_TRACE)
124
125/*
126 * Kernel tracing support for directories.
127 */
128struct uio;
129struct xfs_inode;
130struct xfs_da_intnode;
131struct xfs_dinode;
132struct xfs_dir_leafblock;
133struct xfs_dir_leaf_entry;
134
135#define XFS_DIR_TRACE_SIZE 4096 /* size of global trace buffer */
136extern ktrace_t *xfs_dir_trace_buf;
137
138/*
139 * Trace record types.
140 */
141#define XFS_DIR_KTRACE_G_DU 1 /* dp, uio */
142#define XFS_DIR_KTRACE_G_DUB 2 /* dp, uio, bno */
143#define XFS_DIR_KTRACE_G_DUN 3 /* dp, uio, node */
144#define XFS_DIR_KTRACE_G_DUL 4 /* dp, uio, leaf */
145#define XFS_DIR_KTRACE_G_DUE 5 /* dp, uio, leaf entry */
146#define XFS_DIR_KTRACE_G_DUC 6 /* dp, uio, cookie */
147
148void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
149void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
150 xfs_dablk_t bno);
151void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
152 struct xfs_da_intnode *node);
153void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
154 struct xfs_dir_leafblock *leaf);
155void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
156 struct xfs_dir_leaf_entry *entry);
157void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
158 xfs_off_t cookie);
159void xfs_dir_trace_enter(int type, char *where,
160 void *a0, void *a1, void *a2, void *a3,
161 void *a4, void *a5, void *a6, void *a7,
162 void *a8, void *a9, void *a10, void *a11);
163#else
164#define xfs_dir_trace_g_du(w,d,u)
165#define xfs_dir_trace_g_dub(w,d,u,b)
166#define xfs_dir_trace_g_dun(w,d,u,n)
167#define xfs_dir_trace_g_dul(w,d,u,l)
168#define xfs_dir_trace_g_due(w,d,u,e)
169#define xfs_dir_trace_g_duc(w,d,u,c)
170#endif /* XFS_DIR_TRACE */
171
172#endif /* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
new file mode 100644
index 000000000000..55ae3e67d245
--- /dev/null
+++ b/fs/xfs/xfs_dmapi.h
@@ -0,0 +1,212 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_DMAPI_H__
33#define __XFS_DMAPI_H__
34
35/* Values used to define the on-disk version of dm_attrname_t. All
36 * on-disk attribute names start with the 8-byte string "SGI_DMI_".
37 *
38 * In the on-disk inode, DMAPI attribute names consist of the user-provided
39 * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be
40 * changed.
41 */
42
43#define DMATTR_PREFIXLEN 8
44#define DMATTR_PREFIXSTRING "SGI_DMI_"
45
46typedef enum {
47 DM_EVENT_INVALID = -1,
48 DM_EVENT_CANCEL = 0, /* not supported */
49 DM_EVENT_MOUNT = 1,
50 DM_EVENT_PREUNMOUNT = 2,
51 DM_EVENT_UNMOUNT = 3,
52 DM_EVENT_DEBUT = 4, /* not supported */
53 DM_EVENT_CREATE = 5,
54 DM_EVENT_CLOSE = 6, /* not supported */
55 DM_EVENT_POSTCREATE = 7,
56 DM_EVENT_REMOVE = 8,
57 DM_EVENT_POSTREMOVE = 9,
58 DM_EVENT_RENAME = 10,
59 DM_EVENT_POSTRENAME = 11,
60 DM_EVENT_LINK = 12,
61 DM_EVENT_POSTLINK = 13,
62 DM_EVENT_SYMLINK = 14,
63 DM_EVENT_POSTSYMLINK = 15,
64 DM_EVENT_READ = 16,
65 DM_EVENT_WRITE = 17,
66 DM_EVENT_TRUNCATE = 18,
67 DM_EVENT_ATTRIBUTE = 19,
68 DM_EVENT_DESTROY = 20,
69 DM_EVENT_NOSPACE = 21,
70 DM_EVENT_USER = 22,
71 DM_EVENT_MAX = 23
72} dm_eventtype_t;
73#define HAVE_DM_EVENTTYPE_T
74
75typedef enum {
76 DM_RIGHT_NULL,
77 DM_RIGHT_SHARED,
78 DM_RIGHT_EXCL
79} dm_right_t;
80#define HAVE_DM_RIGHT_T
81
82/* Defines for determining if an event message should be sent. */
83#define DM_EVENT_ENABLED(vfsp, ip, event) ( \
84 unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
85 ( ((ip)->i_d.di_dmevmask & (1 << event)) || \
86 ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
87 )
88
89#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \
90 unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
91 ( ((io)->io_dmevmask & (1 << event)) || \
92 ((io)->io_mount->m_dmevmask & (1 << event)) ) \
93 )
94
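/*
 * Illustrative gate (the send helper named here is a placeholder, not
 * an API declared in this header): callers run the cheap mask checks
 * above before paying for any DMAPI event delivery.
 *
 *	error = 0;
 *	if (DM_EVENT_ENABLED(vfsp, ip, DM_EVENT_WRITE))
 *		error = send_dm_data_event(DM_EVENT_WRITE, vp, off, len);
 *	if (error)
 *		return error;
 */
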
95#define DM_XFS_VALID_FS_EVENTS ( \
96 (1 << DM_EVENT_PREUNMOUNT) | \
97 (1 << DM_EVENT_UNMOUNT) | \
98 (1 << DM_EVENT_NOSPACE) | \
99 (1 << DM_EVENT_DEBUT) | \
100 (1 << DM_EVENT_CREATE) | \
101 (1 << DM_EVENT_POSTCREATE) | \
102 (1 << DM_EVENT_REMOVE) | \
103 (1 << DM_EVENT_POSTREMOVE) | \
104 (1 << DM_EVENT_RENAME) | \
105 (1 << DM_EVENT_POSTRENAME) | \
106 (1 << DM_EVENT_LINK) | \
107 (1 << DM_EVENT_POSTLINK) | \
108 (1 << DM_EVENT_SYMLINK) | \
109 (1 << DM_EVENT_POSTSYMLINK) | \
110 (1 << DM_EVENT_ATTRIBUTE) | \
111 (1 << DM_EVENT_DESTROY) )
112
113/* Events valid in dm_set_eventlist() when called with a file handle for
114 a regular file or a symlink. These events are persistent.
115*/
116
117#define DM_XFS_VALID_FILE_EVENTS ( \
118 (1 << DM_EVENT_ATTRIBUTE) | \
119 (1 << DM_EVENT_DESTROY) )
120
121/* Events valid in dm_set_eventlist() when called with a file handle for
122 a directory. These events are persistent.
123*/
124
125#define DM_XFS_VALID_DIRECTORY_EVENTS ( \
126 (1 << DM_EVENT_CREATE) | \
127 (1 << DM_EVENT_POSTCREATE) | \
128 (1 << DM_EVENT_REMOVE) | \
129 (1 << DM_EVENT_POSTREMOVE) | \
130 (1 << DM_EVENT_RENAME) | \
131 (1 << DM_EVENT_POSTRENAME) | \
132 (1 << DM_EVENT_LINK) | \
133 (1 << DM_EVENT_POSTLINK) | \
134 (1 << DM_EVENT_SYMLINK) | \
135 (1 << DM_EVENT_POSTSYMLINK) | \
136 (1 << DM_EVENT_ATTRIBUTE) | \
137 (1 << DM_EVENT_DESTROY) )
138
139/* Events supported by the XFS filesystem. */
140#define DM_XFS_SUPPORTED_EVENTS ( \
141 (1 << DM_EVENT_MOUNT) | \
142 (1 << DM_EVENT_PREUNMOUNT) | \
143 (1 << DM_EVENT_UNMOUNT) | \
144 (1 << DM_EVENT_NOSPACE) | \
145 (1 << DM_EVENT_CREATE) | \
146 (1 << DM_EVENT_POSTCREATE) | \
147 (1 << DM_EVENT_REMOVE) | \
148 (1 << DM_EVENT_POSTREMOVE) | \
149 (1 << DM_EVENT_RENAME) | \
150 (1 << DM_EVENT_POSTRENAME) | \
151 (1 << DM_EVENT_LINK) | \
152 (1 << DM_EVENT_POSTLINK) | \
153 (1 << DM_EVENT_SYMLINK) | \
154 (1 << DM_EVENT_POSTSYMLINK) | \
155 (1 << DM_EVENT_READ) | \
156 (1 << DM_EVENT_WRITE) | \
157 (1 << DM_EVENT_TRUNCATE) | \
158 (1 << DM_EVENT_ATTRIBUTE) | \
159 (1 << DM_EVENT_DESTROY) )
160
161
162/*
163 * Definitions used for the flags field on dm_send_*_event().
164 */
165
166#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */
167#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */
168#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */
169#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
170#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,21)
171/* i_alloc_sem was added in 2.4.22-pre1 */
172#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */
173#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */
174#endif
175#endif
176
177/*
178 * Based on IO_ISDIRECT, decide which i_ flag is set.
179 */
180#ifdef DM_FLAGS_IALLOCSEM_RD
181#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
182 DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM)
183#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM)
184#else
185#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \
186 0 : DM_FLAGS_ISEM)
187#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM)
188#endif
189
190/*
191 * Macros to turn caller specified delay/block flags into
192 * dm_send_xxxx_event flag DM_FLAGS_NDELAY.
193 */
194
195#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
196 DM_FLAGS_NDELAY : 0)
197#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
198
199
200extern struct bhv_vfsops xfs_dmops;
201
202#ifdef CONFIG_XFS_DMAPI
203void xfs_dm_init(struct file_system_type *);
204void xfs_dm_exit(struct file_system_type *);
205#define XFS_DM_INIT(fstype) xfs_dm_init(fstype)
206#define XFS_DM_EXIT(fstype) xfs_dm_exit(fstype)
207#else
208#define XFS_DM_INIT(fstype)
209#define XFS_DM_EXIT(fstype)
210#endif
211
212#endif /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
new file mode 100644
index 000000000000..cec54ba800eb
--- /dev/null
+++ b/fs/xfs/xfs_dmops.c
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#include "xfs.h"
33
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45
46xfs_dmops_t xfs_dmcore_stub = {
47 .xfs_send_data = (xfs_send_data_t)fs_nosys,
48 .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr,
49 .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys,
50 .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys,
51 .xfs_send_unmount = (xfs_send_unmount_t)fs_noval,
52};
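
The vector above is a null-object stub: with CONFIG_XFS_DMAPI disabled, every DMAPI hook still resolves to a valid function (fs_nosys returning ENOSYS, fs_noerr returning 0, fs_noval returning nothing), so callers can dispatch through the ops table unconditionally rather than testing for NULL. A self-contained sketch of the same pattern, with invented demo_* names for illustration:

	struct demo_ops {
		int (*send_data)(void *arg);
	};

	static int demo_nosys(void *arg)
	{
		return 38;			/* stand-in for ENOSYS */
	}

	static struct demo_ops demo_stub = {
		.send_data = demo_nosys,	/* feature compiled out */
	};

	static int demo_dispatch(struct demo_ops *ops, void *arg)
	{
		return ops->send_data(arg);	/* no NULL check needed */
	}
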
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
new file mode 100644
index 000000000000..bbe1dea11c08
--- /dev/null
+++ b/fs/xfs/xfs_error.c
@@ -0,0 +1,327 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_sb.h"
40#include "xfs_trans.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_bmap_btree.h"
46#include "xfs_attr_sf.h"
47#include "xfs_dir_sf.h"
48#include "xfs_dir2_sf.h"
49#include "xfs_dinode.h"
50#include "xfs_inode.h"
51#include "xfs_utils.h"
52#include "xfs_error.h"
53
54#ifdef DEBUG
55
56int xfs_etrap[XFS_ERROR_NTRAP] = {
57 0,
58};
59
60int
61xfs_error_trap(int e)
62{
63 int i;
64
65 if (!e)
66 return 0;
67 for (i = 0; i < XFS_ERROR_NTRAP; i++) {
68 if (xfs_etrap[i] == 0)
69 break;
70 if (e != xfs_etrap[i])
71 continue;
72 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
73 debug_stop_all_cpus((void *)-1LL);
74 BUG();
75 break;
76 }
77 return e;
78}
79#endif
80
81#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
82
83int xfs_etest[XFS_NUM_INJECT_ERROR];
84int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
85char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
86
87void
88xfs_error_test_init(void)
89{
90 memset(xfs_etest, 0, sizeof(xfs_etest));
91 memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid));
92 memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname));
93}
94
95int
96xfs_error_test(int error_tag, int *fsidp, char *expression,
97 int line, char *file, unsigned long randfactor)
98{
99 int i;
100 int64_t fsid;
101
102 if (random() % randfactor)
103 return 0;
104
105 memcpy(&fsid, fsidp, sizeof(xfs_fsid_t));
106
107 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
108 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
109 cmn_err(CE_WARN,
110 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
111 expression, file, line, xfs_etest_fsname[i]);
112 return 1;
113 }
114 }
115
116 return 0;
117}
118
119int
120xfs_errortag_add(int error_tag, xfs_mount_t *mp)
121{
122 int i;
123 int len;
124 int64_t fsid;
125
126 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
127
128 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
129 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
130			cmn_err(CE_WARN, "XFS error tag #%d already on", error_tag);
131 return 0;
132 }
133 }
134
135 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
136 if (xfs_etest[i] == 0) {
137 cmn_err(CE_WARN, "Turned on XFS error tag #%d",
138 error_tag);
139 xfs_etest[i] = error_tag;
140 xfs_etest_fsid[i] = fsid;
141 len = strlen(mp->m_fsname);
142 xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
143 strcpy(xfs_etest_fsname[i], mp->m_fsname);
144 return 0;
145 }
146 }
147
148 cmn_err(CE_WARN, "error tag overflow, too many turned on");
149
150 return 1;
151}
152
153int
154xfs_errortag_clear(int error_tag, xfs_mount_t *mp)
155{
156 int i;
157 int64_t fsid;
158
159 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
160
161 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
162 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
163 xfs_etest[i] = 0;
164 xfs_etest_fsid[i] = 0LL;
165 kmem_free(xfs_etest_fsname[i],
166 strlen(xfs_etest_fsname[i]) + 1);
167 xfs_etest_fsname[i] = NULL;
168 cmn_err(CE_WARN, "Cleared XFS error tag #%d",
169 error_tag);
170 return 0;
171 }
172 }
173
174 cmn_err(CE_WARN, "XFS error tag %d not on", error_tag);
175
176 return 1;
177}
178
179int
180xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud)
181{
182 int i;
183 int cleared = 0;
184
185 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
186 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
187 xfs_etest[i] != 0) {
188 cleared = 1;
189 cmn_err(CE_WARN, "Clearing XFS error tag #%d",
190 xfs_etest[i]);
191 xfs_etest[i] = 0;
192 xfs_etest_fsid[i] = 0LL;
193 kmem_free(xfs_etest_fsname[i],
194 strlen(xfs_etest_fsname[i]) + 1);
195 xfs_etest_fsname[i] = NULL;
196 }
197 }
198
199 if (loud || cleared)
200 cmn_err(CE_WARN,
201 "Cleared all XFS error tags for filesystem \"%s\"",
202 fsname);
203
204 return 0;
205}
206
207int
208xfs_errortag_clearall(xfs_mount_t *mp)
209{
210 int64_t fsid;
211
212 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
213
214 return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1);
215}
216#endif /* DEBUG || INDUCE_IO_ERROR */
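
Putting the pieces together: xfs_errortag_add() registers a (tag, fsid) pair, after which xfs_error_test() fires on roughly one in randfactor matching calls. Call sites reach it through the XFS_TEST_ERROR() macro declared in xfs_error.h, so an ordinary corruption check doubles as an injection point. A sketch in the spirit of the real call sites; the magic-number check shown is illustrative, not a quote of this patch:

	/* Illustrative check: forces the failure path on a DEBUG kernel. */
	if (XFS_TEST_ERROR(agf->agf_magicnum != XFS_AGF_MAGIC, mp,
			   XFS_ERRTAG_ALLOC_READ_AGF,
			   XFS_RANDOM_ALLOC_READ_AGF)) {
		XFS_ERROR_REPORT("example check", XFS_ERRLEVEL_LOW, mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
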
217
218static void
219xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
220{
221 if (mp != NULL) {
222 char *newfmt;
223 int len = 16 + mp->m_fsname_len + strlen(fmt);
224
225 newfmt = kmem_alloc(len, KM_SLEEP);
226 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
227 icmn_err(level, newfmt, ap);
228 kmem_free(newfmt, len);
229 } else {
230 icmn_err(level, fmt, ap);
231 }
232}
233
234void
235xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
236{
237 va_list ap;
238
239 va_start(ap, fmt);
240 xfs_fs_vcmn_err(level, mp, fmt, ap);
241 va_end(ap);
242}
243
244void
245xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
246{
247 va_list ap;
248
249#ifdef DEBUG
250 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
251#endif
252
253 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
254 && (level & CE_ALERT)) {
255 level &= ~CE_ALERT;
256 level |= CE_PANIC;
257 cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG.");
258 }
259 va_start(ap, fmt);
260 xfs_fs_vcmn_err(level, mp, fmt, ap);
261 va_end(ap);
262}
263
264void
265xfs_error_report(
266 char *tag,
267 int level,
268 xfs_mount_t *mp,
269 char *fname,
270 int linenum,
271 inst_t *ra)
272{
273 if (level <= xfs_error_level) {
274 xfs_cmn_err(XFS_PTAG_ERROR_REPORT,
275 CE_ALERT, mp,
276 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
277 tag, linenum, fname, ra);
278
279 xfs_stack_trace();
280 }
281}
282
283void
284xfs_hex_dump(void *p, int length)
285{
286 __uint8_t *uip = (__uint8_t*)p;
287 int i;
288 char sbuf[128], *s;
289
290 s = sbuf;
291 *s = '\0';
292 for (i=0; i<length; i++, uip++) {
293 if ((i % 16) == 0) {
294 if (*s != '\0')
295 cmn_err(CE_ALERT, "%s\n", sbuf);
296 s = sbuf;
297 sprintf(s, "0x%x: ", i);
298 while( *s != '\0')
299 s++;
300 }
301 sprintf(s, "%02x ", *uip);
302
303 /*
304			 * the kernel sprintf returns void; the user-space sprintf
305			 * returns the length of the formatted string. Either way,
306			 * find the new end of string by scanning.
307 */
308 while( *s != '\0')
309 s++;
310 }
311 cmn_err(CE_ALERT, "%s\n", sbuf);
312}
313
314void
315xfs_corruption_error(
316 char *tag,
317 int level,
318 xfs_mount_t *mp,
319 void *p,
320 char *fname,
321 int linenum,
322 inst_t *ra)
323{
324 if (level <= xfs_error_level)
325 xfs_hex_dump(p, 16);
326 xfs_error_report(tag, level, mp, fname, linenum, ra);
327}
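
Taken together, xfs_corruption_error() hex-dumps the first 16 bytes of the suspect object (xfs_hex_dump() emits one alert line per 16 bytes, each prefixed with its offset) and then reports through xfs_error_report(), whose XFS_PTAG_ERROR_REPORT tag xfs_cmn_err() can escalate to a panic. A minimal caller sketch using the wrapper macro from xfs_error.h; bad_magic and dip are invented names standing in for a real validity check and the object being checked:

	/* Illustrative: report an on-disk structure that failed validation. */
	if (bad_magic) {
		XFS_CORRUPTION_ERROR("xfs_example_verify",
				     XFS_ERRLEVEL_LOW, mp, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
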
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
new file mode 100644
index 000000000000..6bc0535c0a65
--- /dev/null
+++ b/fs/xfs/xfs_error.h
@@ -0,0 +1,196 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ERROR_H__
33#define __XFS_ERROR_H__
34
35#define prdev(fmt,targ,args...) \
36 printk("XFS: device %s- " fmt "\n", XFS_BUFTARG_NAME(targ), ## args)
37
38#define XFS_ERECOVER 1 /* Failure to recover log */
39#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */
40#define XFS_ENOLOGSPACE 3 /* Reservation too large */
41#define XFS_ENOTSUP 4 /* Operation not supported */
42#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */
43#define XFS_ENOTFOUND 6
44#define XFS_ENOTXFS 7 /* Not XFS filesystem */
45
46#ifdef DEBUG
47#define XFS_ERROR_NTRAP 10
48extern int xfs_etrap[XFS_ERROR_NTRAP];
49extern int xfs_error_trap(int);
50#define XFS_ERROR(e) xfs_error_trap(e)
51#else
52#define XFS_ERROR(e) (e)
53#endif
54
55struct xfs_mount;
56
57extern void
58xfs_error_report(
59 char *tag,
60 int level,
61 struct xfs_mount *mp,
62 char *fname,
63 int linenum,
64 inst_t *ra);
65
66extern void
67xfs_corruption_error(
68 char *tag,
69 int level,
70 struct xfs_mount *mp,
71 void *p,
72 char *fname,
73 int linenum,
74 inst_t *ra);
75
76extern void
77xfs_hex_dump(void *p, int length);
78
79#define XFS_ERROR_REPORT(e, lvl, mp) \
80 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
81#define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \
82 xfs_corruption_error(e, lvl, mp, mem, \
83 __FILE__, __LINE__, __return_address)
84
85#define XFS_ERRLEVEL_OFF 0
86#define XFS_ERRLEVEL_LOW 1
87#define XFS_ERRLEVEL_HIGH 5
88
89/*
90 * Error injection tags - the labels can be anything you want,
91 * but each tag should have its own unique number.
92 */
93
94#define XFS_ERRTAG_NOERROR 0
95#define XFS_ERRTAG_IFLUSH_1 1
96#define XFS_ERRTAG_IFLUSH_2 2
97#define XFS_ERRTAG_IFLUSH_3 3
98#define XFS_ERRTAG_IFLUSH_4 4
99#define XFS_ERRTAG_IFLUSH_5 5
100#define XFS_ERRTAG_IFLUSH_6 6
101#define XFS_ERRTAG_DA_READ_BUF 7
102#define XFS_ERRTAG_BTREE_CHECK_LBLOCK 8
103#define XFS_ERRTAG_BTREE_CHECK_SBLOCK 9
104#define XFS_ERRTAG_ALLOC_READ_AGF 10
105#define XFS_ERRTAG_IALLOC_READ_AGI 11
106#define XFS_ERRTAG_ITOBP_INOTOBP 12
107#define XFS_ERRTAG_IUNLINK 13
108#define XFS_ERRTAG_IUNLINK_REMOVE 14
109#define XFS_ERRTAG_DIR_INO_VALIDATE 15
110#define XFS_ERRTAG_BULKSTAT_READ_CHUNK 16
111#define XFS_ERRTAG_IODONE_IOERR 17
112#define XFS_ERRTAG_STRATREAD_IOERR 18
113#define XFS_ERRTAG_STRATCMPL_IOERR 19
114#define XFS_ERRTAG_DIOWRITE_IOERR 20
115#define XFS_ERRTAG_BMAPIFORMAT 21
116#define XFS_ERRTAG_MAX 22
117
118/*
119 * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
120 */
121#define XFS_RANDOM_DEFAULT 100
122#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
123#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
124#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
125#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
126#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
127#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
128#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
129#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
130#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
131#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
132#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
133#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
134#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
135#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
136#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
137#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
138#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
139#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
140#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
141#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
142#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
143
144#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
145extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
146void xfs_error_test_init(void);
147
148#define XFS_NUM_INJECT_ERROR 10
149
150#ifdef __ANSI_CPP__
151#define XFS_TEST_ERROR(expr, mp, tag, rf) \
152 ((expr) || \
153 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
154 (rf)))
155#else
156#define XFS_TEST_ERROR(expr, mp, tag, rf) \
157 ((expr) || \
158 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
159 (rf)))
160#endif /* __ANSI_CPP__ */
161
162int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
163int xfs_errortag_clear(int error_tag, xfs_mount_t *mp);
164
165int xfs_errortag_clearall(xfs_mount_t *mp);
166int xfs_errortag_clearall_umount(int64_t fsid, char *fsname,
167 int loud);
168#else
169#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
170#define xfs_errortag_add(tag, mp) (ENOSYS)
171#define xfs_errortag_clearall(mp) (ENOSYS)
172#endif /* (DEBUG || INDUCE_IO_ERROR) */
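
The __ANSI_CPP__ split above only affects the message text: an ANSI preprocessor stringizes #expr into the literal source of the failed condition, while the fallback logs the constant string "expr". A standalone demonstration of the stringizing behaviour:

	#include <stdio.h>

	#define SHOW(expr)	printf("%s -> %d\n", #expr, (expr))

	int main(void)
	{
		int x = 3;

		SHOW(x > 2);	/* prints: x > 2 -> 1 */
		return 0;
	}
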
173
174/*
175 * XFS panic tags -- allow a call to xfs_cmn_err() to be
176 * turned into a panic by setting xfs_panic_mask via a
177 * sysctl. Update xfs_max[XFS_PARAM] if
178 * more are added.
179 */
180#define XFS_NO_PTAG 0
181#define XFS_PTAG_IFLUSH 0x00000001
182#define XFS_PTAG_LOGRES 0x00000002
183#define XFS_PTAG_AILDELETE 0x00000004
184#define XFS_PTAG_ERROR_REPORT 0x00000008
185#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
186#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
187#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
188
189struct xfs_mount;
190/* PRINTFLIKE4 */
191void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
192 char *fmt, ...);
193/* PRINTFLIKE3 */
194void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
195
196#endif /* __XFS_ERROR_H__ */
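
Since the panic tags are plain bit flags, they can be OR-ed together into xfs_panic_mask (exposed through the XFS panic_mask sysctl) to turn selected alerts into panics. For instance, to panic on corruption- or log-error-triggered shutdowns, one would set (values taken from the definitions above):

	/* 0x10 | 0x40 == 0x50, i.e. decimal 80 */
	xfs_panic_mask = XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_SHUTDOWN_LOGERROR;
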
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 000000000000..5eafd5b63211
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_efi_log_item
35 * and xfs_efd_log_item items.
36 */
37
38#include "xfs.h"
39
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_buf_item.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dmapi.h"
49#include "xfs_mount.h"
50#include "xfs_trans_priv.h"
51#include "xfs_extfree_item.h"
52
53
54kmem_zone_t *xfs_efi_zone;
55kmem_zone_t *xfs_efd_zone;
56
57STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *);
58STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *);
59STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *);
60
61
62
63/*
64 * This returns the number of iovecs needed to log the given efi item.
65 * We only need 1 iovec for an efi item. It just logs the efi_log_format
66 * structure.
67 */
68/*ARGSUSED*/
69STATIC uint
70xfs_efi_item_size(xfs_efi_log_item_t *efip)
71{
72 return 1;
73}
74
75/*
76 * This is called to fill in the vector of log iovecs for the
77 * given efi log item. We use only 1 iovec, and we point that
78 * at the efi_log_format structure embedded in the efi item.
79 * It is at this point that we assert that all of the extent
80 * slots in the efi item have been filled.
81 */
82STATIC void
83xfs_efi_item_format(xfs_efi_log_item_t *efip,
84 xfs_log_iovec_t *log_vector)
85{
86 uint size;
87
88 ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
89
90 efip->efi_format.efi_type = XFS_LI_EFI;
91
92 size = sizeof(xfs_efi_log_format_t);
93 size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
94 efip->efi_format.efi_size = 1;
95
96 log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
97 log_vector->i_len = size;
98 ASSERT(size >= sizeof(xfs_efi_log_format_t));
99}
100
101
102/*
103 * Pinning has no meaning for an efi item, so just return.
104 */
105/*ARGSUSED*/
106STATIC void
107xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108{
109 return;
110}
111
112
113/*
114 * While EFIs cannot really be pinned, the unpin operation is the
115 * last place at which the EFI is manipulated during a transaction.
116 * Here we coordinate with xfs_efi_cancel() to determine who gets to
117 * free the EFI.
118 */
119/*ARGSUSED*/
120STATIC void
121xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
122{
123 int nexts;
124 int size;
125 xfs_mount_t *mp;
126 SPLDECL(s);
127
128 mp = efip->efi_item.li_mountp;
129 AIL_LOCK(mp, s);
130 if (efip->efi_flags & XFS_EFI_CANCELED) {
131 /*
132 * xfs_trans_delete_ail() drops the AIL lock.
133 */
134 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
135
136 nexts = efip->efi_format.efi_nextents;
137 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
138 size = sizeof(xfs_efi_log_item_t);
139 size += (nexts - 1) * sizeof(xfs_extent_t);
140 kmem_free(efip, size);
141 } else {
142 kmem_zone_free(xfs_efi_zone, efip);
143 }
144 } else {
145 efip->efi_flags |= XFS_EFI_COMMITTED;
146 AIL_UNLOCK(mp, s);
147 }
148
149 return;
150}
151
152/*
153 * Like unpin, except we must also clear the transaction descriptor
154 * pointing to the log item if we free the item. This routine duplicates
155 * unpin because efi_flags is protected by the AIL lock. Freeing
156 * the descriptor and then calling unpin would force us to drop the AIL
157 * lock, which would open up a race condition.
158 */
159STATIC void
160xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
161{
162 int nexts;
163 int size;
164 xfs_mount_t *mp;
165 xfs_log_item_desc_t *lidp;
166 SPLDECL(s);
167
168 mp = efip->efi_item.li_mountp;
169 AIL_LOCK(mp, s);
170 if (efip->efi_flags & XFS_EFI_CANCELED) {
171 /*
172 * free the xaction descriptor pointing to this item
173 */
174 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
175 xfs_trans_free_item(tp, lidp);
176 /*
177 * pull the item off the AIL.
178 * xfs_trans_delete_ail() drops the AIL lock.
179 */
180 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
181 /*
182 * now free the item itself
183 */
184 nexts = efip->efi_format.efi_nextents;
185 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
186 size = sizeof(xfs_efi_log_item_t);
187 size += (nexts - 1) * sizeof(xfs_extent_t);
188 kmem_free(efip, size);
189 } else {
190 kmem_zone_free(xfs_efi_zone, efip);
191 }
192 } else {
193 efip->efi_flags |= XFS_EFI_COMMITTED;
194 AIL_UNLOCK(mp, s);
195 }
196
197 return;
198}
199
200/*
201 * Efi items have no locking or pushing. However, since EFIs are
202 * pulled from the AIL when their corresponding EFDs are committed
203 * to disk, their situation is very similar to being pinned. Return
204 * XFS_ITEM_PINNED so that the caller will eventually flush the log.
205 * This should help in getting the EFI out of the AIL.
206 */
207/*ARGSUSED*/
208STATIC uint
209xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
210{
211 return XFS_ITEM_PINNED;
212}
213
214/*
215 * Efi items have no locking; on unlock, free the item if the transaction was aborted.
216 */
217/*ARGSUSED*/
218STATIC void
219xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
220{
221 if (efip->efi_item.li_flags & XFS_LI_ABORTED)
222 xfs_efi_item_abort(efip);
223 return;
224}
225
226/*
227 * The EFI is logged only once and cannot be moved in the log, so
228 * simply return the lsn at which it's been logged. The canceled
229 * flag is not paid any attention here. Checking for that is delayed
230 * until the EFI is unpinned.
231 */
232/*ARGSUSED*/
233STATIC xfs_lsn_t
234xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
235{
236 return lsn;
237}
238
239/*
240 * This is called when the transaction logging the EFI is aborted.
241 * Free up the EFI and return. No need to clean up the slot for
242 * the item in the transaction. That was done by the unpin code
243 * which is called prior to this routine in the abort/fs-shutdown path.
244 */
245STATIC void
246xfs_efi_item_abort(xfs_efi_log_item_t *efip)
247{
248 int nexts;
249 int size;
250
251 nexts = efip->efi_format.efi_nextents;
252 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
253 size = sizeof(xfs_efi_log_item_t);
254 size += (nexts - 1) * sizeof(xfs_extent_t);
255 kmem_free(efip, size);
256 } else {
257 kmem_zone_free(xfs_efi_zone, efip);
258 }
259 return;
260}
261
262/*
263 * There isn't much you can do to push on an efi item. It is simply
264 * stuck waiting for all of its corresponding efd items to be
265 * committed to disk.
266 */
267/*ARGSUSED*/
268STATIC void
269xfs_efi_item_push(xfs_efi_log_item_t *efip)
270{
271 return;
272}
273
274/*
275 * The EFI dependency tracking op doesn't do squat. It can't because
276 * it doesn't know where the free extent is coming from. The dependency
277 * tracking has to be handled by the "enclosing" metadata object. For
278 * example, for inodes, the inode is locked throughout the extent freeing
279 * so the dependency should be recorded there.
280 */
281/*ARGSUSED*/
282STATIC void
283xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
284{
285 return;
286}
287
288/*
289 * This is the ops vector shared by all efi log items.
290 */
291struct xfs_item_ops xfs_efi_item_ops = {
292 .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
293 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
294 xfs_efi_item_format,
295 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
296 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin,
297 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
298 xfs_efi_item_unpin_remove,
299 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
300 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
301 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
302 xfs_efi_item_committed,
303 .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push,
304 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
305 .iop_pushbuf = NULL,
306 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
307 xfs_efi_item_committing
308};
309
310
311/*
312 * Allocate and initialize an efi item with the given number of extents.
313 */
314xfs_efi_log_item_t *
315xfs_efi_init(xfs_mount_t *mp,
316 uint nextents)
317
318{
319 xfs_efi_log_item_t *efip;
320 uint size;
321
322 ASSERT(nextents > 0);
323 if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
324 size = (uint)(sizeof(xfs_efi_log_item_t) +
325 ((nextents - 1) * sizeof(xfs_extent_t)));
326 efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
327 } else {
328 efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
329 KM_SLEEP);
330 }
331
332 efip->efi_item.li_type = XFS_LI_EFI;
333 efip->efi_item.li_ops = &xfs_efi_item_ops;
334 efip->efi_item.li_mountp = mp;
335 efip->efi_format.efi_nextents = nextents;
336 efip->efi_format.efi_id = (__psint_t)(void*)efip;
337
338 return (efip);
339}
340
341/*
342 * This is called by the efd item code below to release references to
343 * the given efi item. Each efd calls this with the number of
344 * extents that it has logged, and when the sum of these reaches
345 * the total number of extents logged by this efi item we can free
346 * the efi item.
347 *
348 * Freeing the efi item requires that we remove it from the AIL.
349 * We'll use the AIL lock to protect our counters as well as
350 * the removal from the AIL.
351 */
352void
353xfs_efi_release(xfs_efi_log_item_t *efip,
354 uint nextents)
355{
356 xfs_mount_t *mp;
357 int extents_left;
358 uint size;
359 int nexts;
360 SPLDECL(s);
361
362 mp = efip->efi_item.li_mountp;
363 ASSERT(efip->efi_next_extent > 0);
364 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
365
366 AIL_LOCK(mp, s);
367 ASSERT(efip->efi_next_extent >= nextents);
368 efip->efi_next_extent -= nextents;
369 extents_left = efip->efi_next_extent;
370 if (extents_left == 0) {
371 /*
372 * xfs_trans_delete_ail() drops the AIL lock.
373 */
374 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
375 } else {
376 AIL_UNLOCK(mp, s);
377 }
378
379 if (extents_left == 0) {
380 nexts = efip->efi_format.efi_nextents;
381 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
382 size = sizeof(xfs_efi_log_item_t);
383 size += (nexts - 1) * sizeof(xfs_extent_t);
384 kmem_free(efip, size);
385 } else {
386 kmem_zone_free(xfs_efi_zone, efip);
387 }
388 }
389}
390
391/*
392 * This is called when the transaction that should be committing the
393 * EFD corresponding to the given EFI is aborted. The committed and
394 * canceled flags are used to coordinate the freeing of the EFI and
395 * the references by the transaction that committed it.
396 */
397STATIC void
398xfs_efi_cancel(
399 xfs_efi_log_item_t *efip)
400{
401 int nexts;
402 int size;
403 xfs_mount_t *mp;
404 SPLDECL(s);
405
406 mp = efip->efi_item.li_mountp;
407 AIL_LOCK(mp, s);
408 if (efip->efi_flags & XFS_EFI_COMMITTED) {
409 /*
410 * xfs_trans_delete_ail() drops the AIL lock.
411 */
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
413
414 nexts = efip->efi_format.efi_nextents;
415 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
416 size = sizeof(xfs_efi_log_item_t);
417 size += (nexts - 1) * sizeof(xfs_extent_t);
418 kmem_free(efip, size);
419 } else {
420 kmem_zone_free(xfs_efi_zone, efip);
421 }
422 } else {
423 efip->efi_flags |= XFS_EFI_CANCELED;
424 AIL_UNLOCK(mp, s);
425 }
426
427 return;
428}
429
430
431
432
433
434/*
435 * This returns the number of iovecs needed to log the given efd item.
436 * We only need 1 iovec for an efd item. It just logs the efd_log_format
437 * structure.
438 */
439/*ARGSUSED*/
440STATIC uint
441xfs_efd_item_size(xfs_efd_log_item_t *efdp)
442{
443 return 1;
444}
445
446/*
447 * This is called to fill in the vector of log iovecs for the
448 * given efd log item. We use only 1 iovec, and we point that
449 * at the efd_log_format structure embedded in the efd item.
450 * It is at this point that we assert that all of the extent
451 * slots in the efd item have been filled.
452 */
453STATIC void
454xfs_efd_item_format(xfs_efd_log_item_t *efdp,
455 xfs_log_iovec_t *log_vector)
456{
457 uint size;
458
459 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
460
461 efdp->efd_format.efd_type = XFS_LI_EFD;
462
463 size = sizeof(xfs_efd_log_format_t);
464 size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
465 efdp->efd_format.efd_size = 1;
466
467 log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
468 log_vector->i_len = size;
469 ASSERT(size >= sizeof(xfs_efd_log_format_t));
470}
471
472
473/*
474 * Pinning has no meaning for an efd item, so just return.
475 */
476/*ARGSUSED*/
477STATIC void
478xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
479{
480 return;
481}
482
483
484/*
485 * Since pinning has no meaning for an efd item, neither does
486 * unpinning.
487 */
488/*ARGSUSED*/
489STATIC void
490xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale)
491{
492 return;
493}
494
495/*ARGSUSED*/
496STATIC void
497xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
498{
499 return;
500}
501
502/*
503 * Efd items have no locking; return XFS_ITEM_LOCKED so the caller skips them.
504 */
505/*ARGSUSED*/
506STATIC uint
507xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
508{
509 return XFS_ITEM_LOCKED;
510}
511
512/*
513 * Efd items have no locking; on unlock, free the item if the
514 * transaction was aborted.
515 */
516/*ARGSUSED*/
517STATIC void
518xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
519{
520 if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
521 xfs_efd_item_abort(efdp);
522 return;
523}
524
525/*
526 * When the efd item is committed to disk, all we need to do
527 * is delete our reference to our partner efi item and then
528 * free ourselves. Since we're freeing ourselves we must
529 * return -1 to keep the transaction code from further referencing
530 * this item.
531 */
532/*ARGSUSED*/
533STATIC xfs_lsn_t
534xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
535{
536 uint size;
537 int nexts;
538
539 /*
540 * If we got a log I/O error, it's always the case that the LR with the
541 * EFI got unpinned and freed before the EFD got aborted.
542 */
543 if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
544 xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
545
546 nexts = efdp->efd_format.efd_nextents;
547 if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
548 size = sizeof(xfs_efd_log_item_t);
549 size += (nexts - 1) * sizeof(xfs_extent_t);
550 kmem_free(efdp, size);
551 } else {
552 kmem_zone_free(xfs_efd_zone, efdp);
553 }
554
555 return (xfs_lsn_t)-1;
556}
557
558/*
559 * The transaction of which this EFD is a part has been aborted.
560 * Inform its companion EFI of this fact and then clean up after
561 * ourselves. No need to clean up the slot for the item in the
562 * transaction. That was done by the unpin code which is called
563 * prior to this routine in the abort/fs-shutdown path.
564 */
565STATIC void
566xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
567{
568 int nexts;
569 int size;
570
571 /*
572 * If we got a log I/O error, it's always the case that the LR with the
573 * EFI got unpinned and freed before the EFD got aborted. So don't
574 * reference the EFI at all in that case.
575 */
576 if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
577 xfs_efi_cancel(efdp->efd_efip);
578
579 nexts = efdp->efd_format.efd_nextents;
580 if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
581 size = sizeof(xfs_efd_log_item_t);
582 size += (nexts - 1) * sizeof(xfs_extent_t);
583 kmem_free(efdp, size);
584 } else {
585 kmem_zone_free(xfs_efd_zone, efdp);
586 }
587 return;
588}
589
590/*
591 * There isn't much you can do to push on an efd item. It is simply
592 * stuck waiting for the log to be flushed to disk.
593 */
594/*ARGSUSED*/
595STATIC void
596xfs_efd_item_push(xfs_efd_log_item_t *efdp)
597{
598 return;
599}
600
601/*
602 * The EFD dependency tracking op doesn't do squat. It can't because
603 * it doesn't know where the free extent is coming from. The dependency
604 * tracking has to be handled by the "enclosing" metadata object. For
605 * example, for inodes, the inode is locked throughout the extent freeing
606 * so the dependency should be recorded there.
607 */
608/*ARGSUSED*/
609STATIC void
610xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
611{
612 return;
613}
614
615/*
616 * This is the ops vector shared by all efd log items.
617 */
618struct xfs_item_ops xfs_efd_item_ops = {
619 .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
620 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
621 xfs_efd_item_format,
622 .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
623 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin,
624 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
625 xfs_efd_item_unpin_remove,
626 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
627 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
628 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
629 xfs_efd_item_committed,
630 .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push,
631 .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
632 .iop_pushbuf = NULL,
633 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
634 xfs_efd_item_committing
635};
636
637
638/*
639 * Allocate and initialize an efd item with the given number of extents.
640 */
641xfs_efd_log_item_t *
642xfs_efd_init(xfs_mount_t *mp,
643 xfs_efi_log_item_t *efip,
644 uint nextents)
645
646{
647 xfs_efd_log_item_t *efdp;
648 uint size;
649
650 ASSERT(nextents > 0);
651 if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
652 size = (uint)(sizeof(xfs_efd_log_item_t) +
653 ((nextents - 1) * sizeof(xfs_extent_t)));
654 efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
655 } else {
656 efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
657 KM_SLEEP);
658 }
659
660 efdp->efd_item.li_type = XFS_LI_EFD;
661 efdp->efd_item.li_ops = &xfs_efd_item_ops;
662 efdp->efd_item.li_mountp = mp;
663 efdp->efd_efip = efip;
664 efdp->efd_format.efd_nextents = nextents;
665 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
666
667 return (efdp);
668}
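
The two item types pair across transactions: an EFI records the intent to free extents, and an EFD carrying a matching efd_efi_id records completion; log recovery replays any EFI that lacks its EFD. A minimal sketch of the pairing using the constructors above, with the transaction logging and commit plumbing deliberately elided:

	xfs_efi_log_item_t	*efip;
	xfs_efd_log_item_t	*efdp;

	efip = xfs_efi_init(mp, nextents);	/* intent: extents to free */
	/* ... log efip, record the extents, commit transaction 1 ... */

	efdp = xfs_efd_init(mp, efip, nextents);	/* done: carries efi_id */
	/* ... log efdp, free the extents, commit transaction 2;
	 *     committing the EFD releases the EFI via xfs_efi_release(). */
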
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 000000000000..7122d6101d15
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,123 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_EXTFREE_ITEM_H__
33#define __XFS_EXTFREE_ITEM_H__
34
35struct xfs_mount;
36struct kmem_zone;
37
38typedef struct xfs_extent {
39 xfs_dfsbno_t ext_start;
40 xfs_extlen_t ext_len;
41} xfs_extent_t;
42
43/*
44 * This is the structure used to lay out an efi log item in the
45 * log. The efi_extents field is a variable size array whose
46 * size is given by efi_nextents.
47 */
48typedef struct xfs_efi_log_format {
49 unsigned short efi_type; /* efi log item type */
50 unsigned short efi_size; /* size of this item */
51 uint efi_nextents; /* # extents to free */
52 __uint64_t efi_id; /* efi identifier */
53 xfs_extent_t efi_extents[1]; /* array of extents to free */
54} xfs_efi_log_format_t;
55
56/*
57 * This is the structure used to lay out an efd log item in the
58 * log. The efd_extents array is a variable size array whose
59 * size is given by efd_nextents.
60 */
61typedef struct xfs_efd_log_format {
62 unsigned short efd_type; /* efd log item type */
63 unsigned short efd_size; /* size of this item */
64 uint efd_nextents; /* # of extents freed */
65 __uint64_t efd_efi_id; /* id of corresponding efi */
66 xfs_extent_t efd_extents[1]; /* array of extents freed */
67} xfs_efd_log_format_t;
68
69
70#ifdef __KERNEL__
71
72/*
73 * Max number of extents in fast allocation path.
74 */
75#define XFS_EFI_MAX_FAST_EXTENTS 16
76
77/*
78 * Define EFI flags.
79 */
80#define XFS_EFI_RECOVERED 0x1
81#define XFS_EFI_COMMITTED 0x2
82#define XFS_EFI_CANCELED 0x4
83
84/*
85 * This is the "extent free intention" log item. It is used
86 * to log the fact that some extents need to be freed. It is
87 * used in conjunction with the "extent free done" log item
88 * described below.
89 */
90typedef struct xfs_efi_log_item {
91 xfs_log_item_t efi_item;
92 uint efi_flags; /* misc flags */
93 uint efi_next_extent;
94 xfs_efi_log_format_t efi_format;
95} xfs_efi_log_item_t;
96
97/*
98 * This is the "extent free done" log item. It is used to log
99 * the fact that some extents earlier mentioned in an efi item
100 * have been freed.
101 */
102typedef struct xfs_efd_log_item {
103 xfs_log_item_t efd_item;
104 xfs_efi_log_item_t *efd_efip;
105 uint efd_next_extent;
106 xfs_efd_log_format_t efd_format;
107} xfs_efd_log_item_t;
108
109/*
110 * Max number of extents in fast allocation path.
111 */
112#define XFS_EFD_MAX_FAST_EXTENTS 16
113
114extern struct kmem_zone *xfs_efi_zone;
115extern struct kmem_zone *xfs_efd_zone;
116
117xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
118xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
119 uint);
120
121#endif /* __KERNEL__ */
122
123#endif /* __XFS_EXTFREE_ITEM_H__ */
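
Both log formats use the pre-C99 variable-length-array idiom: the struct declares a one-element extent array and the allocator adds room for the remaining nextents - 1 entries, which is exactly the sizing formula repeated throughout xfs_extfree_item.c. A self-contained demonstration of the arithmetic, with invented demo_* names (nextents is assumed to be at least 1, as the ASSERTs in the source enforce):

	#include <stdlib.h>

	struct demo_efi {
		unsigned int	nextents;
		struct { unsigned long long start; unsigned int len; } ext[1];
	};

	static struct demo_efi *demo_efi_alloc(unsigned int nextents)
	{
		/* ext[0] is already counted inside sizeof(struct demo_efi). */
		size_t size = sizeof(struct demo_efi) +
			      (nextents - 1) * sizeof(((struct demo_efi *)0)->ext[0]);

		return calloc(1, size);
	}
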
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
new file mode 100644
index 000000000000..6ee8443bf9d3
--- /dev/null
+++ b/fs/xfs/xfs_fs.h
@@ -0,0 +1,527 @@
1/*
2 * Copyright (c) 1995-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2.1 of the GNU Lesser General Public License
6 * as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this program; if not, write the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
22 * USA.
23 *
24 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
25 * Mountain View, CA 94043, or:
26 *
27 * http://www.sgi.com
28 *
29 * For further information regarding this notice, see:
30 *
31 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
32 */
33#ifndef __XFS_FS_H__
34#define __XFS_FS_H__
35
36/*
37 * SGI's XFS filesystem: user-visible constants and structures.
38 */
39
40#define XFS_NAME "xfs"
41
42/*
43 * Direct I/O attribute record, used with XFS_IOC_DIOINFO.
44 * d_miniosz is the minimum transfer size, the transfer-size multiple,
45 * and the file seek offset alignment.
46 */
47#ifndef HAVE_DIOATTR
48struct dioattr {
49 __u32 d_mem; /* data buffer memory alignment */
50 __u32 d_miniosz; /* min xfer size */
51 __u32 d_maxiosz; /* max xfer size */
52};
53#endif
54
55/*
56 * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
57 */
58#ifndef HAVE_FSXATTR
59struct fsxattr {
60 __u32 fsx_xflags; /* xflags field value (get/set) */
61 __u32 fsx_extsize; /* extsize field value (get/set)*/
62 __u32 fsx_nextents; /* nextents field value (get) */
63 unsigned char fsx_pad[16];
64};
65#endif
66
67/*
68 * Flags for the bs_xflags/fsx_xflags field
69 * There should be a one-to-one correspondence between these flags and the
70 * XFS_DIFLAG_s.
71 */
72#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */
73#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */
74#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */
75#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */
76#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */
77#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */
78#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */
79#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */
80#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */
81#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */
82#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */
83
84/*
85 * Structure for XFS_IOC_GETBMAP.
86 * On input, fill in bmv_offset and bmv_length of the first structure
87 * to indicate the area of interest in the file, and bmv_count with the
88 * number of array elements given. The first structure is updated on
89 * return to give the offset and length for the next call.
90 */
91#ifndef HAVE_GETBMAP
92struct getbmap {
93 __s64 bmv_offset; /* file offset of segment in blocks */
94 __s64 bmv_block; /* starting block (64-bit daddr_t) */
95 __s64 bmv_length; /* length of segment, blocks */
96 __s32 bmv_count; /* # of entries in array incl. 1st */
97 __s32 bmv_entries; /* # of entries filled in (output) */
98};
99#endif
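
In use, the caller primes the first (header) element and passes the whole array; on return, bmv_entries says how many segments follow the header, and the header has been advanced for the next call. A hedged user-space sketch with error handling trimmed; it assumes the XFS ioctl definitions are in scope, and relies on the convention that a bmv_length of -1 means "to end of file":

	#include <string.h>
	#include <sys/ioctl.h>

	static int demo_getbmap(int fd)
	{
		struct getbmap	bmv[17];	/* header + up to 16 segments */

		memset(bmv, 0, sizeof(bmv));
		bmv[0].bmv_length = -1;		/* from offset 0 to end of file */
		bmv[0].bmv_count = 17;		/* array size, header included */

		if (ioctl(fd, XFS_IOC_GETBMAP, bmv) < 0)
			return -1;

		return bmv[0].bmv_entries;	/* segments are in bmv[1..] */
	}
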
100
101/*
102 * Structure for XFS_IOC_GETBMAPX. Fields bmv_offset through bmv_entries
103 * are used exactly as in the getbmap structure. The getbmapx structure
104 * has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
105 * is only used for the first structure. It contains input flags
106 * specifying XFS_IOC_GETBMAPX actions. The bmv_oflags field is filled
107 * in by the XFS_IOC_GETBMAPX command for each returned structure after
108 * the first.
109 */
110#ifndef HAVE_GETBMAPX
111struct getbmapx {
112 __s64 bmv_offset; /* file offset of segment in blocks */
113 __s64 bmv_block; /* starting block (64-bit daddr_t) */
114 __s64 bmv_length; /* length of segment, blocks */
115 __s32 bmv_count; /* # of entries in array incl. 1st */
116 __s32 bmv_entries; /* # of entries filled in (output). */
117 __s32 bmv_iflags; /* input flags (1st structure) */
118 __s32 bmv_oflags; /* output flags (after 1st structure)*/
119 __s32 bmv_unused1; /* future use */
120 __s32 bmv_unused2; /* future use */
121};
122#endif
123
124/* bmv_iflags values - set by XFS_IOC_GETBMAPX caller. */
125#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
126#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
127#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
128#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
129#ifdef __KERNEL__
130#define BMV_IF_EXTENDED 0x40000000	/* getbmapx if set */
131#endif
132
133/*	bmv_oflags values - returned for each non-header segment */
134#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
135
136/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */
137#define GETBMAP_CONVERT(p1,p2) { \
138 p2.bmv_offset = p1.bmv_offset; \
139 p2.bmv_block = p1.bmv_block; \
140 p2.bmv_length = p1.bmv_length; \
141 p2.bmv_count = p1.bmv_count; \
142 p2.bmv_entries = p1.bmv_entries; }
143
144
145/*
146 * Structure for XFS_IOC_FSSETDM.
147 * For use by backup and restore programs to set the XFS on-disk inode
148 * fields di_dmevmask and di_dmstate. These must be set only to values
149 * previously obtained via xfs_bulkstat! (Specifically the
150 * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
151 */
152#ifndef HAVE_FSDMIDATA
153struct fsdmidata {
154 __u32 fsd_dmevmask; /* corresponds to di_dmevmask */
155 __u16 fsd_padding;
156 __u16 fsd_dmstate; /* corresponds to di_dmstate */
157};
158#endif
159
160/*
161 * File segment locking set data type for 64 bit access.
162 * Also used for all the RESV/FREE interfaces.
163 */
164typedef struct xfs_flock64 {
165 __s16 l_type;
166 __s16 l_whence;
167 __s64 l_start;
168 __s64 l_len; /* len == 0 means until end of file */
169 __s32 l_sysid;
170 __u32 l_pid;
171 __s32 l_pad[4]; /* reserve area */
172} xfs_flock64_t;
173
174/*
175 * Output for XFS_IOC_FSGEOMETRY_V1
176 */
177typedef struct xfs_fsop_geom_v1 {
178 __u32 blocksize; /* filesystem (data) block size */
179 __u32 rtextsize; /* realtime extent size */
180 __u32 agblocks; /* fsblocks in an AG */
181 __u32 agcount; /* number of allocation groups */
182 __u32 logblocks; /* fsblocks in the log */
183 __u32 sectsize; /* (data) sector size, bytes */
184 __u32 inodesize; /* inode size in bytes */
185 __u32 imaxpct; /* max allowed inode space(%) */
186 __u64 datablocks; /* fsblocks in data subvolume */
187 __u64 rtblocks; /* fsblocks in realtime subvol */
188 __u64 rtextents; /* rt extents in realtime subvol*/
189 __u64 logstart; /* starting fsblock of the log */
190 unsigned char uuid[16]; /* unique id of the filesystem */
191 __u32 sunit; /* stripe unit, fsblocks */
192 __u32 swidth; /* stripe width, fsblocks */
193 __s32 version; /* structure version */
194 __u32 flags; /* superblock version flags */
195 __u32 logsectsize; /* log sector size, bytes */
196 __u32 rtsectsize; /* realtime sector size, bytes */
197 __u32 dirblocksize; /* directory block size, bytes */
198} xfs_fsop_geom_v1_t;
199
200/*
201 * Output for XFS_IOC_FSGEOMETRY
202 */
203typedef struct xfs_fsop_geom {
204 __u32 blocksize; /* filesystem (data) block size */
205 __u32 rtextsize; /* realtime extent size */
206 __u32 agblocks; /* fsblocks in an AG */
207 __u32 agcount; /* number of allocation groups */
208 __u32 logblocks; /* fsblocks in the log */
209 __u32 sectsize; /* (data) sector size, bytes */
210 __u32 inodesize; /* inode size in bytes */
211 __u32 imaxpct; /* max allowed inode space(%) */
212 __u64 datablocks; /* fsblocks in data subvolume */
213 __u64 rtblocks; /* fsblocks in realtime subvol */
214 __u64 rtextents; /* rt extents in realtime subvol*/
215 __u64 logstart; /* starting fsblock of the log */
216 unsigned char uuid[16]; /* unique id of the filesystem */
217 __u32 sunit; /* stripe unit, fsblocks */
218 __u32 swidth; /* stripe width, fsblocks */
219 __s32 version; /* structure version */
220 __u32 flags; /* superblock version flags */
221 __u32 logsectsize; /* log sector size, bytes */
222 __u32 rtsectsize; /* realtime sector size, bytes */
223 __u32 dirblocksize; /* directory block size, bytes */
224 __u32 logsunit; /* log stripe unit, bytes */
225} xfs_fsop_geom_t;
226
227/* Output for XFS_FS_COUNTS */
228typedef struct xfs_fsop_counts {
229 __u64 freedata; /* free data section blocks */
230 __u64 freertx; /* free rt extents */
231 __u64 freeino; /* free inodes */
232 __u64 allocino; /* total allocated inodes */
233} xfs_fsop_counts_t;
234
235/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
236typedef struct xfs_fsop_resblks {
237 __u64 resblks;
238 __u64 resblks_avail;
239} xfs_fsop_resblks_t;
240
241#define XFS_FSOP_GEOM_VERSION 0
242
243#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */
244#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */
245#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */
246#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */
247#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */
248#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */
249#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */
250#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */
251#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
252#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
253
254
255/*
256 * Minimum and maximum sizes needed for growth checks
257 */
258#define XFS_MIN_AG_BLOCKS 64
259#define XFS_MIN_LOG_BLOCKS 512
260#define XFS_MAX_LOG_BLOCKS (64 * 1024)
261#define XFS_MIN_LOG_BYTES (256 * 1024)
262#define XFS_MAX_LOG_BYTES (128 * 1024 * 1024)
263
264/*
265 * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
266 */
267typedef struct xfs_growfs_data {
268 __u64 newblocks; /* new data subvol size, fsblocks */
269 __u32 imaxpct; /* new inode space percentage limit */
270} xfs_growfs_data_t;
271
272typedef struct xfs_growfs_log {
273 __u32 newblocks; /* new log size, fsblocks */
274 __u32 isint; /* 1 if new log is internal */
275} xfs_growfs_log_t;
276
277typedef struct xfs_growfs_rt {
278 __u64 newblocks; /* new realtime size, fsblocks */
279 __u32 extsize; /* new realtime extent size, fsblocks */
280} xfs_growfs_rt_t;
281
282
283/*
284 * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
285 */
286typedef struct xfs_bstime {
287 time_t tv_sec; /* seconds */
288 __s32 tv_nsec; /* and nanoseconds */
289} xfs_bstime_t;
290
291typedef struct xfs_bstat {
292 __u64 bs_ino; /* inode number */
293 __u16 bs_mode; /* type and mode */
294 __u16 bs_nlink; /* number of links */
295 __u32 bs_uid; /* user id */
296 __u32 bs_gid; /* group id */
297 __u32 bs_rdev; /* device value */
298 __s32 bs_blksize; /* block size */
299 __s64 bs_size; /* file size */
300 xfs_bstime_t bs_atime; /* access time */
301 xfs_bstime_t bs_mtime; /* modify time */
302 xfs_bstime_t bs_ctime; /* inode change time */
303 int64_t bs_blocks; /* number of blocks */
304 __u32 bs_xflags; /* extended flags */
305 __s32 bs_extsize; /* extent size */
306 __s32 bs_extents; /* number of extents */
307 __u32 bs_gen; /* generation count */
308 __u16 bs_projid; /* project id */
309 unsigned char bs_pad[14]; /* pad space, unused */
310 __u32 bs_dmevmask; /* DMIG event mask */
311 __u16 bs_dmstate; /* DMIG state info */
312 __u16 bs_aextents; /* attribute number of extents */
313} xfs_bstat_t;
314
315/*
316 * The user-level BulkStat Request interface structure.
317 */
318typedef struct xfs_fsop_bulkreq {
319 __u64 __user *lastip; /* last inode # pointer */
320 __s32 icount; /* count of entries in buffer */
321 void __user *ubuffer;/* user buffer for inode desc. */
322 __s32 __user *ocount; /* output count pointer */
323} xfs_fsop_bulkreq_t;
324
325
326/*
327 * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
328 */
329typedef struct xfs_inogrp {
330 __u64 xi_startino; /* starting inode number */
331 __s32 xi_alloccount; /* # bits set in allocmask */
332 __u64 xi_allocmask; /* mask of allocated inodes */
333} xfs_inogrp_t;
334
335
336/*
337 * Error injection.
338 */
339typedef struct xfs_error_injection {
340 __s32 fd;
341 __s32 errtag;
342} xfs_error_injection_t;
343
344
345/*
346 * The user-level Handle Request interface structure.
347 */
348typedef struct xfs_fsop_handlereq {
349 __u32 fd; /* fd for FD_TO_HANDLE */
350 void __user *path; /* user pathname */
351 __u32 oflags; /* open flags */
352 void __user *ihandle;/* user supplied handle */
353 __u32 ihandlen; /* user supplied length */
354 void __user *ohandle;/* user buffer for handle */
355 __u32 __user *ohandlen;/* user buffer length */
356} xfs_fsop_handlereq_t;
357
358/*
359 * Compound structures for passing args through Handle Request interfaces
360 * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
361 * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
362 * XFS_IOC_ATTRMULTI_BY_HANDLE
363 */
364
365typedef struct xfs_fsop_setdm_handlereq {
366 struct xfs_fsop_handlereq hreq; /* handle information */
367 struct fsdmidata __user *data; /* DMAPI data */
368} xfs_fsop_setdm_handlereq_t;
369
370typedef struct xfs_attrlist_cursor {
371 __u32 opaque[4];
372} xfs_attrlist_cursor_t;
373
374typedef struct xfs_fsop_attrlist_handlereq {
375 struct xfs_fsop_handlereq hreq; /* handle interface structure */
376 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
377 __u32 flags; /* which namespace to use */
378 __u32 buflen; /* length of buffer supplied */
379 void __user *buffer; /* returned names */
380} xfs_fsop_attrlist_handlereq_t;
381
382typedef struct xfs_attr_multiop {
383 __u32 am_opcode;
384 __s32 am_error;
385 void __user *am_attrname;
386 void __user *am_attrvalue;
387 __u32 am_length;
388 __u32 am_flags;
389} xfs_attr_multiop_t;
390
391typedef struct xfs_fsop_attrmulti_handlereq {
392 struct xfs_fsop_handlereq hreq; /* handle interface structure */
393 __u32 opcount;/* count of following multiop */
394 struct xfs_attr_multiop __user *ops; /* attr_multi data */
395} xfs_fsop_attrmulti_handlereq_t;
396
397/*
398 * per machine unique filesystem identifier types.
399 */
400typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */
401
402
403#ifndef HAVE_FID
404#define MAXFIDSZ 46
405
406typedef struct fid {
407 __u16 fid_len; /* length of data in bytes */
408 unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */
409} fid_t;
410#endif
411
412typedef struct xfs_fid {
413 __u16 xfs_fid_len; /* length of remainder */
414 __u16 xfs_fid_pad;
415 __u32 xfs_fid_gen; /* generation number */
416 __u64 xfs_fid_ino; /* 64 bits inode number */
417} xfs_fid_t;
418
419typedef struct xfs_fid2 {
420 __u16 fid_len; /* length of remainder */
421 __u16 fid_pad; /* padding, must be zero */
422 __u32 fid_gen; /* generation number */
423 __u64 fid_ino; /* inode number */
424} xfs_fid2_t;
425
426typedef struct xfs_handle {
427 union {
428 __s64 align; /* force alignment of ha_fid */
429 xfs_fsid_t _ha_fsid; /* unique file system identifier */
430 } ha_u;
431 xfs_fid_t ha_fid; /* file system specific file ID */
432} xfs_handle_t;
433#define ha_fsid ha_u._ha_fsid
434
435#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \
436 - (char *) &(handle)) \
437 + (handle).ha_fid.xfs_fid_len)
438
439#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t))
440
441#define FSHSIZE sizeof(fsid_t)
442
443/*
444 * Flags for going down operation
445 */
446#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
447#define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
448#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
449
450/*
451 * ioctl commands that are used by Linux filesystems
452 */
453#define XFS_IOC_GETXFLAGS _IOR('f', 1, long)
454#define XFS_IOC_SETXFLAGS _IOW('f', 2, long)
455#define XFS_IOC_GETVERSION _IOR('v', 1, long)
456
457/*
458 * ioctl commands that replace IRIX fcntl()'s
459 * For 'documentation' purposes more than anything else,
460 * the "cmd #" field reflects the IRIX fcntl number.
461 */
462#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64)
463#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64)
464#define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr)
465#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr)
466#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr)
467#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64)
468#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64)
469#define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap)
470#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata)
471#define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64)
472#define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64)
473#define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64)
474#define XFS_IOC_UNRESVSP64 _IOW ('X', 43, struct xfs_flock64)
475#define XFS_IOC_GETBMAPA _IOWR('X', 44, struct getbmap)
476#define XFS_IOC_FSGETXATTRA _IOR ('X', 45, struct fsxattr)
477/* XFS_IOC_SETBIOSIZE ---- deprecated 46 */
478/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
479#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
480
481/*
482 * ioctl commands that replace IRIX syssgi()'s
483 */
484#define XFS_IOC_FSGEOMETRY_V1 _IOR ('X', 100, struct xfs_fsop_geom_v1)
485#define XFS_IOC_FSBULKSTAT _IOWR('X', 101, struct xfs_fsop_bulkreq)
486#define XFS_IOC_FSBULKSTAT_SINGLE _IOWR('X', 102, struct xfs_fsop_bulkreq)
487#define XFS_IOC_FSINUMBERS _IOWR('X', 103, struct xfs_fsop_bulkreq)
488#define XFS_IOC_PATH_TO_FSHANDLE _IOWR('X', 104, struct xfs_fsop_handlereq)
489#define XFS_IOC_PATH_TO_HANDLE _IOWR('X', 105, struct xfs_fsop_handlereq)
490#define XFS_IOC_FD_TO_HANDLE _IOWR('X', 106, struct xfs_fsop_handlereq)
491#define XFS_IOC_OPEN_BY_HANDLE _IOWR('X', 107, struct xfs_fsop_handlereq)
492#define XFS_IOC_READLINK_BY_HANDLE _IOWR('X', 108, struct xfs_fsop_handlereq)
493#define XFS_IOC_SWAPEXT _IOWR('X', 109, struct xfs_swapext)
494#define XFS_IOC_FSGROWFSDATA _IOW ('X', 110, struct xfs_growfs_data)
495#define XFS_IOC_FSGROWFSLOG _IOW ('X', 111, struct xfs_growfs_log)
496#define XFS_IOC_FSGROWFSRT _IOW ('X', 112, struct xfs_growfs_rt)
497#define XFS_IOC_FSCOUNTS _IOR ('X', 113, struct xfs_fsop_counts)
498#define XFS_IOC_SET_RESBLKS _IOWR('X', 114, struct xfs_fsop_resblks)
499#define XFS_IOC_GET_RESBLKS _IOR ('X', 115, struct xfs_fsop_resblks)
500#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
501#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
502/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
503#define XFS_IOC_FREEZE _IOWR('X', 119, int)
504#define XFS_IOC_THAW _IOWR('X', 120, int)
505#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
506#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
507#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
508#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
509#define XFS_IOC_GOINGDOWN _IOR ('X', 125, __uint32_t)
510/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
511
512
513#ifndef HAVE_BBMACROS
514/*
515 * Block I/O parameterization. A basic block (BB) is the lowest size of
516 * filesystem allocation, and must equal 512. Length units given to bio
517 * routines are in BB's.
518 */
519#define BBSHIFT 9
520#define BBSIZE (1<<BBSHIFT)
521#define BBMASK (BBSIZE-1)
522#define BTOBB(bytes) (((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
523#define BTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
524#define BBTOB(bbs) ((bbs) << BBSHIFT)
525#endif
526
527#endif /* __XFS_FS_H__ */
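A quick sanity sketch of the BB macros above, adapted for userspace
(illustrative only, not part of the kernel sources; the __u64 cast is
replaced with unsigned long long): BTOBB() rounds byte counts up to
whole basic blocks, BTOBBT() truncates, and BBTOB() converts basic
blocks back to bytes.

	#include <assert.h>

	#define BBSHIFT	9
	#define BBSIZE	(1<<BBSHIFT)
	#define BTOBB(bytes)	(((unsigned long long)(bytes) + BBSIZE - 1) >> BBSHIFT)
	#define BTOBBT(bytes)	((unsigned long long)(bytes) >> BBSHIFT)
	#define BBTOB(bbs)	((bbs) << BBSHIFT)

	int main(void)
	{
		assert(BTOBB(1) == 1);		/* partial block rounds up */
		assert(BTOBB(512) == 1);
		assert(BTOBB(513) == 2);
		assert(BTOBBT(513) == 1);	/* truncating variant rounds down */
		assert(BBTOB(2) == 1024);
		return 0;
	}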
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000000..21213057c27f
--- /dev/null
+++ b/fs/xfs/xfs_fsops.c
@@ -0,0 +1,616 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_ag.h"
44#include "xfs_alloc_btree.h"
45#include "xfs_bmap_btree.h"
46#include "xfs_ialloc_btree.h"
47#include "xfs_btree.h"
48#include "xfs_error.h"
49#include "xfs_alloc.h"
50#include "xfs_ialloc.h"
51#include "xfs_fsops.h"
52#include "xfs_itable.h"
53#include "xfs_rw.h"
54#include "xfs_refcache.h"
55#include "xfs_trans_space.h"
56#include "xfs_rtalloc.h"
57#include "xfs_dir2.h"
58#include "xfs_attr_sf.h"
59#include "xfs_dir_sf.h"
60#include "xfs_dir2_sf.h"
61#include "xfs_dinode.h"
62#include "xfs_inode.h"
63#include "xfs_inode_item.h"
64
65/*
66 * File system operations
67 */
68
69int
70xfs_fs_geometry(
71 xfs_mount_t *mp,
72 xfs_fsop_geom_t *geo,
73 int new_version)
74{
75 geo->blocksize = mp->m_sb.sb_blocksize;
76 geo->rtextsize = mp->m_sb.sb_rextsize;
77 geo->agblocks = mp->m_sb.sb_agblocks;
78 geo->agcount = mp->m_sb.sb_agcount;
79 geo->logblocks = mp->m_sb.sb_logblocks;
80 geo->sectsize = mp->m_sb.sb_sectsize;
81 geo->inodesize = mp->m_sb.sb_inodesize;
82 geo->imaxpct = mp->m_sb.sb_imax_pct;
83 geo->datablocks = mp->m_sb.sb_dblocks;
84 geo->rtblocks = mp->m_sb.sb_rblocks;
85 geo->rtextents = mp->m_sb.sb_rextents;
86 geo->logstart = mp->m_sb.sb_logstart;
87 ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
88 memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
89 if (new_version >= 2) {
90 geo->sunit = mp->m_sb.sb_unit;
91 geo->swidth = mp->m_sb.sb_width;
92 }
93 if (new_version >= 3) {
94 geo->version = XFS_FSOP_GEOM_VERSION;
95 geo->flags =
96 (XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
97 XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
98 (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
99 XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
100 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
101 XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
102 (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
103 XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
104 (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
105 XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
106 (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
107 XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
108 (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
109 XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
110 (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
111 XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
112 (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
113 XFS_FSOP_GEOM_FLAGS_SECTOR : 0);
114 geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ?
115 mp->m_sb.sb_logsectsize : BBSIZE;
116 geo->rtsectsize = mp->m_sb.sb_blocksize;
117 geo->dirblocksize = mp->m_dirblksize;
118 }
119 if (new_version >= 4) {
120 geo->flags |=
121 (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
122 XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
123 geo->logsunit = mp->m_sb.sb_logsunit;
124 }
125 return 0;
126}
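/*
 * Note (editorial): new_version gates how much of *geo is filled in,
 * so a caller built against an older xfs_fsop_geom layout only has
 * the fields it knows about written; anything gated on a higher
 * version is left untouched.
 */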
127
128static int
129xfs_growfs_data_private(
130 xfs_mount_t *mp, /* mount point for filesystem */
131 xfs_growfs_data_t *in) /* growfs data input struct */
132{
133 xfs_agf_t *agf;
134 xfs_agi_t *agi;
135 xfs_agnumber_t agno;
136 xfs_extlen_t agsize;
137 xfs_extlen_t tmpsize;
138 xfs_alloc_rec_t *arec;
139 xfs_btree_sblock_t *block;
140 xfs_buf_t *bp;
141 int bucket;
142 int dpct;
143 int error;
144 xfs_agnumber_t nagcount;
145 xfs_agnumber_t nagimax = 0;
146 xfs_rfsblock_t nb, nb_mod;
147 xfs_rfsblock_t new;
148 xfs_rfsblock_t nfree;
149 xfs_agnumber_t oagcount;
150 int pct;
151 xfs_sb_t *sbp;
152 xfs_trans_t *tp;
153
154 nb = in->newblocks;
155 pct = in->imaxpct;
156 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
157 return XFS_ERROR(EINVAL);
158 dpct = pct - mp->m_sb.sb_imax_pct;
159 error = xfs_read_buf(mp, mp->m_ddev_targp,
160 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
161 XFS_FSS_TO_BB(mp, 1), 0, &bp);
162 if (error)
163 return error;
164 ASSERT(bp);
165 xfs_buf_relse(bp);
166
167 new = nb; /* use new as a temporary here */
168 nb_mod = do_div(new, mp->m_sb.sb_agblocks);
169 nagcount = new + (nb_mod != 0);
170 if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
171 nagcount--;
172 nb = nagcount * mp->m_sb.sb_agblocks;
173 if (nb < mp->m_sb.sb_dblocks)
174 return XFS_ERROR(EINVAL);
175 }
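	/*
	 * Worked example (illustrative numbers): with sb_agblocks == 1000
	 * and nb == 2030, do_div() leaves new == 2 and nb_mod == 30, so
	 * nagcount == 3; a 30-block runt AG is below XFS_MIN_AG_BLOCKS,
	 * so it is trimmed back to nagcount == 2, nb == 2000.
	 */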
176 new = nb - mp->m_sb.sb_dblocks;
177 oagcount = mp->m_sb.sb_agcount;
178 if (nagcount > oagcount) {
179 down_write(&mp->m_peraglock);
180 mp->m_perag = kmem_realloc(mp->m_perag,
181 sizeof(xfs_perag_t) * nagcount,
182 sizeof(xfs_perag_t) * oagcount,
183 KM_SLEEP);
184 memset(&mp->m_perag[oagcount], 0,
185 (nagcount - oagcount) * sizeof(xfs_perag_t));
186 mp->m_flags |= XFS_MOUNT_32BITINODES;
187 nagimax = xfs_initialize_perag(mp, nagcount);
188 up_write(&mp->m_peraglock);
189 }
190 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
191 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
192 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
193 xfs_trans_cancel(tp, 0);
194 return error;
195 }
196
197 nfree = 0;
198 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
199 /*
200 * AG freelist header block
201 */
202 bp = xfs_buf_get(mp->m_ddev_targp,
203 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
204 XFS_FSS_TO_BB(mp, 1), 0);
205 agf = XFS_BUF_TO_AGF(bp);
206 memset(agf, 0, mp->m_sb.sb_sectsize);
207 INT_SET(agf->agf_magicnum, ARCH_CONVERT, XFS_AGF_MAGIC);
208 INT_SET(agf->agf_versionnum, ARCH_CONVERT, XFS_AGF_VERSION);
209 INT_SET(agf->agf_seqno, ARCH_CONVERT, agno);
210 if (agno == nagcount - 1)
211 agsize =
212 nb -
213 (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
214 else
215 agsize = mp->m_sb.sb_agblocks;
216 INT_SET(agf->agf_length, ARCH_CONVERT, agsize);
217 INT_SET(agf->agf_roots[XFS_BTNUM_BNOi], ARCH_CONVERT,
218 XFS_BNO_BLOCK(mp));
219 INT_SET(agf->agf_roots[XFS_BTNUM_CNTi], ARCH_CONVERT,
220 XFS_CNT_BLOCK(mp));
221 INT_SET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT, 1);
222 INT_SET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT, 1);
223 agf->agf_flfirst = 0;
224 INT_SET(agf->agf_fllast, ARCH_CONVERT, XFS_AGFL_SIZE(mp) - 1);
225 agf->agf_flcount = 0;
226 tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
227 INT_SET(agf->agf_freeblks, ARCH_CONVERT, tmpsize);
228 INT_SET(agf->agf_longest, ARCH_CONVERT, tmpsize);
229 error = xfs_bwrite(mp, bp);
230 if (error) {
231 goto error0;
232 }
233 /*
234 * AG inode header block
235 */
236 bp = xfs_buf_get(mp->m_ddev_targp,
237 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
238 XFS_FSS_TO_BB(mp, 1), 0);
239 agi = XFS_BUF_TO_AGI(bp);
240 memset(agi, 0, mp->m_sb.sb_sectsize);
241 INT_SET(agi->agi_magicnum, ARCH_CONVERT, XFS_AGI_MAGIC);
242 INT_SET(agi->agi_versionnum, ARCH_CONVERT, XFS_AGI_VERSION);
243 INT_SET(agi->agi_seqno, ARCH_CONVERT, agno);
244 INT_SET(agi->agi_length, ARCH_CONVERT, agsize);
245 agi->agi_count = 0;
246 INT_SET(agi->agi_root, ARCH_CONVERT, XFS_IBT_BLOCK(mp));
247 INT_SET(agi->agi_level, ARCH_CONVERT, 1);
248 agi->agi_freecount = 0;
249 INT_SET(agi->agi_newino, ARCH_CONVERT, NULLAGINO);
250 INT_SET(agi->agi_dirino, ARCH_CONVERT, NULLAGINO);
251 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
252 INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT,
253 NULLAGINO);
254 error = xfs_bwrite(mp, bp);
255 if (error) {
256 goto error0;
257 }
258 /*
259 * BNO btree root block
260 */
261 bp = xfs_buf_get(mp->m_ddev_targp,
262 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
263 BTOBB(mp->m_sb.sb_blocksize), 0);
264 block = XFS_BUF_TO_SBLOCK(bp);
265 memset(block, 0, mp->m_sb.sb_blocksize);
266 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTB_MAGIC);
267 block->bb_level = 0;
268 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
269 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
270 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
271 arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
272 block, 1, mp->m_alloc_mxr[0]);
273 INT_SET(arec->ar_startblock, ARCH_CONVERT,
274 XFS_PREALLOC_BLOCKS(mp));
275 INT_SET(arec->ar_blockcount, ARCH_CONVERT,
276 agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
277 error = xfs_bwrite(mp, bp);
278 if (error) {
279 goto error0;
280 }
281 /*
282 * CNT btree root block
283 */
284 bp = xfs_buf_get(mp->m_ddev_targp,
285 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
286 BTOBB(mp->m_sb.sb_blocksize), 0);
287 block = XFS_BUF_TO_SBLOCK(bp);
288 memset(block, 0, mp->m_sb.sb_blocksize);
289 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTC_MAGIC);
290 block->bb_level = 0;
291 INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
292 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
293 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
294 arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc,
295 block, 1, mp->m_alloc_mxr[0]);
296 INT_SET(arec->ar_startblock, ARCH_CONVERT,
297 XFS_PREALLOC_BLOCKS(mp));
298 INT_SET(arec->ar_blockcount, ARCH_CONVERT,
299 agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
300 nfree += INT_GET(arec->ar_blockcount, ARCH_CONVERT);
301 error = xfs_bwrite(mp, bp);
302 if (error) {
303 goto error0;
304 }
305 /*
306 * INO btree root block
307 */
308 bp = xfs_buf_get(mp->m_ddev_targp,
309 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
310 BTOBB(mp->m_sb.sb_blocksize), 0);
311 block = XFS_BUF_TO_SBLOCK(bp);
312 memset(block, 0, mp->m_sb.sb_blocksize);
313 INT_SET(block->bb_magic, ARCH_CONVERT, XFS_IBT_MAGIC);
314 block->bb_level = 0;
315 block->bb_numrecs = 0;
316 INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
317 INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
318 error = xfs_bwrite(mp, bp);
319 if (error) {
320 goto error0;
321 }
322 }
323 xfs_trans_agblocks_delta(tp, nfree);
324 /*
325 * There are new blocks in the old last a.g.
326 */
327 if (new) {
328 /*
329 * Change the agi length.
330 */
331 error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
332 if (error) {
333 goto error0;
334 }
335 ASSERT(bp);
336 agi = XFS_BUF_TO_AGI(bp);
337 INT_MOD(agi->agi_length, ARCH_CONVERT, new);
338 ASSERT(nagcount == oagcount ||
339 INT_GET(agi->agi_length, ARCH_CONVERT) ==
340 mp->m_sb.sb_agblocks);
341 xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
342 /*
343 * Change agf length.
344 */
345 error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
346 if (error) {
347 goto error0;
348 }
349 ASSERT(bp);
350 agf = XFS_BUF_TO_AGF(bp);
351 INT_MOD(agf->agf_length, ARCH_CONVERT, new);
352 ASSERT(INT_GET(agf->agf_length, ARCH_CONVERT) ==
353 INT_GET(agi->agi_length, ARCH_CONVERT));
354 /*
355 * Free the new space.
356 */
357 error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
358 INT_GET(agf->agf_length, ARCH_CONVERT) - new), new);
359 if (error) {
360 goto error0;
361 }
362 }
363 if (nagcount > oagcount)
364 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
365 if (nb > mp->m_sb.sb_dblocks)
366 xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
367 nb - mp->m_sb.sb_dblocks);
368 if (nfree)
369 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
370 if (dpct)
371 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
372 error = xfs_trans_commit(tp, 0, NULL);
373 if (error) {
374 return error;
375 }
376 /* New allocation groups fully initialized, so update mount struct */
377 if (nagimax)
378 mp->m_maxagi = nagimax;
379 if (mp->m_sb.sb_imax_pct) {
380 __uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
381 do_div(icount, 100);
382 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
383 } else
384 mp->m_maxicount = 0;
385 for (agno = 1; agno < nagcount; agno++) {
386 error = xfs_read_buf(mp, mp->m_ddev_targp,
387 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
388 XFS_FSS_TO_BB(mp, 1), 0, &bp);
389 if (error) {
390 xfs_fs_cmn_err(CE_WARN, mp,
391 "error %d reading secondary superblock for ag %d",
392 error, agno);
393 break;
394 }
395 sbp = XFS_BUF_TO_SBP(bp);
396 xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS);
397 /*
398 * If we get an error writing out the alternate superblocks,
399 * just issue a warning and continue. The real work is
400 * already done and committed.
401 */
402 if (!(error = xfs_bwrite(mp, bp))) {
403 continue;
404 } else {
405 xfs_fs_cmn_err(CE_WARN, mp,
406 "write error %d updating secondary superblock for ag %d",
407 error, agno);
408 break; /* no point in continuing */
409 }
410 }
411 return 0;
412
413 error0:
414 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
415 return error;
416}
417
418static int
419xfs_growfs_log_private(
420 xfs_mount_t *mp, /* mount point for filesystem */
421 xfs_growfs_log_t *in) /* growfs log input struct */
422{
423 xfs_extlen_t nb;
424
425 nb = in->newblocks;
426 if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
427 return XFS_ERROR(EINVAL);
428 if (nb == mp->m_sb.sb_logblocks &&
429 in->isint == (mp->m_sb.sb_logstart != 0))
430 return XFS_ERROR(EINVAL);
431 /*
432	 * Moving the log is hard: it needs new interfaces to sync the
433	 * log first and to hold off all activity while moving it.
434	 * It could yield a shorter or longer log in the same space,
435	 * or transform an internal log into an external one, or vice versa.
436 */
437 return XFS_ERROR(ENOSYS);
438}
439
440/*
441 * Protected versions of the growfs functions: they acquire and release locks
442 * on the mount point and are exported through the ioctls XFS_IOC_FSGROWFSDATA,
443 * XFS_IOC_FSGROWFSLOG and XFS_IOC_FSGROWFSRT.
444 */
445
446
447int
448xfs_growfs_data(
449 xfs_mount_t *mp,
450 xfs_growfs_data_t *in)
451{
452 int error;
453 if (!cpsema(&mp->m_growlock))
454 return XFS_ERROR(EWOULDBLOCK);
455 error = xfs_growfs_data_private(mp, in);
456 vsema(&mp->m_growlock);
457 return error;
458}
459
460int
461xfs_growfs_log(
462 xfs_mount_t *mp,
463 xfs_growfs_log_t *in)
464{
465 int error;
466 if (!cpsema(&mp->m_growlock))
467 return XFS_ERROR(EWOULDBLOCK);
468 error = xfs_growfs_log_private(mp, in);
469 vsema(&mp->m_growlock);
470 return error;
471}
472
473/*
474 * exported through ioctl XFS_IOC_FSCOUNTS
475 */
476
477int
478xfs_fs_counts(
479 xfs_mount_t *mp,
480 xfs_fsop_counts_t *cnt)
481{
482 unsigned long s;
483
484 s = XFS_SB_LOCK(mp);
485 cnt->freedata = mp->m_sb.sb_fdblocks;
486 cnt->freertx = mp->m_sb.sb_frextents;
487 cnt->freeino = mp->m_sb.sb_ifree;
488 cnt->allocino = mp->m_sb.sb_icount;
489 XFS_SB_UNLOCK(mp, s);
490 return 0;
491}
492
493/*
494 * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
495 *
496 * xfs_reserve_blocks is called to set m_resblks
497 * in the in-core mount table. The number of unused reserved blocks
498 * is kept in m_resblks_avail.
499 *
500 * Reserve the requested number of blocks if available. Otherwise return
501 * as many as possible to satisfy the request. The actual number
502 * reserved is returned in outval.
503 *
504 * A null inval pointer indicates that only the current number of reserved
505 * blocks available should be returned; no settings are changed.
506 */
507
508int
509xfs_reserve_blocks(
510 xfs_mount_t *mp,
511 __uint64_t *inval,
512 xfs_fsop_resblks_t *outval)
513{
514 __int64_t lcounter, delta;
515 __uint64_t request;
516 unsigned long s;
517
518 /* If inval is null, report current values and return */
519
520 if (inval == (__uint64_t *)NULL) {
521 outval->resblks = mp->m_resblks;
522 outval->resblks_avail = mp->m_resblks_avail;
523 return(0);
524 }
525
526 request = *inval;
527 s = XFS_SB_LOCK(mp);
528
529 /*
530 * If our previous reservation was larger than the current value,
531 * then move any unused blocks back to the free pool.
532 */
533
534 if (mp->m_resblks > request) {
535 lcounter = mp->m_resblks_avail - request;
536 if (lcounter > 0) { /* release unused blocks */
537 mp->m_sb.sb_fdblocks += lcounter;
538 mp->m_resblks_avail -= lcounter;
539 }
540 mp->m_resblks = request;
541 } else {
542 delta = request - mp->m_resblks;
543 lcounter = mp->m_sb.sb_fdblocks - delta;
544 if (lcounter < 0) {
545 /* We can't satisfy the request, just get what we can */
546 mp->m_resblks += mp->m_sb.sb_fdblocks;
547 mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
548 mp->m_sb.sb_fdblocks = 0;
549 } else {
550 mp->m_sb.sb_fdblocks = lcounter;
551 mp->m_resblks = request;
552 mp->m_resblks_avail += delta;
553 }
554 }
555
556 outval->resblks = mp->m_resblks;
557 outval->resblks_avail = mp->m_resblks_avail;
558 XFS_SB_UNLOCK(mp, s);
559 return(0);
560}
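/*
 * Illustrative caller (a sketch): passing a NULL inval reads the
 * current reservation (the XFS_IOC_GET_RESBLKS direction), while a
 * non-NULL inval requests a new one (XFS_IOC_SET_RESBLKS):
 *
 *	__uint64_t in = 1024;
 *	xfs_fsop_resblks_t out;
 *	error = xfs_reserve_blocks(mp, &in, &out);
 *
 * On return, out.resblks is the new reservation ceiling and
 * out.resblks_avail may fall short of it if free space ran out.
 */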
561
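/*
 * Editorial summary of the function below: it logs the root inode core
 * in a dummy transaction and commits it synchronously, which gives an
 * otherwise-idle filesystem something in the on-disk log.
 */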
562void
563xfs_fs_log_dummy(xfs_mount_t *mp)
564{
565 xfs_trans_t *tp;
566 xfs_inode_t *ip;
567
568
569 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
570 atomic_inc(&mp->m_active_trans);
571 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
572 xfs_trans_cancel(tp, 0);
573 return;
574 }
575
576 ip = mp->m_rootip;
577 xfs_ilock(ip, XFS_ILOCK_EXCL);
578
579 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
580 xfs_trans_ihold(tp, ip);
581 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
582 xfs_trans_set_sync(tp);
583 xfs_trans_commit(tp, 0, NULL);
584
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586}
587
588int
589xfs_fs_goingdown(
590 xfs_mount_t *mp,
591 __uint32_t inflags)
592{
593 switch (inflags) {
594 case XFS_FSOP_GOING_FLAGS_DEFAULT: {
595 struct vfs *vfsp = XFS_MTOVFS(mp);
596 struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev);
597
598 if (sb) {
599 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
600 thaw_bdev(sb->s_bdev, sb);
601 }
602
603 break;
604 }
605 case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
606 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT);
607 break;
608 case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
609 xfs_force_shutdown(mp, XFS_FORCE_UMOUNT|XFS_LOG_IO_ERROR);
610 break;
611 default:
612 return XFS_ERROR(EINVAL);
613 }
614
615 return 0;
616}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
new file mode 100644
index 000000000000..b61486173a61
--- /dev/null
+++ b/fs/xfs/xfs_fsops.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_FSOPS_H__
33#define __XFS_FSOPS_H__
34
35int
36xfs_fs_geometry(
37 xfs_mount_t *mp,
38 xfs_fsop_geom_t *geo,
39 int new_version);
40
41int
42xfs_growfs_data(
43 xfs_mount_t *mp,
44 xfs_growfs_data_t *in);
45
46int
47xfs_growfs_log(
48 xfs_mount_t *mp,
49 xfs_growfs_log_t *in);
50
51int
52xfs_fs_counts(
53 xfs_mount_t *mp,
54 xfs_fsop_counts_t *cnt);
55
56int
57xfs_reserve_blocks(
58 xfs_mount_t *mp,
59 __uint64_t *inval,
60 xfs_fsop_resblks_t *outval);
61
62int
63xfs_fs_goingdown(
64 xfs_mount_t *mp,
65 __uint32_t inflags);
66
67#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
new file mode 100644
index 000000000000..ce5fee9eaec5
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.c
@@ -0,0 +1,1401 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_alloc.h"
57#include "xfs_bit.h"
58#include "xfs_rtalloc.h"
59#include "xfs_error.h"
60#include "xfs_bmap.h"
61
62/*
63 * Log specified fields for the inode given by bp and off.
64 */
65STATIC void
66xfs_ialloc_log_di(
67 xfs_trans_t *tp, /* transaction pointer */
68 xfs_buf_t *bp, /* inode buffer */
69 int off, /* index of inode in buffer */
70 int fields) /* bitmask of fields to log */
71{
72 int first; /* first byte number */
73 int ioffset; /* off in bytes */
74 int last; /* last byte number */
75 xfs_mount_t *mp; /* mount point structure */
76 static const short offsets[] = { /* field offsets */
77 /* keep in sync with bits */
78 offsetof(xfs_dinode_core_t, di_magic),
79 offsetof(xfs_dinode_core_t, di_mode),
80 offsetof(xfs_dinode_core_t, di_version),
81 offsetof(xfs_dinode_core_t, di_format),
82 offsetof(xfs_dinode_core_t, di_onlink),
83 offsetof(xfs_dinode_core_t, di_uid),
84 offsetof(xfs_dinode_core_t, di_gid),
85 offsetof(xfs_dinode_core_t, di_nlink),
86 offsetof(xfs_dinode_core_t, di_projid),
87 offsetof(xfs_dinode_core_t, di_pad),
88 offsetof(xfs_dinode_core_t, di_atime),
89 offsetof(xfs_dinode_core_t, di_mtime),
90 offsetof(xfs_dinode_core_t, di_ctime),
91 offsetof(xfs_dinode_core_t, di_size),
92 offsetof(xfs_dinode_core_t, di_nblocks),
93 offsetof(xfs_dinode_core_t, di_extsize),
94 offsetof(xfs_dinode_core_t, di_nextents),
95 offsetof(xfs_dinode_core_t, di_anextents),
96 offsetof(xfs_dinode_core_t, di_forkoff),
97 offsetof(xfs_dinode_core_t, di_aformat),
98 offsetof(xfs_dinode_core_t, di_dmevmask),
99 offsetof(xfs_dinode_core_t, di_dmstate),
100 offsetof(xfs_dinode_core_t, di_flags),
101 offsetof(xfs_dinode_core_t, di_gen),
102 offsetof(xfs_dinode_t, di_next_unlinked),
103 offsetof(xfs_dinode_t, di_u),
104 offsetof(xfs_dinode_t, di_a),
105 sizeof(xfs_dinode_t)
106 };
107
108
109 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
110 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
111 mp = tp->t_mountp;
112 /*
113 * Get the inode-relative first and last bytes for these fields
114 */
115 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
116 /*
117 * Convert to buffer offsets and log it.
118 */
119 ioffset = off << mp->m_sb.sb_inodelog;
120 first += ioffset;
121 last += ioffset;
122 xfs_trans_log_buf(tp, bp, first, last);
123}
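/*
 * Worked example (illustrative, assuming the usual xfs_btree_offsets
 * semantics): fields == XFS_DI_MODE | XFS_DI_UID yields
 * first == offsetof(di_mode) and last == offsetof(di_gid) - 1, i.e.
 * one contiguous byte range spanning the lowest through highest
 * requested fields; off << sb_inodelog then rebases that range from
 * inode-relative to buffer-relative offsets before logging.
 */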
124
125/*
126 * Allocation group level functions.
127 */
128
129/*
130 * Allocate new inodes in the allocation group specified by agbp.
131 * Return 0 for success, else error code.
132 */
133STATIC int /* error code or 0 */
134xfs_ialloc_ag_alloc(
135 xfs_trans_t *tp, /* transaction pointer */
136 xfs_buf_t *agbp, /* alloc group buffer */
137 int *alloc)
138{
139 xfs_agi_t *agi; /* allocation group header */
140 xfs_alloc_arg_t args; /* allocation argument structure */
141 int blks_per_cluster; /* fs blocks per inode cluster */
142 xfs_btree_cur_t *cur; /* inode btree cursor */
143 xfs_daddr_t d; /* disk addr of buffer */
144 int error;
145 xfs_buf_t *fbuf; /* new free inodes' buffer */
146 xfs_dinode_t *free; /* new free inode structure */
147 int i; /* inode counter */
148 int j; /* block counter */
149 int nbufs; /* num bufs of new inodes */
150 xfs_agino_t newino; /* new first inode's number */
151 xfs_agino_t newlen; /* new number of inodes */
152 int ninodes; /* num inodes per buf */
153 xfs_agino_t thisino; /* current inode number, for loop */
154 int version; /* inode version number to use */
155 int isaligned; /* inode allocation at stripe unit */
156 /* boundary */
157 xfs_dinode_core_t dic; /* a dinode_core to copy to new */
158 /* inodes */
159
160 args.tp = tp;
161 args.mp = tp->t_mountp;
162
163 /*
164 * Locking will ensure that we don't have two callers in here
165 * at one time.
166 */
167 newlen = XFS_IALLOC_INODES(args.mp);
168 if (args.mp->m_maxicount &&
169 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
170 return XFS_ERROR(ENOSPC);
171 args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
172 /*
173 * Set the alignment for the allocation.
174 * If stripe alignment is turned on then align at stripe unit
175 * boundary.
176 * If the cluster size is smaller than a filesystem block
177 * then we're doing I/O for inodes in filesystem block size pieces,
178	 * so we don't need alignment anyway.
179 */
180 isaligned = 0;
181 if (args.mp->m_sinoalign) {
182 ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
183 args.alignment = args.mp->m_dalign;
184 isaligned = 1;
185 } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
186 args.mp->m_sb.sb_inoalignmt >=
187 XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
188 args.alignment = args.mp->m_sb.sb_inoalignmt;
189 else
190 args.alignment = 1;
191 agi = XFS_BUF_TO_AGI(agbp);
192 /*
193 * Need to figure out where to allocate the inode blocks.
194 * Ideally they should be spaced out through the a.g.
195 * For now, just allocate blocks up front.
196 */
197 args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
198 args.fsbno = XFS_AGB_TO_FSB(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
199 args.agbno);
200 /*
201 * Allocate a fixed-size extent of inodes.
202 */
203 args.type = XFS_ALLOCTYPE_NEAR_BNO;
204 args.mod = args.total = args.wasdel = args.isfl = args.userdata =
205 args.minalignslop = 0;
206 args.prod = 1;
207 /*
208 * Allow space for the inode btree to split.
209 */
210 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
211 if ((error = xfs_alloc_vextent(&args)))
212 return error;
213
214 /*
215 * If stripe alignment is turned on, then try again with cluster
216 * alignment.
217 */
218 if (isaligned && args.fsbno == NULLFSBLOCK) {
219 args.type = XFS_ALLOCTYPE_NEAR_BNO;
220 args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
221 args.fsbno = XFS_AGB_TO_FSB(args.mp,
222 INT_GET(agi->agi_seqno, ARCH_CONVERT), args.agbno);
223 if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
224 args.mp->m_sb.sb_inoalignmt >=
225 XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
226 args.alignment = args.mp->m_sb.sb_inoalignmt;
227 else
228 args.alignment = 1;
229 if ((error = xfs_alloc_vextent(&args)))
230 return error;
231 }
232
233 if (args.fsbno == NULLFSBLOCK) {
234 *alloc = 0;
235 return 0;
236 }
237 ASSERT(args.len == args.minlen);
238 /*
239 * Convert the results.
240 */
241 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
242 /*
243 * Loop over the new block(s), filling in the inodes.
244 * For small block sizes, manipulate the inodes in buffers
245 * which are multiples of the blocks size.
246 */
247 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
248 blks_per_cluster = 1;
249 nbufs = (int)args.len;
250 ninodes = args.mp->m_sb.sb_inopblock;
251 } else {
252 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
253 args.mp->m_sb.sb_blocksize;
254 nbufs = (int)args.len / blks_per_cluster;
255 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
256 }
257 /*
258 * Figure out what version number to use in the inodes we create.
259 * If the superblock version has caught up to the one that supports
260 * the new inode format, then use the new inode version. Otherwise
261 * use the old version so that old kernels will continue to be
262 * able to use the file system.
263 */
264 if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
265 version = XFS_DINODE_VERSION_2;
266 else
267 version = XFS_DINODE_VERSION_1;
268
269 memset(&dic, 0, sizeof(xfs_dinode_core_t));
270 INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
271 INT_SET(dic.di_version, ARCH_CONVERT, version);
272
273 for (j = 0; j < nbufs; j++) {
274 /*
275 * Get the block.
276 */
277 d = XFS_AGB_TO_DADDR(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
278 args.agbno + (j * blks_per_cluster));
279 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
280 args.mp->m_bsize * blks_per_cluster,
281 XFS_BUF_LOCK);
282 ASSERT(fbuf);
283 ASSERT(!XFS_BUF_GETERROR(fbuf));
284 /*
285 * Loop over the inodes in this buffer.
286 */
287
288 for (i = 0; i < ninodes; i++) {
289 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
290 memcpy(&(free->di_core), &dic, sizeof(xfs_dinode_core_t));
291 INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
292 xfs_ialloc_log_di(tp, fbuf, i,
293 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
294 }
295 xfs_trans_inode_alloc_buf(tp, fbuf);
296 }
297 INT_MOD(agi->agi_count, ARCH_CONVERT, newlen);
298 INT_MOD(agi->agi_freecount, ARCH_CONVERT, newlen);
299 down_read(&args.mp->m_peraglock);
300 args.mp->m_perag[INT_GET(agi->agi_seqno, ARCH_CONVERT)].pagi_freecount += newlen;
301 up_read(&args.mp->m_peraglock);
302 INT_SET(agi->agi_newino, ARCH_CONVERT, newino);
303 /*
304 * Insert records describing the new inode chunk into the btree.
305 */
306 cur = xfs_btree_init_cursor(args.mp, tp, agbp,
307 INT_GET(agi->agi_seqno, ARCH_CONVERT),
308 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
309 for (thisino = newino;
310 thisino < newino + newlen;
311 thisino += XFS_INODES_PER_CHUNK) {
312 if ((error = xfs_inobt_lookup_eq(cur, thisino,
313 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
314 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
315 return error;
316 }
317 ASSERT(i == 0);
318 if ((error = xfs_inobt_insert(cur, &i))) {
319 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
320 return error;
321 }
322 ASSERT(i == 1);
323 }
324 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
325 /*
326 * Log allocation group header fields
327 */
328 xfs_ialloc_log_agi(tp, agbp,
329 XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
330 /*
331 * Modify/log superblock values for inode count and inode free count.
332 */
333 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
334 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
335 *alloc = 1;
336 return 0;
337}
338
339STATIC __inline xfs_agnumber_t
340xfs_ialloc_next_ag(
341 xfs_mount_t *mp)
342{
343 xfs_agnumber_t agno;
344
345 spin_lock(&mp->m_agirotor_lock);
346 agno = mp->m_agirotor;
347 if (++mp->m_agirotor == mp->m_maxagi)
348 mp->m_agirotor = 0;
349 spin_unlock(&mp->m_agirotor_lock);
350
351 return agno;
352}
353
354/*
355 * Select an allocation group to look for a free inode in, based on the parent
356 * inode and then the mode. Return the allocation group buffer.
357 */
358STATIC xfs_buf_t * /* allocation group buffer */
359xfs_ialloc_ag_select(
360 xfs_trans_t *tp, /* transaction pointer */
361 xfs_ino_t parent, /* parent directory inode number */
362 mode_t mode, /* bits set to indicate file type */
363 int okalloc) /* ok to allocate more space */
364{
365 xfs_buf_t *agbp; /* allocation group header buffer */
366 xfs_agnumber_t agcount; /* number of ag's in the filesystem */
367 xfs_agnumber_t agno; /* current ag number */
368 int flags; /* alloc buffer locking flags */
369 xfs_extlen_t ineed; /* blocks needed for inode allocation */
370 xfs_extlen_t longest = 0; /* longest extent available */
371 xfs_mount_t *mp; /* mount point structure */
372 int needspace; /* file mode implies space allocated */
373 xfs_perag_t *pag; /* per allocation group data */
374 xfs_agnumber_t pagno; /* parent (starting) ag number */
375
376 /*
377 * Files of these types need at least one block if length > 0
378 * (and they won't fit in the inode, but that's hard to figure out).
379 */
380 needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
381 mp = tp->t_mountp;
382 agcount = mp->m_maxagi;
383 if (S_ISDIR(mode))
384 pagno = xfs_ialloc_next_ag(mp);
385 else {
386 pagno = XFS_INO_TO_AGNO(mp, parent);
387 if (pagno >= agcount)
388 pagno = 0;
389 }
390 ASSERT(pagno < agcount);
391 /*
392 * Loop through allocation groups, looking for one with a little
393 * free space in it. Note we don't look for free inodes, exactly.
394 * Instead, we account for whether inodes will need to be allocated,
395 * which requires that blocks be available to allocate them from
396 * if none are currently free.
397 */
398 agno = pagno;
399 flags = XFS_ALLOC_FLAG_TRYLOCK;
400 down_read(&mp->m_peraglock);
401 for (;;) {
402 pag = &mp->m_perag[agno];
403 if (!pag->pagi_init) {
404 if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
405 agbp = NULL;
406 goto nextag;
407 }
408 } else
409 agbp = NULL;
410
411 if (!pag->pagi_inodeok) {
412 xfs_ialloc_next_ag(mp);
413 goto unlock_nextag;
414 }
415
416 /*
417 * Is there enough free space for the file plus a block
418 * of inodes (if we need to allocate some)?
419 */
420 ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
421 if (ineed && !pag->pagf_init) {
422 if (agbp == NULL &&
423 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
424 agbp = NULL;
425 goto nextag;
426 }
427 (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
428 }
429 if (!ineed || pag->pagf_init) {
430 if (ineed && !(longest = pag->pagf_longest))
431 longest = pag->pagf_flcount > 0;
432 if (!ineed ||
433 (pag->pagf_freeblks >= needspace + ineed &&
434 longest >= ineed &&
435 okalloc)) {
436 if (agbp == NULL &&
437 xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
438 agbp = NULL;
439 goto nextag;
440 }
441 up_read(&mp->m_peraglock);
442 return agbp;
443 }
444 }
445unlock_nextag:
446 if (agbp)
447 xfs_trans_brelse(tp, agbp);
448nextag:
449 /*
450 * No point in iterating over the rest, if we're shutting
451 * down.
452 */
453 if (XFS_FORCED_SHUTDOWN(mp)) {
454 up_read(&mp->m_peraglock);
455 return (xfs_buf_t *)0;
456 }
457 agno++;
458 if (agno >= agcount)
459 agno = 0;
460 if (agno == pagno) {
461 if (flags == 0) {
462 up_read(&mp->m_peraglock);
463 return (xfs_buf_t *)0;
464 }
465 flags = 0;
466 }
467 }
468}
469
470/*
471 * Visible inode allocation functions.
472 */
473
474/*
475 * Allocate an inode on disk.
476 * Mode is used to tell whether the new inode will need space, and whether
477 * it is a directory.
478 *
479 * The arguments IO_agbp and alloc_done are defined to work within
480 * the constraint of one allocation per transaction.
481 * xfs_dialloc() is designed to be called twice if it has to do an
482 * allocation to make more free inodes. On the first call,
483 * IO_agbp should be set to NULL. If an inode is available,
484 * i.e., xfs_dialloc() did not need to do an allocation, an inode
485 * number is returned. In this case, IO_agbp would be set to the
486 * current ag_buf and alloc_done set to false.
487 * If an allocation needed to be done, xfs_dialloc would return
488 * the current ag_buf in IO_agbp and set alloc_done to true.
489 * The caller should then commit the current transaction, allocate a new
490 * transaction, and call xfs_dialloc() again, passing in the previous
491 * value of IO_agbp. IO_agbp should be held across the transactions.
492 * Since the agbp is locked across the two calls, the second call is
493 * guaranteed to have a free inode available.
494 *
495 * Once we successfully pick an inode its number is returned and the
496 * on-disk data structures are updated. The inode itself is not read
497 * in, since doing so would break ordering constraints with xfs_reclaim.
498 */
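/*
 * Illustrative two-call pattern (a hypothetical caller with error
 * handling elided), following the protocol described above:
 *
 *	agbp = NULL;
 *	error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
 *			    &alloc_done, &ino);
 *	if (!error && ino == NULLFSINO && agbp != NULL) {
 *		... commit tp, allocate and reserve a fresh tp ...
 *		error = xfs_dialloc(tp, parent, mode, okalloc,
 *				    &agbp, &alloc_done, &ino);
 *	}
 */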
499int
500xfs_dialloc(
501 xfs_trans_t *tp, /* transaction pointer */
502 xfs_ino_t parent, /* parent inode (directory) */
503 mode_t mode, /* mode bits for new inode */
504 int okalloc, /* ok to allocate more space */
505 xfs_buf_t **IO_agbp, /* in/out ag header's buffer */
506 boolean_t *alloc_done, /* true if we needed to replenish
507 inode freelist */
508 xfs_ino_t *inop) /* inode number allocated */
509{
510 xfs_agnumber_t agcount; /* number of allocation groups */
511 xfs_buf_t *agbp; /* allocation group header's buffer */
512 xfs_agnumber_t agno; /* allocation group number */
513 xfs_agi_t *agi; /* allocation group header structure */
514 xfs_btree_cur_t *cur; /* inode allocation btree cursor */
515 int error; /* error return value */
516 int i; /* result code */
517 int ialloced; /* inode allocation status */
518 int noroom = 0; /* no space for inode blk allocation */
519 xfs_ino_t ino; /* fs-relative inode to be returned */
520 /* REFERENCED */
521 int j; /* result code */
522 xfs_mount_t *mp; /* file system mount structure */
523 int offset; /* index of inode in chunk */
524 xfs_agino_t pagino; /* parent's a.g. relative inode # */
525 xfs_agnumber_t pagno; /* parent's allocation group number */
526 xfs_inobt_rec_t rec; /* inode allocation record */
527 xfs_agnumber_t tagno; /* testing allocation group number */
528 xfs_btree_cur_t *tcur; /* temp cursor */
529 xfs_inobt_rec_t trec; /* temp inode allocation record */
530
531
532 if (*IO_agbp == NULL) {
533 /*
534 * We do not have an agbp, so select an initial allocation
535 * group for inode allocation.
536 */
537 agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
538 /*
539 * Couldn't find an allocation group satisfying the
540 * criteria, give up.
541 */
542 if (!agbp) {
543 *inop = NULLFSINO;
544 return 0;
545 }
546 agi = XFS_BUF_TO_AGI(agbp);
547 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
548 } else {
549 /*
550 * Continue where we left off before. In this case, we
551 * know that the allocation group has free inodes.
552 */
553 agbp = *IO_agbp;
554 agi = XFS_BUF_TO_AGI(agbp);
555 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
556 ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
557 }
558 mp = tp->t_mountp;
559 agcount = mp->m_sb.sb_agcount;
560 agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
561 tagno = agno;
562 pagno = XFS_INO_TO_AGNO(mp, parent);
563 pagino = XFS_INO_TO_AGINO(mp, parent);
564
565 /*
566 * If we have already hit the ceiling of inode blocks then clear
567 * okalloc so we scan all available agi structures for a free
568 * inode.
569 */
570
571 if (mp->m_maxicount &&
572 mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
573 noroom = 1;
574 okalloc = 0;
575 }
576
577 /*
578 * Loop until we find an allocation group that either has free inodes
579 * or in which we can allocate some inodes. Iterate through the
580 * allocation groups upward, wrapping at the end.
581 */
582 *alloc_done = B_FALSE;
583 while (!agi->agi_freecount) {
584 /*
585 * Don't do anything if we're not supposed to allocate
586 * any blocks, just go on to the next ag.
587 */
588 if (okalloc) {
589 /*
590 * Try to allocate some new inodes in the allocation
591 * group.
592 */
593 if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
594 xfs_trans_brelse(tp, agbp);
595 if (error == ENOSPC) {
596 *inop = NULLFSINO;
597 return 0;
598 } else
599 return error;
600 }
601 if (ialloced) {
602 /*
603 * We successfully allocated some inodes, return
604 * the current context to the caller so that it
605 * can commit the current transaction and call
606 * us again where we left off.
607 */
608 ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
609 *alloc_done = B_TRUE;
610 *IO_agbp = agbp;
611 *inop = NULLFSINO;
612 return 0;
613 }
614 }
615 /*
616 * If it failed, give up on this ag.
617 */
618 xfs_trans_brelse(tp, agbp);
619 /*
620 * Go on to the next ag: get its ag header.
621 */
622nextag:
623 if (++tagno == agcount)
624 tagno = 0;
625 if (tagno == agno) {
626 *inop = NULLFSINO;
627 return noroom ? ENOSPC : 0;
628 }
629 down_read(&mp->m_peraglock);
630 if (mp->m_perag[tagno].pagi_inodeok == 0) {
631 up_read(&mp->m_peraglock);
632 goto nextag;
633 }
634 error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
635 up_read(&mp->m_peraglock);
636 if (error)
637 goto nextag;
638 agi = XFS_BUF_TO_AGI(agbp);
639 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
640 }
641 /*
642 * Here with an allocation group that has a free inode.
643 * Reset agno since we may have chosen a new ag in the
644 * loop above.
645 */
646 agno = tagno;
647 *IO_agbp = NULL;
648 cur = xfs_btree_init_cursor(mp, tp, agbp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
649 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
650 /*
651 * If pagino is 0 (this is the root inode allocation) use newino.
652 * This must work because we've just allocated some.
653 */
654 if (!pagino)
655 pagino = INT_GET(agi->agi_newino, ARCH_CONVERT);
656#ifdef DEBUG
657 if (cur->bc_nlevels == 1) {
658 int freecount = 0;
659
660 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
661 goto error0;
662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
663 do {
664 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
665 &rec.ir_freecount, &rec.ir_free, &i)))
666 goto error0;
667 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
668 freecount += rec.ir_freecount;
669 if ((error = xfs_inobt_increment(cur, 0, &i)))
670 goto error0;
671 } while (i == 1);
672
673 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
674 XFS_FORCED_SHUTDOWN(mp));
675 }
676#endif
677 /*
678 * If in the same a.g. as the parent, try to get near the parent.
679 */
680 if (pagno == agno) {
681 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
682 goto error0;
683 if (i != 0 &&
684 (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
685 &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
686 j == 1 &&
687 rec.ir_freecount > 0) {
688 /*
689 * Found a free inode in the same chunk
690 * as parent, done.
691 */
692 }
693 /*
694 * In the same a.g. as parent, but parent's chunk is full.
695 */
696 else {
697 int doneleft; /* done, to the left */
698 int doneright; /* done, to the right */
699
700 if (error)
701 goto error0;
702 ASSERT(i == 1);
703 ASSERT(j == 1);
704 /*
705 * Duplicate the cursor, search left & right
706 * simultaneously.
707 */
708 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
709 goto error0;
710 /*
711 * Search left with tcur, back up 1 record.
712 */
713 if ((error = xfs_inobt_decrement(tcur, 0, &i)))
714 goto error1;
715 doneleft = !i;
716 if (!doneleft) {
717 if ((error = xfs_inobt_get_rec(tcur,
718 &trec.ir_startino,
719 &trec.ir_freecount,
720 &trec.ir_free, &i)))
721 goto error1;
722 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
723 }
724 /*
725 * Search right with cur, go forward 1 record.
726 */
727 if ((error = xfs_inobt_increment(cur, 0, &i)))
728 goto error1;
729 doneright = !i;
730 if (!doneright) {
731 if ((error = xfs_inobt_get_rec(cur,
732 &rec.ir_startino,
733 &rec.ir_freecount,
734 &rec.ir_free, &i)))
735 goto error1;
736 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
737 }
738 /*
739 * Loop until we find the closest inode chunk
740 * with a free one.
741 */
742 while (!doneleft || !doneright) {
743 int useleft; /* using left inode
744 chunk this time */
745
746 /*
747 * Figure out which block is closer,
748 * if both are valid.
749 */
750 if (!doneleft && !doneright)
751 useleft =
752 pagino -
753 (trec.ir_startino +
754 XFS_INODES_PER_CHUNK - 1) <
755 rec.ir_startino - pagino;
756 else
757 useleft = !doneleft;
758 /*
759 * If checking the left, does it have
760 * free inodes?
761 */
762 if (useleft && trec.ir_freecount) {
763 /*
764 * Yes, set it up as the chunk to use.
765 */
766 rec = trec;
767 xfs_btree_del_cursor(cur,
768 XFS_BTREE_NOERROR);
769 cur = tcur;
770 break;
771 }
772 /*
773 * If checking the right, does it have
774 * free inodes?
775 */
776 if (!useleft && rec.ir_freecount) {
777 /*
778 * Yes, it's already set up.
779 */
780 xfs_btree_del_cursor(tcur,
781 XFS_BTREE_NOERROR);
782 break;
783 }
784 /*
785 * If used the left, get another one
786 * further left.
787 */
788 if (useleft) {
789 if ((error = xfs_inobt_decrement(tcur, 0,
790 &i)))
791 goto error1;
792 doneleft = !i;
793 if (!doneleft) {
794 if ((error = xfs_inobt_get_rec(
795 tcur,
796 &trec.ir_startino,
797 &trec.ir_freecount,
798 &trec.ir_free, &i)))
799 goto error1;
800 XFS_WANT_CORRUPTED_GOTO(i == 1,
801 error1);
802 }
803 }
804 /*
805 * If used the right, get another one
806 * further right.
807 */
808 else {
809 if ((error = xfs_inobt_increment(cur, 0,
810 &i)))
811 goto error1;
812 doneright = !i;
813 if (!doneright) {
814 if ((error = xfs_inobt_get_rec(
815 cur,
816 &rec.ir_startino,
817 &rec.ir_freecount,
818 &rec.ir_free, &i)))
819 goto error1;
820 XFS_WANT_CORRUPTED_GOTO(i == 1,
821 error1);
822 }
823 }
824 }
825 ASSERT(!doneleft || !doneright);
826 }
827 }
828 /*
829 * In a different a.g. from the parent.
830 * See if the most recently allocated block has any free.
831 */
832 else if (INT_GET(agi->agi_newino, ARCH_CONVERT) != NULLAGINO) {
833 if ((error = xfs_inobt_lookup_eq(cur,
834 INT_GET(agi->agi_newino, ARCH_CONVERT), 0, 0, &i)))
835 goto error0;
836 if (i == 1 &&
837 (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
838 &rec.ir_freecount, &rec.ir_free, &j)) == 0 &&
839 j == 1 &&
840 rec.ir_freecount > 0) {
841 /*
842 * The last chunk allocated in the group still has
843 * a free inode.
844 */
845 }
846 /*
847 * None left in the last group, search the whole a.g.
848 */
849 else {
850 if (error)
851 goto error0;
852 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
853 goto error0;
854 ASSERT(i == 1);
855 for (;;) {
856 if ((error = xfs_inobt_get_rec(cur,
857 &rec.ir_startino,
858 &rec.ir_freecount, &rec.ir_free,
859 &i)))
860 goto error0;
861 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
862 if (rec.ir_freecount > 0)
863 break;
864 if ((error = xfs_inobt_increment(cur, 0, &i)))
865 goto error0;
866 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
867 }
868 }
869 }
870 offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
871 ASSERT(offset >= 0);
872 ASSERT(offset < XFS_INODES_PER_CHUNK);
873 ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
874 XFS_INODES_PER_CHUNK) == 0);
875 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
876 XFS_INOBT_CLR_FREE(&rec, offset);
877 rec.ir_freecount--;
878 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
879 rec.ir_free)))
880 goto error0;
881 INT_MOD(agi->agi_freecount, ARCH_CONVERT, -1);
882 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
883 down_read(&mp->m_peraglock);
884 mp->m_perag[tagno].pagi_freecount--;
885 up_read(&mp->m_peraglock);
886#ifdef DEBUG
887 if (cur->bc_nlevels == 1) {
888 int freecount = 0;
889
890 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
891 goto error0;
892 do {
893 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
894 &rec.ir_freecount, &rec.ir_free, &i)))
895 goto error0;
896 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
897 freecount += rec.ir_freecount;
898 if ((error = xfs_inobt_increment(cur, 0, &i)))
899 goto error0;
900 } while (i == 1);
901 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
902 XFS_FORCED_SHUTDOWN(mp));
903 }
904#endif
905 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
906 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
907 *inop = ino;
908 return 0;
909error1:
910 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
911error0:
912 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
913 return error;
914}
915
916/*
917 * Free disk inode. Carefully avoids touching the incore inode; all
918 * manipulations incore are the caller's responsibility.
919 * The on-disk inode is not changed by this operation, only the
920 * btree (free inode mask) is changed.
921 */
922int
923xfs_difree(
924 xfs_trans_t *tp, /* transaction pointer */
925 xfs_ino_t inode, /* inode to be freed */
926 xfs_bmap_free_t *flist, /* extents to free */
927 int *delete, /* set if inode cluster was deleted */
928 xfs_ino_t *first_ino) /* first inode in deleted cluster */
929{
930 /* REFERENCED */
931 xfs_agblock_t agbno; /* block number containing inode */
932 xfs_buf_t *agbp; /* buffer containing allocation group header */
933 xfs_agino_t agino; /* inode number relative to allocation group */
934 xfs_agnumber_t agno; /* allocation group number */
935 xfs_agi_t *agi; /* allocation group header */
936 xfs_btree_cur_t *cur; /* inode btree cursor */
937 int error; /* error return value */
938 int i; /* result code */
939 int ilen; /* inodes in an inode cluster */
940 xfs_mount_t *mp; /* mount structure for filesystem */
941 int off; /* offset of inode in inode chunk */
942 xfs_inobt_rec_t rec; /* btree record */
943
944 mp = tp->t_mountp;
945
946 /*
947 * Break up inode number into its components.
948 */
949 agno = XFS_INO_TO_AGNO(mp, inode);
950 if (agno >= mp->m_sb.sb_agcount) {
951 cmn_err(CE_WARN,
952 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.",
953 agno, mp->m_sb.sb_agcount, mp->m_fsname);
954 ASSERT(0);
955 return XFS_ERROR(EINVAL);
956 }
957 agino = XFS_INO_TO_AGINO(mp, inode);
958 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
959 cmn_err(CE_WARN,
960 "xfs_difree: inode != XFS_AGINO_TO_INO() (%d != %d) on %s. Returning EINVAL.",
961 inode, XFS_AGINO_TO_INO(mp, agno, agino), mp->m_fsname);
962 ASSERT(0);
963 return XFS_ERROR(EINVAL);
964 }
965 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
966 if (agbno >= mp->m_sb.sb_agblocks) {
967 cmn_err(CE_WARN,
968 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.",
969 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
970 ASSERT(0);
971 return XFS_ERROR(EINVAL);
972 }
973 /*
974 * Get the allocation group header.
975 */
976 down_read(&mp->m_peraglock);
977 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
978 up_read(&mp->m_peraglock);
979 if (error) {
980 cmn_err(CE_WARN,
981 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.",
982 error, mp->m_fsname);
983 return error;
984 }
985 agi = XFS_BUF_TO_AGI(agbp);
986 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
987 ASSERT(agbno < INT_GET(agi->agi_length, ARCH_CONVERT));
988 /*
989 * Initialize the cursor.
990 */
991 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
992 (xfs_inode_t *)0, 0);
993#ifdef DEBUG
994 if (cur->bc_nlevels == 1) {
995 int freecount = 0;
996
997 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
998 goto error0;
999 do {
1000 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
1001 &rec.ir_freecount, &rec.ir_free, &i)))
1002 goto error0;
1003 if (i) {
1004 freecount += rec.ir_freecount;
1005 if ((error = xfs_inobt_increment(cur, 0, &i)))
1006 goto error0;
1007 }
1008 } while (i == 1);
1009 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
1010 XFS_FORCED_SHUTDOWN(mp));
1011 }
1012#endif
1013 /*
1014 * Look for the entry describing this inode.
1015 */
1016 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
1017 cmn_err(CE_WARN,
1018 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.",
1019 error, mp->m_fsname);
1020 goto error0;
1021 }
1022 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1023 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
1024 &rec.ir_free, &i))) {
1025 cmn_err(CE_WARN,
1026 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1027 error, mp->m_fsname);
1028 goto error0;
1029 }
1030 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1031 /*
1032 * Get the offset in the inode chunk.
1033 */
1034 off = agino - rec.ir_startino;
1035 ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1036 ASSERT(!XFS_INOBT_IS_FREE(&rec, off));
1037 /*
1038 * Mark the inode free & increment the count.
1039 */
1040 XFS_INOBT_SET_FREE(&rec, off);
1041 rec.ir_freecount++;
1042
1043 /*
1044	 * When an inode cluster is free, it becomes eligible for removal.
1045 */
1046 if ((mp->m_flags & XFS_MOUNT_IDELETE) &&
1047 (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
1048
1049 *delete = 1;
1050 *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1051
1052 /*
1053 * Remove the inode cluster from the AGI B+Tree, adjust the
1054 * AGI and Superblock inode counts, and mark the disk space
1055 * to be freed when the transaction is committed.
1056 */
1057 ilen = XFS_IALLOC_INODES(mp);
1058 INT_MOD(agi->agi_count, ARCH_CONVERT, -ilen);
1059 INT_MOD(agi->agi_freecount, ARCH_CONVERT, -(ilen - 1));
1060 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1061 down_read(&mp->m_peraglock);
1062 mp->m_perag[agno].pagi_freecount -= ilen - 1;
1063 up_read(&mp->m_peraglock);
1064 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1065 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1066
1067 if ((error = xfs_inobt_delete(cur, &i))) {
 1068		cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete() returned an error %d on %s. Returning error.",
1069 error, mp->m_fsname);
1070 goto error0;
1071 }
1072
1073 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
1074 agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
1075 XFS_IALLOC_BLOCKS(mp), flist, mp);
1076 } else {
1077 *delete = 0;
1078
1079 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
1080 cmn_err(CE_WARN,
1081 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
1082 error, mp->m_fsname);
1083 goto error0;
1084 }
1085 /*
1086 * Change the inode free counts and log the ag/sb changes.
1087 */
1088 INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1);
1089 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1090 down_read(&mp->m_peraglock);
1091 mp->m_perag[agno].pagi_freecount++;
1092 up_read(&mp->m_peraglock);
1093 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1094 }
1095
1096#ifdef DEBUG
1097 if (cur->bc_nlevels == 1) {
1098 int freecount = 0;
1099
1100 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1101 goto error0;
1102 do {
1103 if ((error = xfs_inobt_get_rec(cur,
1104 &rec.ir_startino,
1105 &rec.ir_freecount,
1106 &rec.ir_free, &i)))
1107 goto error0;
1108 if (i) {
1109 freecount += rec.ir_freecount;
1110 if ((error = xfs_inobt_increment(cur, 0, &i)))
1111 goto error0;
1112 }
1113 } while (i == 1);
1114 ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
1115 XFS_FORCED_SHUTDOWN(mp));
1116 }
1117#endif
1118 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1119 return 0;
1120
1121error0:
1122 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1123 return error;
1124}
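/*
 * Standalone illustration (not in the original source) of the free-mask
 * bookkeeping in xfs_difree() above: set the bit for the freed inode,
 * bump the free count, and test the whole-chunk-free condition that
 * makes a cluster eligible for removal.  All numbers here are made up.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long long ir_free = ~1ULL;	/* only inode 0 still in use */
	int ir_freecount = 63;
	int off = 0;				/* chunk offset being freed */
	int inodes_per_chunk = 64;		/* XFS_INODES_PER_CHUNK */

	ir_free |= 1ULL << off;			/* XFS_INOBT_SET_FREE */
	ir_freecount++;
	if (ir_freecount == inodes_per_chunk)
		printf("chunk empty: cluster may be removed\n");
	return 0;
}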
1125
1126/*
 1127 * Return the location of the inode in bno/len/off, for mapping it into a buffer.
1128 */
1129/*ARGSUSED*/
1130int
1131xfs_dilocate(
1132 xfs_mount_t *mp, /* file system mount structure */
1133 xfs_trans_t *tp, /* transaction pointer */
1134 xfs_ino_t ino, /* inode to locate */
1135 xfs_fsblock_t *bno, /* output: block containing inode */
1136 int *len, /* output: num blocks in inode cluster */
1137 int *off, /* output: index in block of inode */
1138 uint flags) /* flags concerning inode lookup */
1139{
1140 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1141 xfs_buf_t *agbp; /* agi buffer */
1142 xfs_agino_t agino; /* inode number within alloc group */
1143 xfs_agnumber_t agno; /* allocation group number */
1144 int blks_per_cluster; /* num blocks per inode cluster */
1145 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1146 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1147 __int32_t chunk_cnt; /* count of free inodes in chunk */
1148 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1149 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1150 xfs_btree_cur_t *cur; /* inode btree cursor */
1151 int error; /* error code */
1152 int i; /* temp state */
1153 int offset; /* index of inode in its buffer */
1154 int offset_agbno; /* blks from chunk start to inode */
1155
1156 ASSERT(ino != NULLFSINO);
1157 /*
1158 * Split up the inode number into its parts.
1159 */
1160 agno = XFS_INO_TO_AGNO(mp, ino);
1161 agino = XFS_INO_TO_AGINO(mp, ino);
1162 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1163 if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
1164 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1165#ifdef DEBUG
1166 if (agno >= mp->m_sb.sb_agcount) {
1167 xfs_fs_cmn_err(CE_ALERT, mp,
1168 "xfs_dilocate: agno (%d) >= "
1169 "mp->m_sb.sb_agcount (%d)",
1170 agno, mp->m_sb.sb_agcount);
1171 }
1172 if (agbno >= mp->m_sb.sb_agblocks) {
1173 xfs_fs_cmn_err(CE_ALERT, mp,
1174 "xfs_dilocate: agbno (0x%llx) >= "
1175 "mp->m_sb.sb_agblocks (0x%lx)",
1176 (unsigned long long) agbno,
1177 (unsigned long) mp->m_sb.sb_agblocks);
1178 }
1179 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1180 xfs_fs_cmn_err(CE_ALERT, mp,
1181 "xfs_dilocate: ino (0x%llx) != "
1182 "XFS_AGINO_TO_INO(mp, agno, agino) "
1183 "(0x%llx)",
1184 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1185 }
1186#endif /* DEBUG */
1187 return XFS_ERROR(EINVAL);
1188 }
1189 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
1190 !(flags & XFS_IMAP_LOOKUP)) {
1191 offset = XFS_INO_TO_OFFSET(mp, ino);
1192 ASSERT(offset < mp->m_sb.sb_inopblock);
1193 *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
1194 *off = offset;
1195 *len = 1;
1196 return 0;
1197 }
1198 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1199 if (*bno != NULLFSBLOCK) {
1200 offset = XFS_INO_TO_OFFSET(mp, ino);
1201 ASSERT(offset < mp->m_sb.sb_inopblock);
1202 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
1203 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1204 offset;
1205 *len = blks_per_cluster;
1206 return 0;
1207 }
1208 if (mp->m_inoalign_mask) {
1209 offset_agbno = agbno & mp->m_inoalign_mask;
1210 chunk_agbno = agbno - offset_agbno;
1211 } else {
1212 down_read(&mp->m_peraglock);
1213 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1214 up_read(&mp->m_peraglock);
1215 if (error) {
1216#ifdef DEBUG
1217 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1218 "xfs_ialloc_read_agi() returned "
1219 "error %d, agno %d",
1220 error, agno);
1221#endif /* DEBUG */
1222 return error;
1223 }
1224 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
1225 (xfs_inode_t *)0, 0);
1226 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
1227#ifdef DEBUG
1228 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1229 "xfs_inobt_lookup_le() failed");
1230#endif /* DEBUG */
1231 goto error0;
1232 }
1233 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1234 &chunk_free, &i))) {
1235#ifdef DEBUG
1236 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1237 "xfs_inobt_get_rec() failed");
1238#endif /* DEBUG */
1239 goto error0;
1240 }
1241 if (i == 0) {
1242#ifdef DEBUG
1243 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
 1244				"xfs_inobt_get_rec() found no record");
1245#endif /* DEBUG */
1246 error = XFS_ERROR(EINVAL);
1247 }
1248 xfs_trans_brelse(tp, agbp);
1249 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1250 if (error)
1251 return error;
1252 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1253 offset_agbno = agbno - chunk_agbno;
1254 }
1255 ASSERT(agbno >= chunk_agbno);
1256 cluster_agbno = chunk_agbno +
1257 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1258 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1259 XFS_INO_TO_OFFSET(mp, ino);
1260 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
1261 *off = offset;
1262 *len = blks_per_cluster;
1263 return 0;
1264error0:
1265 xfs_trans_brelse(tp, agbp);
1266 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1267 return error;
1268}
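/*
 * A minimal standalone sketch of the cluster-mapping arithmetic used by
 * xfs_dilocate() above, with hypothetical geometry (32 inodes per block,
 * 4 blocks per cluster).  The real code derives these values from the
 * superblock; the numbers here are illustrative only.
 */
#include <stdio.h>

int
main(void)
{
	int inopblock = 32;		/* inodes per fs block (sb_inopblock) */
	int blks_per_cluster = 4;	/* cluster size >> sb_blocklog */
	int agbno = 103;		/* block of the inode within the AG */
	int chunk_agbno = 100;		/* first block of the inode chunk */
	int ino_offset = 7;		/* XFS_INO_TO_OFFSET(): index in block */

	int offset_agbno = agbno - chunk_agbno;
	int cluster_agbno = chunk_agbno +
		((offset_agbno / blks_per_cluster) * blks_per_cluster);
	int off = (agbno - cluster_agbno) * inopblock + ino_offset;

	/* *bno would be XFS_AGB_TO_FSB(mp, agno, cluster_agbno) */
	printf("cluster at agbno %d, inode index %d, len %d\n",
		cluster_agbno, off, blks_per_cluster);
	return 0;
}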
1269
1270/*
1271 * Compute and fill in value of m_in_maxlevels.
1272 */
1273void
1274xfs_ialloc_compute_maxlevels(
1275 xfs_mount_t *mp) /* file system mount structure */
1276{
1277 int level;
1278 uint maxblocks;
1279 uint maxleafents;
1280 int minleafrecs;
1281 int minnoderecs;
1282
1283 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
1284 XFS_INODES_PER_CHUNK_LOG;
 1285	minleafrecs = mp->m_inobt_mnr[0];
 1286	minnoderecs = mp->m_inobt_mnr[1];
1287 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1288 for (level = 1; maxblocks > 1; level++)
1289 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1290 mp->m_in_maxlevels = level;
1291}
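/*
 * A standalone illustration of the loop in xfs_ialloc_compute_maxlevels()
 * above: divide the maximum number of leaf entries by the minimum records
 * per block until one block suffices.  The record counts are made-up
 * stand-ins for m_inobt_mnr[]; real values depend on the block size.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int maxleafents = 1u << 22;	/* hypothetical */
	int minleafrecs = 125;			/* hypothetical leaf minimum */
	int minnoderecs = 62;			/* hypothetical node minimum */
	unsigned int maxblocks;
	int level;

	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
	for (level = 1; maxblocks > 1; level++)
		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
	printf("max btree levels = %d\n", level);	/* 4 for these numbers */
	return 0;
}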
1292
1293/*
1294 * Log specified fields for the ag hdr (inode section)
1295 */
1296void
1297xfs_ialloc_log_agi(
1298 xfs_trans_t *tp, /* transaction pointer */
1299 xfs_buf_t *bp, /* allocation group header buffer */
1300 int fields) /* bitmask of fields to log */
1301{
1302 int first; /* first byte number */
1303 int last; /* last byte number */
1304 static const short offsets[] = { /* field starting offsets */
1305 /* keep in sync with bit definitions */
1306 offsetof(xfs_agi_t, agi_magicnum),
1307 offsetof(xfs_agi_t, agi_versionnum),
1308 offsetof(xfs_agi_t, agi_seqno),
1309 offsetof(xfs_agi_t, agi_length),
1310 offsetof(xfs_agi_t, agi_count),
1311 offsetof(xfs_agi_t, agi_root),
1312 offsetof(xfs_agi_t, agi_level),
1313 offsetof(xfs_agi_t, agi_freecount),
1314 offsetof(xfs_agi_t, agi_newino),
1315 offsetof(xfs_agi_t, agi_dirino),
1316 offsetof(xfs_agi_t, agi_unlinked),
1317 sizeof(xfs_agi_t)
1318 };
1319#ifdef DEBUG
1320 xfs_agi_t *agi; /* allocation group header */
1321
1322 agi = XFS_BUF_TO_AGI(bp);
1323 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
1324#endif
1325 /*
1326 * Compute byte offsets for the first and last fields.
1327 */
1328 xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
1329 /*
1330 * Log the allocation group inode header buffer.
1331 */
1332 xfs_trans_log_buf(tp, bp, first, last);
1333}
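/*
 * A self-contained sketch (not in the original source) of the first/last
 * computation that xfs_btree_offsets() performs for xfs_ialloc_log_agi()
 * above: given a bitmask of fields and a table of field offsets, find the
 * byte range covering the lowest through highest set bits.  The offsets
 * here are invented; the real table is the offsets[] array above.
 */
#include <stdio.h>

static void
offsets_to_range(int fields, const short *offsets, int nbits,
		 int *first, int *last)
{
	int i;

	/* lowest set bit gives the first logged byte */
	for (i = 0; i < nbits; i++)
		if (fields & (1 << i))
			break;
	*first = offsets[i];
	/* highest set bit: the range ends just before the next field */
	for (i = nbits - 1; i >= 0; i--)
		if (fields & (1 << i))
			break;
	*last = offsets[i + 1] - 1;
}

int
main(void)
{
	static const short offsets[] = { 0, 4, 8, 12, 16 };	/* toy table */
	int first, last;

	offsets_to_range(0x6, offsets, 4, &first, &last);	/* fields 1,2 */
	printf("log bytes %d..%d\n", first, last);		/* 4..11 */
	return 0;
}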
1334
1335/*
1336 * Read in the allocation group header (inode allocation section)
1337 */
1338int
1339xfs_ialloc_read_agi(
1340 xfs_mount_t *mp, /* file system mount structure */
1341 xfs_trans_t *tp, /* transaction pointer */
1342 xfs_agnumber_t agno, /* allocation group number */
1343 xfs_buf_t **bpp) /* allocation group hdr buf */
1344{
1345 xfs_agi_t *agi; /* allocation group header */
1346 int agi_ok; /* agi is consistent */
1347 xfs_buf_t *bp; /* allocation group hdr buf */
1348 xfs_perag_t *pag; /* per allocation group data */
1349 int error;
1350
1351 ASSERT(agno != NULLAGNUMBER);
1352 error = xfs_trans_read_buf(
1353 mp, tp, mp->m_ddev_targp,
1354 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1355 XFS_FSS_TO_BB(mp, 1), 0, &bp);
1356 if (error)
1357 return error;
1358 ASSERT(bp && !XFS_BUF_GETERROR(bp));
1359
1360 /*
1361 * Validate the magic number of the agi block.
1362 */
1363 agi = XFS_BUF_TO_AGI(bp);
1364 agi_ok =
1365 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1366 XFS_AGI_GOOD_VERSION(
1367 INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1368 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1369 XFS_RANDOM_IALLOC_READ_AGI))) {
1370 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
1371 mp, agi);
1372 xfs_trans_brelse(tp, bp);
1373 return XFS_ERROR(EFSCORRUPTED);
1374 }
1375 pag = &mp->m_perag[agno];
1376 if (!pag->pagi_init) {
1377 pag->pagi_freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
1378 pag->pagi_init = 1;
1379 } else {
1380 /*
1381 * It's possible for these to be out of sync if
1382 * we are in the middle of a forced shutdown.
1383 */
1384 ASSERT(pag->pagi_freecount ==
1385 INT_GET(agi->agi_freecount, ARCH_CONVERT)
1386 || XFS_FORCED_SHUTDOWN(mp));
1387 }
1388
1389#ifdef DEBUG
1390 {
1391 int i;
1392
1393 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1394 ASSERT(agi->agi_unlinked[i]);
1395 }
1396#endif
1397
1398 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1399 *bpp = bp;
1400 return 0;
1401}
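/*
 * Usage sketch (not in the original source): read the AGI for one
 * allocation group under the per-AG lock, sample its free-inode count,
 * and release the buffer.  The wrapper name is hypothetical and error
 * handling is abbreviated; this mirrors how xfs_difree() above drives
 * xfs_ialloc_read_agi().
 */
STATIC int
xfs_agi_freecount_example(
	xfs_mount_t	*mp,		/* file system mount structure */
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_agnumber_t	agno,		/* allocation group number */
	__int32_t	*freecount)	/* output: AGI free inode count */
{
	xfs_buf_t	*agbp;
	xfs_agi_t	*agi;
	int		error;

	down_read(&mp->m_peraglock);
	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
	up_read(&mp->m_peraglock);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agbp);
	*freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
	xfs_trans_brelse(tp, agbp);	/* done with the AGI buffer */
	return 0;
}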
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
new file mode 100644
index 000000000000..db6d0015cecf
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.h
@@ -0,0 +1,184 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IALLOC_H__
33#define __XFS_IALLOC_H__
34
35struct xfs_buf;
36struct xfs_dinode;
37struct xfs_mount;
38struct xfs_trans;
39
40/*
41 * Allocation parameters for inode allocation.
42 */
43#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_INODES)
44int xfs_ialloc_inodes(struct xfs_mount *mp);
45#define XFS_IALLOC_INODES(mp) xfs_ialloc_inodes(mp)
46#else
47#define XFS_IALLOC_INODES(mp) ((mp)->m_ialloc_inos)
48#endif
49#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_BLOCKS)
50xfs_extlen_t xfs_ialloc_blocks(struct xfs_mount *mp);
51#define XFS_IALLOC_BLOCKS(mp) xfs_ialloc_blocks(mp)
52#else
53#define XFS_IALLOC_BLOCKS(mp) ((mp)->m_ialloc_blks)
54#endif
55
56/*
57 * For small block file systems, move inodes in clusters of this size.
58 * When we don't have a lot of memory, however, we go a bit smaller
59 * to reduce the number of AGI and ialloc btree blocks we need to keep
60 * around for xfs_dilocate(). We choose which one to use in
61 * xfs_mount_int().
62 */
63#define XFS_INODE_BIG_CLUSTER_SIZE 8192
64#define XFS_INODE_SMALL_CLUSTER_SIZE 4096
65#define	XFS_INODE_CLUSTER_SIZE(mp)	((mp)->m_inode_cluster_size)
66
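/*
 * Worked example (illustrative, not part of the original header): for a
 * filesystem with 512-byte blocks (sb_blocklog == 9) and the small
 * cluster size, an inode cluster covers
 *
 *	XFS_INODE_SMALL_CLUSTER_SIZE >> 9 == 4096 >> 9 == 8
 *
 * filesystem blocks; this is the blks_per_cluster value that
 * xfs_dilocate() computes.
 */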
67/*
68 * Make an inode pointer out of the buffer/offset.
69 */
70#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MAKE_IPTR)
71struct xfs_dinode *xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o);
72#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o)
73#else
74#define XFS_MAKE_IPTR(mp,b,o) \
75 ((xfs_dinode_t *)(xfs_buf_offset(b, (o) << (mp)->m_sb.sb_inodelog)))
76#endif
77
78/*
79 * Find a free (set) bit in the inode bitmask.
80 */
81#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_FIND_FREE)
82int xfs_ialloc_find_free(xfs_inofree_t *fp);
83#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp)
84#else
85#define XFS_IALLOC_FIND_FREE(fp) xfs_lowbit64(*(fp))
86#endif
87
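/*
 * Illustrative stand-in (not part of the original header) for what
 * XFS_IALLOC_FIND_FREE() computes: the index of the lowest set bit in
 * the free-inode mask, i.e. the first free inode in the chunk.
 * xfs_lowbit64() does this with word-size tricks; the plain loop below
 * only demonstrates the semantics.
 */
static inline int
xfs_ialloc_find_free_example(xfs_inofree_t *fp)
{
	int	i;

	for (i = 0; i < (int)(sizeof(*fp) * 8); i++)
		if (*fp & ((xfs_inofree_t)1 << i))
			return i;	/* offset of first free inode */
	return -1;			/* mask empty: no free inodes */
}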
88
89#ifdef __KERNEL__
90
91/*
92 * Prototypes for visible xfs_ialloc.c routines.
93 */
94
95/*
96 * Allocate an inode on disk.
97 * Mode is used to tell whether the new inode will need space, and whether
98 * it is a directory.
99 *
100 * To work within the constraint of one allocation per transaction,
101 * xfs_dialloc() is designed to be called twice if it has to do an
102 * allocation to make more free inodes. If an inode is
103 * available without an allocation, *agbp is set to the AGI buffer of
104 * the allocation group used and *alloc_done is set to false.
105 * If an allocation had to be done, *agbp is set to the AGI buffer of
106 * the allocation group that was extended and *alloc_done is set to true.
107 * The caller should then commit the current transaction and allocate a new
108 * transaction. xfs_dialloc() should then be called again with
109 * the agbp value returned from the previous call.
110 *
111 * Once we successfully pick an inode its number is returned and the
112 * on-disk data structures are updated. The inode itself is not read
113 * in, since doing so would break ordering constraints with xfs_reclaim.
114 *
115 * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
116 */
117int /* error */
118xfs_dialloc(
119 struct xfs_trans *tp, /* transaction pointer */
120 xfs_ino_t parent, /* parent inode (directory) */
121 mode_t mode, /* mode bits for new inode */
122 int okalloc, /* ok to allocate more space */
123 struct xfs_buf **agbp, /* buf for a.g. inode header */
124 boolean_t *alloc_done, /* an allocation was done to replenish
125 the free inodes */
126 xfs_ino_t *inop); /* inode number allocated */
127
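/*
 * A hedged sketch (not part of the original header) of the two-call
 * protocol described above.  Transaction reservation, commit and restart
 * are elided: where the comment says "commit", a real caller would
 * commit tp and begin a new transaction before the second call.  The
 * function name is hypothetical.
 */
static int
xfs_dialloc_example(
	xfs_trans_t	*tp,		/* current transaction */
	xfs_ino_t	parent,		/* parent directory inode */
	mode_t		mode,		/* mode of the new inode */
	xfs_ino_t	*inop)		/* output: allocated inode number */
{
	xfs_buf_t	*agbp = NULL;	/* must be NULL on the first call */
	boolean_t	alloc_done = B_FALSE;
	int		error;

	error = xfs_dialloc(tp, parent, mode, 1, &agbp, &alloc_done, inop);
	if (error)
		return error;
	if (alloc_done) {
		/*
		 * New inode chunks were allocated.  Commit tp and start a
		 * new transaction here, then retry with the same agbp.
		 */
		error = xfs_dialloc(tp, parent, mode, 1, &agbp, &alloc_done,
				    inop);
	}
	return error;
}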
128/*
129 * Free disk inode. Carefully avoids touching the incore inode, all
130 * manipulations incore are the caller's responsibility.
131 * The on-disk inode is not changed by this operation, only the
132 * btree (free inode mask) is changed.
133 */
134int /* error */
135xfs_difree(
136 struct xfs_trans *tp, /* transaction pointer */
137 xfs_ino_t inode, /* inode to be freed */
138 struct xfs_bmap_free *flist, /* extents to free */
139 int *delete, /* set if inode cluster was deleted */
140 xfs_ino_t *first_ino); /* first inode in deleted cluster */
141
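/*
 * A minimal sketch (not part of the original header) of a call to
 * xfs_difree() above from a transaction that already carries a bmap
 * free list.  Error handling is abbreviated and the wrapper name is
 * hypothetical.
 */
static int
xfs_difree_example(
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_ino_t		ino,	/* inode to free */
	struct xfs_bmap_free	*flist)	/* list of extents to free */
{
	int		delete;		/* whole cluster freed? */
	xfs_ino_t	first_ino;	/* first inode of freed cluster */
	int		error;

	error = xfs_difree(tp, ino, flist, &delete, &first_ino);
	if (error)
		return error;
	if (delete) {
		/*
		 * The entire cluster starting at first_ino is gone from
		 * the inode btree; its disk space is freed when the
		 * transaction commits (see xfs_bmap_add_free in xfs_difree).
		 */
	}
	return 0;
}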
142/*
143 * Return the location of the inode in bno/len/off,
144 * for mapping it into a buffer.
145 */
146int
147xfs_dilocate(
148 struct xfs_mount *mp, /* file system mount structure */
149 struct xfs_trans *tp, /* transaction pointer */
150 xfs_ino_t ino, /* inode to locate */
151 xfs_fsblock_t *bno, /* output: block containing inode */
152 int *len, /* output: num blocks in cluster*/
153 int *off, /* output: index in block of inode */
154 uint flags); /* flags for inode btree lookup */
155
156/*
157 * Compute and fill in value of m_in_maxlevels.
158 */
159void
160xfs_ialloc_compute_maxlevels(
161 struct xfs_mount *mp); /* file system mount structure */
162
163/*
164 * Log specified fields for the ag hdr (inode section)
165 */
166void
167xfs_ialloc_log_agi(
168 struct xfs_trans *tp, /* transaction pointer */
169 struct xfs_buf *bp, /* allocation group header buffer */
170 int fields); /* bitmask of fields to log */
171
172/*
173 * Read in the allocation group header (inode allocation section)
174 */
175int /* error */
176xfs_ialloc_read_agi(
177 struct xfs_mount *mp, /* file system mount structure */
178 struct xfs_trans *tp, /* transaction pointer */
179 xfs_agnumber_t agno, /* allocation group number */
180 struct xfs_buf **bpp); /* allocation group hdr buf */
181
182#endif /* __KERNEL__ */
183
184#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..2d4daecec990
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -0,0 +1,2094 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_alloc.h"
51#include "xfs_error.h"
52
53/*
54 * Inode allocation management for XFS.
55 */
56
57/*
58 * Prototypes for internal functions.
59 */
60
61STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
62STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
63STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
64STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
65STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
66STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
67STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
68STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
69 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
70STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
71
72/*
73 * Internal functions.
74 */
75
76/*
77 * Single level of the xfs_inobt_delete record deletion routine.
78 * Delete record pointed to by cur/level.
79 * Remove the record from its block then rebalance the tree.
80 * Set *stat to 0 for failure, 1 for done, 2 to go on to the next level.
81 */
82STATIC int /* error */
83xfs_inobt_delrec(
84 xfs_btree_cur_t *cur, /* btree cursor */
85 int level, /* level removing record from */
86 int *stat) /* fail/done/go-on */
87{
88 xfs_buf_t *agbp; /* buffer for a.g. inode header */
89 xfs_mount_t *mp; /* mount structure */
90 xfs_agi_t *agi; /* allocation group inode header */
91 xfs_inobt_block_t *block; /* btree block record/key lives in */
92 xfs_agblock_t bno; /* btree block number */
93 xfs_buf_t *bp; /* buffer for block */
94 int error; /* error return value */
95 int i; /* loop index */
96 xfs_inobt_key_t key; /* kp points here if block is level 0 */
97 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
98 xfs_agblock_t lbno; /* left block's block number */
99 xfs_buf_t *lbp; /* left block's buffer pointer */
100 xfs_inobt_block_t *left; /* left btree block */
101 xfs_inobt_key_t *lkp; /* left block key pointer */
102 xfs_inobt_ptr_t *lpp; /* left block address pointer */
103 int lrecs = 0; /* number of records in left block */
104 xfs_inobt_rec_t *lrp; /* left block record pointer */
105 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
106 int ptr; /* index in btree block for this rec */
107 xfs_agblock_t rbno; /* right block's block number */
108 xfs_buf_t *rbp; /* right block's buffer pointer */
109 xfs_inobt_block_t *right; /* right btree block */
110 xfs_inobt_key_t *rkp; /* right block key pointer */
111 xfs_inobt_rec_t *rp; /* pointer to btree records */
112 xfs_inobt_ptr_t *rpp; /* right block address pointer */
113 int rrecs = 0; /* number of records in right block */
114 int numrecs;
115 xfs_inobt_rec_t *rrp; /* right block record pointer */
116 xfs_btree_cur_t *tcur; /* temporary btree cursor */
117
118 mp = cur->bc_mp;
119
120 /*
121 * Get the index of the entry being deleted, check for nothing there.
122 */
123 ptr = cur->bc_ptrs[level];
124 if (ptr == 0) {
125 *stat = 0;
126 return 0;
127 }
128
129 /*
130 * Get the buffer & block containing the record or key/ptr.
131 */
132 bp = cur->bc_bufs[level];
133 block = XFS_BUF_TO_INOBT_BLOCK(bp);
134#ifdef DEBUG
135 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
136 return error;
137#endif
138 /*
139 * Fail if we're off the end of the block.
140 */
141
142 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
143 if (ptr > numrecs) {
144 *stat = 0;
145 return 0;
146 }
147 /*
148 * It's a nonleaf. Excise the key and ptr being deleted, by
149 * sliding the entries past them down one.
150 * Log the changed areas of the block.
151 */
152 if (level > 0) {
153 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
154 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
155#ifdef DEBUG
156 for (i = ptr; i < numrecs; i++) {
157 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level)))
158 return error;
159 }
160#endif
161 if (ptr < numrecs) {
162 memmove(&kp[ptr - 1], &kp[ptr],
163 (numrecs - ptr) * sizeof(*kp));
164 memmove(&pp[ptr - 1], &pp[ptr],
 165			(numrecs - ptr) * sizeof(*pp));
166 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
167 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
168 }
169 }
170 /*
171 * It's a leaf. Excise the record being deleted, by sliding the
172 * entries past it down one. Log the changed areas of the block.
173 */
174 else {
175 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
176 if (ptr < numrecs) {
177 memmove(&rp[ptr - 1], &rp[ptr],
178 (numrecs - ptr) * sizeof(*rp));
179 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
180 }
181 /*
182 * If it's the first record in the block, we'll need a key
183 * structure to pass up to the next level (updkey).
184 */
185 if (ptr == 1) {
186 key.ir_startino = rp->ir_startino;
187 kp = &key;
188 }
189 }
190 /*
191 * Decrement and log the number of entries in the block.
192 */
193 numrecs--;
194 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
195 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
196 /*
197 * Is this the root level? If so, we're almost done.
198 */
199 if (level == cur->bc_nlevels - 1) {
200 /*
201 * If this is the root level,
202 * and there's only one entry left,
203 * and it's NOT the leaf level,
204 * then we can get rid of this level.
205 */
206 if (numrecs == 1 && level > 0) {
207 agbp = cur->bc_private.i.agbp;
208 agi = XFS_BUF_TO_AGI(agbp);
209 /*
210 * pp is still set to the first pointer in the block.
211 * Make it the new root of the btree.
212 */
213 bno = INT_GET(agi->agi_root, ARCH_CONVERT);
214 agi->agi_root = *pp;
215 INT_MOD(agi->agi_level, ARCH_CONVERT, -1);
216 /*
217 * Free the block.
218 */
219 if ((error = xfs_free_extent(cur->bc_tp,
220 XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
221 return error;
222 xfs_trans_binval(cur->bc_tp, bp);
223 xfs_ialloc_log_agi(cur->bc_tp, agbp,
224 XFS_AGI_ROOT | XFS_AGI_LEVEL);
225 /*
226 * Update the cursor so there's one fewer level.
227 */
228 cur->bc_bufs[level] = NULL;
229 cur->bc_nlevels--;
230 } else if (level > 0 &&
231 (error = xfs_inobt_decrement(cur, level, &i)))
232 return error;
233 *stat = 1;
234 return 0;
235 }
236 /*
237 * If we deleted the leftmost entry in the block, update the
238 * key values above us in the tree.
239 */
240 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
241 return error;
242 /*
243 * If the number of records remaining in the block is at least
244 * the minimum, we're done.
245 */
246 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
247 if (level > 0 &&
248 (error = xfs_inobt_decrement(cur, level, &i)))
249 return error;
250 *stat = 1;
251 return 0;
252 }
253 /*
254 * Otherwise, we have to move some records around to keep the
255 * tree balanced. Look at the left and right sibling blocks to
256 * see if we can re-balance by moving only one record.
257 */
258 rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
259 lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
260 bno = NULLAGBLOCK;
261 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
262 /*
263 * Duplicate the cursor so our btree manipulations here won't
264 * disrupt the next level up.
265 */
266 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
267 return error;
268 /*
269 * If there's a right sibling, see if it's ok to shift an entry
270 * out of it.
271 */
272 if (rbno != NULLAGBLOCK) {
273 /*
274 * Move the temp cursor to the last entry in the next block.
275 * Actually any entry but the first would suffice.
276 */
277 i = xfs_btree_lastrec(tcur, level);
278 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
279 if ((error = xfs_inobt_increment(tcur, level, &i)))
280 goto error0;
281 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
282 i = xfs_btree_lastrec(tcur, level);
283 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
284 /*
285 * Grab a pointer to the block.
286 */
287 rbp = tcur->bc_bufs[level];
288 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
289#ifdef DEBUG
290 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
291 goto error0;
292#endif
293 /*
294 * Grab the current block number, for future use.
295 */
296 bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
297 /*
298 * If right block is full enough so that removing one entry
299 * won't make it too empty, and left-shifting an entry out
300 * of right to us works, we're done.
301 */
302 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
303 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
304 if ((error = xfs_inobt_lshift(tcur, level, &i)))
305 goto error0;
306 if (i) {
307 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
308 XFS_INOBT_BLOCK_MINRECS(level, cur));
309 xfs_btree_del_cursor(tcur,
310 XFS_BTREE_NOERROR);
311 if (level > 0 &&
312 (error = xfs_inobt_decrement(cur, level,
313 &i)))
314 return error;
315 *stat = 1;
316 return 0;
317 }
318 }
319 /*
320 * Otherwise, grab the number of records in right for
321 * future reference, and fix up the temp cursor to point
322 * to our block again (last record).
323 */
324 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
325 if (lbno != NULLAGBLOCK) {
326 xfs_btree_firstrec(tcur, level);
327 if ((error = xfs_inobt_decrement(tcur, level, &i)))
328 goto error0;
329 }
330 }
331 /*
332 * If there's a left sibling, see if it's ok to shift an entry
333 * out of it.
334 */
335 if (lbno != NULLAGBLOCK) {
336 /*
337 * Move the temp cursor to the first entry in the
338 * previous block.
339 */
340 xfs_btree_firstrec(tcur, level);
341 if ((error = xfs_inobt_decrement(tcur, level, &i)))
342 goto error0;
343 xfs_btree_firstrec(tcur, level);
344 /*
345 * Grab a pointer to the block.
346 */
347 lbp = tcur->bc_bufs[level];
348 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
349#ifdef DEBUG
350 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
351 goto error0;
352#endif
353 /*
354 * Grab the current block number, for future use.
355 */
356 bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
357 /*
358 * If left block is full enough so that removing one entry
359 * won't make it too empty, and right-shifting an entry out
360 * of left to us works, we're done.
361 */
362 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
363 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
364 if ((error = xfs_inobt_rshift(tcur, level, &i)))
365 goto error0;
366 if (i) {
367 ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
368 XFS_INOBT_BLOCK_MINRECS(level, cur));
369 xfs_btree_del_cursor(tcur,
370 XFS_BTREE_NOERROR);
371 if (level == 0)
372 cur->bc_ptrs[0]++;
373 *stat = 1;
374 return 0;
375 }
376 }
377 /*
 378	 * Otherwise, grab the number of records in left for
379 * future reference.
380 */
381 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
382 }
383 /*
384 * Delete the temp cursor, we're done with it.
385 */
386 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
387 /*
388 * If here, we need to do a join to keep the tree balanced.
389 */
390 ASSERT(bno != NULLAGBLOCK);
391 /*
392 * See if we can join with the left neighbor block.
393 */
394 if (lbno != NULLAGBLOCK &&
395 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
396 /*
397 * Set "right" to be the starting block,
398 * "left" to be the left neighbor.
399 */
400 rbno = bno;
401 right = block;
402 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
403 rbp = bp;
404 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
405 cur->bc_private.i.agno, lbno, 0, &lbp,
406 XFS_INO_BTREE_REF)))
407 return error;
408 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
409 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
410 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
411 return error;
412 }
413 /*
414 * If that won't work, see if we can join with the right neighbor block.
415 */
416 else if (rbno != NULLAGBLOCK &&
417 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
418 /*
419 * Set "left" to be the starting block,
420 * "right" to be the right neighbor.
421 */
422 lbno = bno;
423 left = block;
424 lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
425 lbp = bp;
426 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
427 cur->bc_private.i.agno, rbno, 0, &rbp,
428 XFS_INO_BTREE_REF)))
429 return error;
430 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
431 rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
432 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
433 return error;
434 }
435 /*
436 * Otherwise, we can't fix the imbalance.
437 * Just return. This is probably a logic error, but it's not fatal.
438 */
439 else {
440 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
441 return error;
442 *stat = 1;
443 return 0;
444 }
445 /*
446 * We're now going to join "left" and "right" by moving all the stuff
447 * in "right" to "left" and deleting "right".
448 */
449 if (level > 0) {
450 /*
451 * It's a non-leaf. Move keys and pointers.
452 */
453 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
454 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
455 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
456 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
457#ifdef DEBUG
458 for (i = 0; i < rrecs; i++) {
459 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
460 return error;
461 }
462#endif
463 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
464 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
465 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
466 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
467 } else {
468 /*
469 * It's a leaf. Move records.
470 */
471 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
472 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
473 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
474 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
475 }
476 /*
477 * If we joined with the left neighbor, set the buffer in the
478 * cursor to the left block, and fix up the index.
479 */
480 if (bp != lbp) {
481 xfs_btree_setbuf(cur, level, lbp);
482 cur->bc_ptrs[level] += lrecs;
483 }
484 /*
485 * If we joined with the right neighbor and there's a level above
486 * us, increment the cursor at that level.
487 */
488 else if (level + 1 < cur->bc_nlevels &&
 489		 (error = xfs_inobt_increment(cur, level + 1, &i)))
490 return error;
491 /*
492 * Fix up the number of records in the surviving block.
493 */
494 lrecs += rrecs;
495 INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
496 /*
497 * Fix up the right block pointer in the surviving block, and log it.
498 */
499 left->bb_rightsib = right->bb_rightsib;
500 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
501 /*
502 * If there is a right sibling now, make it point to the
503 * remaining block.
504 */
505 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
506 xfs_inobt_block_t *rrblock;
507 xfs_buf_t *rrbp;
508
509 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
510 cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
511 &rrbp, XFS_INO_BTREE_REF)))
512 return error;
513 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
514 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
515 return error;
516 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
517 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
518 }
519 /*
520 * Free the deleting block.
521 */
522 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
523 cur->bc_private.i.agno, rbno), 1)))
524 return error;
525 xfs_trans_binval(cur->bc_tp, rbp);
526 /*
527 * Readjust the ptr at this level if it's not a leaf, since it's
528 * still pointing at the deletion point, which makes the cursor
529 * inconsistent. If this makes the ptr 0, the caller fixes it up.
530 * We can't use decrement because it would change the next level up.
531 */
532 if (level > 0)
533 cur->bc_ptrs[level]--;
534 /*
535 * Return value means the next level up has something to do.
536 */
537 *stat = 2;
538 return 0;
539
540error0:
541 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
542 return error;
543}
544
545/*
546 * Insert one record/level. Return information to the caller
547 * allowing the next level up to proceed if necessary.
548 */
549STATIC int /* error */
550xfs_inobt_insrec(
551 xfs_btree_cur_t *cur, /* btree cursor */
552 int level, /* level to insert record at */
553 xfs_agblock_t *bnop, /* i/o: block number inserted */
554 xfs_inobt_rec_t *recp, /* i/o: record data inserted */
555 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
556 int *stat) /* success/failure */
557{
558 xfs_inobt_block_t *block; /* btree block record/key lives in */
559 xfs_buf_t *bp; /* buffer for block */
560 int error; /* error return value */
561 int i; /* loop index */
562 xfs_inobt_key_t key; /* key value being inserted */
563 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
564 xfs_agblock_t nbno; /* block number of allocated block */
565 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
566 xfs_inobt_key_t nkey; /* new key value, from split */
567 xfs_inobt_rec_t nrec; /* new record value, for caller */
568 int numrecs;
569 int optr; /* old ptr value */
570 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
571 int ptr; /* index in btree block for this rec */
572 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
573
574 /*
575 * If we made it to the root level, allocate a new root block
576 * and we're done.
577 */
578 if (level >= cur->bc_nlevels) {
579 error = xfs_inobt_newroot(cur, &i);
580 *bnop = NULLAGBLOCK;
581 *stat = i;
582 return error;
583 }
584 /*
585 * Make a key out of the record data to be inserted, and save it.
586 */
587 key.ir_startino = recp->ir_startino; /* INT_: direct copy */
588 optr = ptr = cur->bc_ptrs[level];
589 /*
590 * If we're off the left edge, return failure.
591 */
592 if (ptr == 0) {
593 *stat = 0;
594 return 0;
595 }
596 /*
597 * Get pointers to the btree buffer and block.
598 */
599 bp = cur->bc_bufs[level];
600 block = XFS_BUF_TO_INOBT_BLOCK(bp);
601 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
602#ifdef DEBUG
603 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
604 return error;
605 /*
606 * Check that the new entry is being inserted in the right place.
607 */
608 if (ptr <= numrecs) {
609 if (level == 0) {
610 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
611 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
612 } else {
613 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
614 xfs_btree_check_key(cur->bc_btnum, &key, kp);
615 }
616 }
617#endif
618 nbno = NULLAGBLOCK;
619 ncur = (xfs_btree_cur_t *)0;
620 /*
621 * If the block is full, we can't insert the new entry until we
622 * make the block un-full.
623 */
624 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
625 /*
626 * First, try shifting an entry to the right neighbor.
627 */
628 if ((error = xfs_inobt_rshift(cur, level, &i)))
629 return error;
630 if (i) {
631 /* nothing */
632 }
633 /*
634 * Next, try shifting an entry to the left neighbor.
635 */
636 else {
637 if ((error = xfs_inobt_lshift(cur, level, &i)))
638 return error;
639 if (i) {
640 optr = ptr = cur->bc_ptrs[level];
641 } else {
642 /*
643 * Next, try splitting the current block
644 * in half. If this works we have to
645 * re-set our variables because
646 * we could be in a different block now.
647 */
648 if ((error = xfs_inobt_split(cur, level, &nbno,
649 &nkey, &ncur, &i)))
650 return error;
651 if (i) {
652 bp = cur->bc_bufs[level];
653 block = XFS_BUF_TO_INOBT_BLOCK(bp);
654#ifdef DEBUG
655 if ((error = xfs_btree_check_sblock(cur,
656 block, level, bp)))
657 return error;
658#endif
659 ptr = cur->bc_ptrs[level];
660 nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
661 } else {
662 /*
663 * Otherwise the insert fails.
664 */
665 *stat = 0;
666 return 0;
667 }
668 }
669 }
670 }
671 /*
672 * At this point we know there's room for our new entry in the block
673 * we're pointing at.
674 */
675 numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
676 if (level > 0) {
677 /*
678 * It's a non-leaf entry. Make a hole for the new data
679 * in the key and ptr regions of the block.
680 */
681 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
682 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
683#ifdef DEBUG
684 for (i = numrecs; i >= ptr; i--) {
685 if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
686 return error;
687 }
688#endif
689 memmove(&kp[ptr], &kp[ptr - 1],
690 (numrecs - ptr + 1) * sizeof(*kp));
691 memmove(&pp[ptr], &pp[ptr - 1],
692 (numrecs - ptr + 1) * sizeof(*pp));
693 /*
694 * Now stuff the new data in, bump numrecs and log the new data.
695 */
696#ifdef DEBUG
697 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
698 return error;
699#endif
700 kp[ptr - 1] = key; /* INT_: struct copy */
701 INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
702 numrecs++;
703 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
704 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
705 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
706 } else {
707 /*
708 * It's a leaf entry. Make a hole for the new record.
709 */
710 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
711 memmove(&rp[ptr], &rp[ptr - 1],
712 (numrecs - ptr + 1) * sizeof(*rp));
713 /*
714 * Now stuff the new record in, bump numrecs
715 * and log the new data.
716 */
717 rp[ptr - 1] = *recp; /* INT_: struct copy */
718 numrecs++;
719 INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
720 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
721 }
722 /*
723 * Log the new number of records in the btree header.
724 */
725 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
726#ifdef DEBUG
727 /*
728 * Check that the key/record is in the right place, now.
729 */
730 if (ptr < numrecs) {
731 if (level == 0)
732 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
733 rp + ptr);
734 else
735 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
736 kp + ptr);
737 }
738#endif
739 /*
740 * If we inserted at the start of a block, update the parents' keys.
741 */
742 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
743 return error;
744 /*
745 * Return the new block number, if any.
746 * If there is one, give back a record value and a cursor too.
747 */
748 *bnop = nbno;
749 if (nbno != NULLAGBLOCK) {
750 *recp = nrec; /* INT_: struct copy */
751 *curp = ncur;
752 }
753 *stat = 1;
754 return 0;
755}
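/*
 * A standalone demonstration (not in the original file) of the memmove
 * "make a hole" idiom used by xfs_inobt_insrec() above: to insert at
 * 1-based index ptr among numrecs records, slide records ptr..numrecs up
 * by one and write the new record at ptr.
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int rp[8] = { 10, 20, 40, 50 };	/* room for one more record */
	int numrecs = 4;
	int ptr = 3;			/* insert 30 as the 3rd record */
	int i;

	memmove(&rp[ptr], &rp[ptr - 1], (numrecs - ptr + 1) * sizeof(*rp));
	rp[ptr - 1] = 30;
	numrecs++;
	for (i = 0; i < numrecs; i++)
		printf("%d ", rp[i]);	/* 10 20 30 40 50 */
	printf("\n");
	return 0;
}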
756
757/*
758 * Log header fields from a btree block.
759 */
760STATIC void
761xfs_inobt_log_block(
762 xfs_trans_t *tp, /* transaction pointer */
763 xfs_buf_t *bp, /* buffer containing btree block */
764 int fields) /* mask of fields: XFS_BB_... */
765{
766 int first; /* first byte offset logged */
767 int last; /* last byte offset logged */
768 static const short offsets[] = { /* table of offsets */
769 offsetof(xfs_inobt_block_t, bb_magic),
770 offsetof(xfs_inobt_block_t, bb_level),
771 offsetof(xfs_inobt_block_t, bb_numrecs),
772 offsetof(xfs_inobt_block_t, bb_leftsib),
773 offsetof(xfs_inobt_block_t, bb_rightsib),
774 sizeof(xfs_inobt_block_t)
775 };
776
777 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
778 xfs_trans_log_buf(tp, bp, first, last);
779}
780
781/*
782 * Log keys from a btree block (nonleaf).
783 */
784STATIC void
785xfs_inobt_log_keys(
786 xfs_btree_cur_t *cur, /* btree cursor */
787 xfs_buf_t *bp, /* buffer containing btree block */
788 int kfirst, /* index of first key to log */
789 int klast) /* index of last key to log */
790{
791 xfs_inobt_block_t *block; /* btree block to log from */
792 int first; /* first byte offset logged */
793 xfs_inobt_key_t *kp; /* key pointer in btree block */
794 int last; /* last byte offset logged */
795
796 block = XFS_BUF_TO_INOBT_BLOCK(bp);
797 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
798 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
799 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
800 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
801}
802
803/*
804 * Log block pointer fields from a btree block (nonleaf).
805 */
806STATIC void
807xfs_inobt_log_ptrs(
808 xfs_btree_cur_t *cur, /* btree cursor */
809 xfs_buf_t *bp, /* buffer containing btree block */
810 int pfirst, /* index of first pointer to log */
811 int plast) /* index of last pointer to log */
812{
813 xfs_inobt_block_t *block; /* btree block to log from */
814 int first; /* first byte offset logged */
815 int last; /* last byte offset logged */
816 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
817
818 block = XFS_BUF_TO_INOBT_BLOCK(bp);
819 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
820 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
821 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
822 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
823}
824
825/*
826 * Log records from a btree block (leaf).
827 */
828STATIC void
829xfs_inobt_log_recs(
830 xfs_btree_cur_t *cur, /* btree cursor */
831 xfs_buf_t *bp, /* buffer containing btree block */
832 int rfirst, /* index of first record to log */
833 int rlast) /* index of last record to log */
834{
835 xfs_inobt_block_t *block; /* btree block to log from */
836 int first; /* first byte offset logged */
837 int last; /* last byte offset logged */
838 xfs_inobt_rec_t *rp; /* record pointer for btree block */
839
840 block = XFS_BUF_TO_INOBT_BLOCK(bp);
841 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
842 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
843 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
844 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
845}
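/*
 * A self-contained sketch (not in the original file) of the byte-range
 * arithmetic shared by the three log helpers above: for a 1-based slice
 * first..last of fixed-size entries at a known offset inside a block,
 * compute the first and last byte offsets to log.  All sizes here are
 * hypothetical.
 */
#include <stdio.h>

int
main(void)
{
	int entry_size = 16;	/* hypothetical sizeof(xfs_inobt_rec_t) */
	int base = 24;		/* hypothetical offset of entry 1 in block */
	int rfirst = 2, rlast = 4;

	int first = base + (rfirst - 1) * entry_size;
	int last = base + rlast * entry_size - 1;

	printf("log bytes %d..%d\n", first, last);	/* 40..87 */
	return 0;
}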
846
847/*
848 * Lookup the record. The cursor is made to point to it, based on dir.
 849 * Set *stat to 0 if no such record can be found, 1 for success.
850 */
851STATIC int /* error */
852xfs_inobt_lookup(
853 xfs_btree_cur_t *cur, /* btree cursor */
854 xfs_lookup_t dir, /* <=, ==, or >= */
855 int *stat) /* success/failure */
856{
857 xfs_agblock_t agbno; /* a.g. relative btree block number */
858 xfs_agnumber_t agno; /* allocation group number */
859 xfs_inobt_block_t *block=NULL; /* current btree block */
860 __int64_t diff; /* difference for the current key */
861 int error; /* error return value */
862 int keyno=0; /* current key number */
863 int level; /* level in the btree */
864 xfs_mount_t *mp; /* file system mount point */
865
866 /*
867 * Get the allocation group header, and the root block number.
868 */
869 mp = cur->bc_mp;
870 {
871 xfs_agi_t *agi; /* a.g. inode header */
872
873 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
874 agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
875 agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
876 }
877 /*
878 * Iterate over each level in the btree, starting at the root.
879 * For each level above the leaves, find the key we need, based
880 * on the lookup record, then follow the corresponding block
881 * pointer down to the next level.
882 */
883 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
884 xfs_buf_t *bp; /* buffer pointer for btree block */
885 xfs_daddr_t d; /* disk address of btree block */
886
887 /*
888 * Get the disk address we're looking for.
889 */
890 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
891 /*
892 * If the old buffer at this level is for a different block,
893 * throw it away, otherwise just use it.
894 */
895 bp = cur->bc_bufs[level];
896 if (bp && XFS_BUF_ADDR(bp) != d)
897 bp = (xfs_buf_t *)0;
898 if (!bp) {
899 /*
900 * Need to get a new buffer. Read it, then
901 * set it in the cursor, releasing the old one.
902 */
903 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
904 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
905 return error;
906 xfs_btree_setbuf(cur, level, bp);
907 /*
908 * Point to the btree block, now that we have the buffer
909 */
910 block = XFS_BUF_TO_INOBT_BLOCK(bp);
911 if ((error = xfs_btree_check_sblock(cur, block, level,
912 bp)))
913 return error;
914 } else
915 block = XFS_BUF_TO_INOBT_BLOCK(bp);
916 /*
917 * If we already had a key match at a higher level, we know
918 * we need to use the first entry in this block.
919 */
920 if (diff == 0)
921 keyno = 1;
922 /*
923 * Otherwise we need to search this block. Do a binary search.
924 */
925 else {
926 int high; /* high entry number */
927 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
928 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
929 int low; /* low entry number */
930
931 /*
932 * Get a pointer to keys or records.
933 */
934 if (level > 0)
935 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
936 else
937 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
938 /*
939 * Set low and high entry numbers, 1-based.
940 */
941 low = 1;
942 if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
943 /*
944 * If the block is empty, the tree must
945 * be an empty leaf.
946 */
947 ASSERT(level == 0 && cur->bc_nlevels == 1);
948 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
949 *stat = 0;
950 return 0;
951 }
952 /*
953 * Binary search the block.
954 */
955 while (low <= high) {
956 xfs_agino_t startino; /* key value */
957
958 /*
959 * keyno is average of low and high.
960 */
961 keyno = (low + high) >> 1;
962 /*
963 * Get startino.
964 */
965 if (level > 0) {
966 xfs_inobt_key_t *kkp;
967
968 kkp = kkbase + keyno - 1;
969 startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
970 } else {
971 xfs_inobt_rec_t *krp;
972
973 krp = krbase + keyno - 1;
974 startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
975 }
976 /*
977 * Compute difference to get next direction.
978 */
979 diff = (__int64_t)
980 startino - cur->bc_rec.i.ir_startino;
981 /*
982 * Less than, move right.
983 */
984 if (diff < 0)
985 low = keyno + 1;
986 /*
987 * Greater than, move left.
988 */
989 else if (diff > 0)
990 high = keyno - 1;
991 /*
992 * Equal, we're done.
993 */
994 else
995 break;
996 }
997 }
998 /*
999 * If there are more levels, set up for the next level
1000 * by getting the block number and filling in the cursor.
1001 */
1002 if (level > 0) {
1003 /*
1004 * If we moved left, need the previous key number,
1005 * unless there isn't one.
1006 */
1007 if (diff > 0 && --keyno < 1)
1008 keyno = 1;
1009 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
1010#ifdef DEBUG
1011 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1012 return error;
1013#endif
1014 cur->bc_ptrs[level] = keyno;
1015 }
1016 }
1017 /*
1018 * Done with the search.
1019 * See if we need to adjust the results.
1020 */
1021 if (dir != XFS_LOOKUP_LE && diff < 0) {
1022 keyno++;
1023 /*
1024 * If ge search and we went off the end of the block, but it's
1025 * not the last block, we're in the wrong block.
1026 */
1027 if (dir == XFS_LOOKUP_GE &&
1028 keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
1029 INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1030 int i;
1031
1032 cur->bc_ptrs[0] = keyno;
1033 if ((error = xfs_inobt_increment(cur, 0, &i)))
1034 return error;
1035 ASSERT(i == 1);
1036 *stat = 1;
1037 return 0;
1038 }
1039 }
1040 else if (dir == XFS_LOOKUP_LE && diff > 0)
1041 keyno--;
1042 cur->bc_ptrs[0] = keyno;
1043 /*
1044 * Return if we succeeded or not.
1045 */
1046 if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
1047 *stat = 0;
1048 else
1049 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1050 return 0;
1051}
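/*
 * A standalone sketch (not in the original file) of the 1-based binary
 * search in xfs_inobt_lookup() above: find the keyno whose startino is
 * <= the search value, in the style of an XFS_LOOKUP_LE lookup.  The
 * key values are made up.
 */
#include <stdio.h>

int
main(void)
{
	int startino[] = { 0, 64, 128, 256, 512 };	/* keys 1..5 */
	int nrecs = 5;
	int want = 200;
	int low = 1, high = nrecs, keyno = 0;
	long diff = 1;

	while (low <= high) {
		keyno = (low + high) >> 1;
		diff = (long)startino[keyno - 1] - want;
		if (diff < 0)
			low = keyno + 1;	/* key too small, go right */
		else if (diff > 0)
			high = keyno - 1;	/* key too big, go left */
		else
			break;
	}
	if (diff > 0)		/* LE lookup: step back one entry */
		keyno--;
	printf("keyno = %d (startino %d)\n", keyno, startino[keyno - 1]);
	return 0;
}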
1052
1053/*
1054 * Move 1 record left from cur/level if possible.
1055 * Update cur to reflect the new path.
1056 */
1057STATIC int /* error */
1058xfs_inobt_lshift(
1059 xfs_btree_cur_t *cur, /* btree cursor */
1060 int level, /* level to shift record on */
1061 int *stat) /* success/failure */
1062{
1063 int error; /* error return value */
1064#ifdef DEBUG
1065 int i; /* loop index */
1066#endif
1067 xfs_inobt_key_t key; /* key value for leaf level upward */
1068 xfs_buf_t *lbp; /* buffer for left neighbor block */
1069 xfs_inobt_block_t *left; /* left neighbor btree block */
1070 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1071 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1072 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1073 int nrec; /* new number of left block entries */
1074 xfs_buf_t *rbp; /* buffer for right (current) block */
1075 xfs_inobt_block_t *right; /* right (current) btree block */
1076 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1077 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1078 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1079
1080 /*
1081 * Set up variables for this block as "right".
1082 */
1083 rbp = cur->bc_bufs[level];
1084 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1085#ifdef DEBUG
1086 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1087 return error;
1088#endif
1089 /*
1090 * If we've got no left sibling then we can't shift an entry left.
1091 */
1092 if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1093 *stat = 0;
1094 return 0;
1095 }
1096 /*
1097 * If the cursor entry is the one that would be moved, don't
1098 * do it... it's too complicated.
1099 */
1100 if (cur->bc_ptrs[level] <= 1) {
1101 *stat = 0;
1102 return 0;
1103 }
1104 /*
1105 * Set up the left neighbor as "left".
1106 */
1107 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1108 cur->bc_private.i.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
1109 XFS_INO_BTREE_REF)))
1110 return error;
1111 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1112 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1113 return error;
1114 /*
1115 * If it's full, it can't take another entry.
1116 */
1117 if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1118 *stat = 0;
1119 return 0;
1120 }
1121 nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
1122 /*
1123 * If non-leaf, copy a key and a ptr to the left block.
1124 */
1125 if (level > 0) {
1126 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1127 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1128 *lkp = *rkp;
1129 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1130 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1131 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1132#ifdef DEBUG
1133 if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
1134 return error;
1135#endif
1136 *lpp = *rpp; /* INT_: no-change copy */
1137 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1138 }
1139 /*
1140 * If leaf, copy a record to the left block.
1141 */
1142 else {
1143 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1144 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1145 *lrp = *rrp;
1146 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1147 }
1148 /*
1149 * Bump and log left's numrecs, decrement and log right's numrecs.
1150 */
1151 INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
1152 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1153#ifdef DEBUG
1154 if (level > 0)
1155 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1156 else
1157 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1158#endif
1159 INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
1160 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1161 /*
1162 * Slide the contents of right down one entry.
1163 */
1164 if (level > 0) {
1165#ifdef DEBUG
1166 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1167 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
1168 level)))
1169 return error;
1170 }
1171#endif
1172 memmove(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1173 memmove(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1174 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1175 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1176 } else {
1177 memmove(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1178 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1179 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
1180 rkp = &key;
1181 }
1182 /*
1183 * Update the parent key values of right.
1184 */
1185 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1186 return error;
1187 /*
1188 * Slide the cursor value left one.
1189 */
1190 cur->bc_ptrs[level]--;
1191 *stat = 1;
1192 return 0;
1193}
1194
1195/*
1196 * Allocate a new root block, fill it in.
1197 */
1198STATIC int /* error */
1199xfs_inobt_newroot(
1200 xfs_btree_cur_t *cur, /* btree cursor */
1201 int *stat) /* success/failure */
1202{
1203 xfs_agi_t *agi; /* a.g. inode header */
1204 xfs_alloc_arg_t args; /* allocation argument structure */
1205 xfs_inobt_block_t *block; /* one half of the old root block */
1206 xfs_buf_t *bp; /* buffer containing block */
1207 int error; /* error return value */
1208 xfs_inobt_key_t *kp; /* btree key pointer */
1209 xfs_agblock_t lbno; /* left block number */
1210 xfs_buf_t *lbp; /* left buffer pointer */
1211 xfs_inobt_block_t *left; /* left btree block */
1212 xfs_buf_t *nbp; /* new (root) buffer */
1213 xfs_inobt_block_t *new; /* new (root) btree block */
1214 int nptr; /* new value for key index, 1 or 2 */
1215 xfs_inobt_ptr_t *pp; /* btree address pointer */
1216 xfs_agblock_t rbno; /* right block number */
1217 xfs_buf_t *rbp; /* right buffer pointer */
1218 xfs_inobt_block_t *right; /* right btree block */
1219 xfs_inobt_rec_t *rp; /* btree record pointer */
1220
1221 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
1222
1223 /*
1224 * Get a block & a buffer.
1225 */
1226 agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
1227 args.tp = cur->bc_tp;
1228 args.mp = cur->bc_mp;
1229 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
1230 INT_GET(agi->agi_root, ARCH_CONVERT));
1231 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1232 args.isfl = args.userdata = args.minalignslop = 0;
1233 args.minlen = args.maxlen = args.prod = 1;
1234 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1235 if ((error = xfs_alloc_vextent(&args)))
1236 return error;
1237 /*
1238 * None available, we fail.
1239 */
1240 if (args.fsbno == NULLFSBLOCK) {
1241 *stat = 0;
1242 return 0;
1243 }
1244 ASSERT(args.len == 1);
1245 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1246 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1247 /*
1248 * Set the root data in the a.g. inode structure.
1249 */
1250 INT_SET(agi->agi_root, ARCH_CONVERT, args.agbno);
1251 INT_MOD(agi->agi_level, ARCH_CONVERT, 1);
1252 xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
1253 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1254 /*
1255 * At the previous root level there are now two blocks: the old
1256 * root, and the new block generated when it was split.
1257 * We don't know which one the cursor is pointing at, so we
1258 * set up variables "left" and "right" for each case.
1259 */
1260 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1261 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1262#ifdef DEBUG
1263 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1264 return error;
1265#endif
1266 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1267 /*
1268 * Our block is left, pick up the right block.
1269 */
1270 lbp = bp;
1271 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1272 left = block;
1273 rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
1274 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1275 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1276 return error;
1277 bp = rbp;
1278 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1279 if ((error = xfs_btree_check_sblock(cur, right,
1280 cur->bc_nlevels - 1, rbp)))
1281 return error;
1282 nptr = 1;
1283 } else {
1284 /*
1285 * Our block is right, pick up the left block.
1286 */
1287 rbp = bp;
1288 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1289 right = block;
1290 lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
1291 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1292 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1293 return error;
1294 bp = lbp;
1295 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1296 if ((error = xfs_btree_check_sblock(cur, left,
1297 cur->bc_nlevels - 1, lbp)))
1298 return error;
1299 nptr = 2;
1300 }
1301 /*
1302 * Fill in the new block's btree header and log it.
1303 */
1304 INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1305 INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
1306 INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
1307 INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
1308 INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
1309 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1310 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1311 /*
1312 * Fill in the key data in the new root.
1313 */
1314 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1315 if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
1316 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
1317 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
1318 } else {
1319 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1320 INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
1321 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1322 INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
1323 }
1324 xfs_inobt_log_keys(cur, nbp, 1, 2);
1325 /*
1326 * Fill in the pointer data in the new root.
1327 */
1328 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1329 INT_SET(pp[0], ARCH_CONVERT, lbno);
1330 INT_SET(pp[1], ARCH_CONVERT, rbno);
1331 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1332 /*
1333 * Fix up the cursor.
1334 */
1335 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1336 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1337 cur->bc_nlevels++;
1338 *stat = 1;
1339 return 0;
1340}
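
A sketch of the invariant xfs_inobt_newroot() establishes: whichever sibling
the cursor happened to be on, the freshly allocated root always ends up with
exactly two entries, one keyed for each sibling of the old root level. This
is a simplified userspace model with integers standing in for keys and AG
block numbers; the names are hypothetical, not kernel API.

#include <assert.h>

struct root_model {
	int	keys[2];	/* first key of each child block */
	int	ptrs[2];	/* AG block number of each child */
	int	numrecs;
};

static void newroot_model(struct root_model *root,
			  int lkey, int lbno, int rkey, int rbno)
{
	root->keys[0] = lkey;
	root->ptrs[0] = lbno;
	root->keys[1] = rkey;
	root->ptrs[1] = rbno;
	root->numrecs = 2;	/* always exactly two, like bb_numrecs */
}

int main(void)
{
	struct root_model r;

	newroot_model(&r, 16, 7, 80, 9);
	assert(r.numrecs == 2 && r.ptrs[0] == 7 && r.ptrs[1] == 9);
	assert(r.keys[0] < r.keys[1]);	/* keys stay ordered */
	return 0;
}
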
1341
1342/*
1343 * Move 1 record right from cur/level if possible.
1344 * Update cur to reflect the new path.
1345 */
1346STATIC int /* error */
1347xfs_inobt_rshift(
1348 xfs_btree_cur_t *cur, /* btree cursor */
1349 int level, /* level to shift record on */
1350 int *stat) /* success/failure */
1351{
1352 int error; /* error return value */
1353 int i; /* loop index */
1354 xfs_inobt_key_t key; /* key value for leaf level upward */
1355 xfs_buf_t *lbp; /* buffer for left (current) block */
1356 xfs_inobt_block_t *left; /* left (current) btree block */
1357 xfs_inobt_key_t *lkp; /* key pointer for left block */
1358 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1359 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1360 xfs_buf_t *rbp; /* buffer for right neighbor block */
1361 xfs_inobt_block_t *right; /* right neighbor btree block */
1362 xfs_inobt_key_t *rkp; /* key pointer for right block */
1363 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1364 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1365 xfs_btree_cur_t *tcur; /* temporary cursor */
1366
1367 /*
1368 * Set up variables for this block as "left".
1369 */
1370 lbp = cur->bc_bufs[level];
1371 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1372#ifdef DEBUG
1373 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1374 return error;
1375#endif
1376 /*
1377 * If we've got no right sibling then we can't shift an entry right.
1378 */
1379 if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1380 *stat = 0;
1381 return 0;
1382 }
1383 /*
1384 * If the cursor entry is the one that would be moved, don't
1385 * do it... it's too complicated.
1386 */
1387 if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
1388 *stat = 0;
1389 return 0;
1390 }
1391 /*
1392 * Set up the right neighbor as "right".
1393 */
1394 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1395 cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
1396 XFS_INO_BTREE_REF)))
1397 return error;
1398 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1399 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1400 return error;
1401 /*
1402 * If it's full, it can't take another entry.
1403 */
1404 if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1405 *stat = 0;
1406 return 0;
1407 }
1408 /*
1409 * Make a hole at the start of the right neighbor block, then
1410 * copy the last left block entry to the hole.
1411 */
1412 if (level > 0) {
1413 lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1414 lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1415 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1416 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1417#ifdef DEBUG
1418 for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
1419 if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
1420 return error;
1421 }
1422#endif
1423 memmove(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1424 memmove(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1425#ifdef DEBUG
1426 if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
1427 return error;
1428#endif
1429 *rkp = *lkp; /* INT_: no change copy */
1430 *rpp = *lpp; /* INT_: no change copy */
1431 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1432 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1433 } else {
1434 lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
1435 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1436 memmove(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1437 *rrp = *lrp;
1438 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
1439 key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
1440 rkp = &key;
1441 }
1442 /*
1443 * Decrement and log left's numrecs, bump and log right's numrecs.
1444 */
1445 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
1446 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1447 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1448#ifdef DEBUG
1449 if (level > 0)
1450 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1451 else
1452 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1453#endif
1454 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1455 /*
1456 * Using a temporary cursor, update the parent key values of the
1457 * block on the right.
1458 */
1459 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1460 return error;
1461 xfs_btree_lastrec(tcur, level);
1462 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1463 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1464 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1465 return error;
1466 }
1467 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1468 *stat = 1;
1469 return 0;
1470}
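
The right shift mirrors the left shift: open a one-entry hole at the front
of the right block, then copy the last left entry into it. The same minimal
model as before, with made-up names and int records standing in for the
on-disk structures (illustrative only):

#include <assert.h>
#include <string.h>

static void rshift_model(int *left, int *nleft, int *right, int *nright)
{
	/* Hole first (memmove handles the overlap), then the copy. */
	memmove(right + 1, right, (size_t)(*nright) * sizeof(*right));
	right[0] = left[--(*nleft)];
	(*nright)++;
}

int main(void)
{
	int left[8] = { 1, 2, 3 }, right[8] = { 10, 20, 30 };
	int nleft = 3, nright = 3;

	rshift_model(left, &nleft, right, &nright);
	assert(nleft == 2);
	assert(nright == 4 && right[0] == 3 && right[1] == 10);
	return 0;
}
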
1471
1472/*
1473 * Split cur/level block in half.
1474 * Return new block number and its first record (to be inserted into parent).
1475 */
1476STATIC int /* error */
1477xfs_inobt_split(
1478 xfs_btree_cur_t *cur, /* btree cursor */
1479 int level, /* level to split */
1480 xfs_agblock_t *bnop, /* output: block number allocated */
1481 xfs_inobt_key_t *keyp, /* output: first key of new block */
1482 xfs_btree_cur_t **curp, /* output: new cursor */
1483 int *stat) /* success/failure */
1484{
1485 xfs_alloc_arg_t args; /* allocation argument structure */
1486 int error; /* error return value */
1487 int i; /* loop index/record number */
1488 xfs_agblock_t lbno; /* left (current) block number */
1489 xfs_buf_t *lbp; /* buffer for left block */
1490 xfs_inobt_block_t *left; /* left (current) btree block */
1491 xfs_inobt_key_t *lkp; /* left btree key pointer */
1492 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1493 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1494 xfs_buf_t *rbp; /* buffer for right block */
1495 xfs_inobt_block_t *right; /* right (new) btree block */
1496 xfs_inobt_key_t *rkp; /* right btree key pointer */
1497 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1498 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1499
1500 /*
1501 * Set up left block (current one).
1502 */
1503 lbp = cur->bc_bufs[level];
1504 args.tp = cur->bc_tp;
1505 args.mp = cur->bc_mp;
1506 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1507 /*
1508 * Allocate the new block.
1509 * If we can't do it, we're toast. Give up.
1510 */
1511 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
1512 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1513 args.isfl = args.userdata = args.minalignslop = 0;
1514 args.minlen = args.maxlen = args.prod = 1;
1515 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1516 if ((error = xfs_alloc_vextent(&args)))
1517 return error;
1518 if (args.fsbno == NULLFSBLOCK) {
1519 *stat = 0;
1520 return 0;
1521 }
1522 ASSERT(args.len == 1);
1523 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1524 /*
1525 * Set up the new block as "right".
1526 */
1527 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1528 /*
1529 * "Left" is the current (according to the cursor) block.
1530 */
1531 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1532#ifdef DEBUG
1533 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1534 return error;
1535#endif
1536 /*
1537 * Fill in the btree header for the new block.
1538 */
1539 INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
1540 right->bb_level = left->bb_level; /* INT_: direct copy */
1541 INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
1542 /*
1543	 * Make sure that if there's an odd number of entries now,
1544	 * each new block will have the same number of entries.
1545 */
1546 if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
1547 cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
1548 INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
1549 i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
1550 /*
1551 * For non-leaf blocks, copy keys and addresses over to the new block.
1552 */
1553 if (level > 0) {
1554 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1555 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1556 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1557 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1558#ifdef DEBUG
1559 for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
1560 if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
1561 return error;
1562 }
1563#endif
1564 memcpy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
1565 memcpy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
1566 xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1567 xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1568 *keyp = *rkp;
1569 }
1570 /*
1571 * For leaf blocks, copy records over to the new block.
1572 */
1573 else {
1574 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1575 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1576 memcpy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
1577 xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
1578 keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
1579 }
1580 /*
1581 * Find the left block number by looking in the buffer.
1582 * Adjust numrecs, sibling pointers.
1583 */
1584 INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
1585 right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
1586 INT_SET(left->bb_rightsib, ARCH_CONVERT, args.agbno);
1587 INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
1588 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1589 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1590 /*
1591 * If there's a block to the new block's right, make that block
1592 * point back to right instead of to left.
1593 */
1594 if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
1595 xfs_inobt_block_t *rrblock; /* rr btree block */
1596 xfs_buf_t *rrbp; /* buffer for rrblock */
1597
1598 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1599 INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
1600 XFS_INO_BTREE_REF)))
1601 return error;
1602 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1603 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1604 return error;
1605 INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.agbno);
1606 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1607 }
1608 /*
1609 * If the cursor is really in the right block, move it there.
1610 * If it's just pointing past the last entry in left, then we'll
1611 * insert there, so don't change anything in that case.
1612 */
1613 if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
1614 xfs_btree_setbuf(cur, level, rbp);
1615 cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
1616 }
1617 /*
1618	 * If there are more levels, we'll need another cursor which refers
1619	 * to the right block, no matter where this cursor was.
1620 */
1621 if (level + 1 < cur->bc_nlevels) {
1622 if ((error = xfs_btree_dup_cursor(cur, curp)))
1623 return error;
1624 (*curp)->bc_ptrs[level + 1]++;
1625 }
1626 *bnop = args.agbno;
1627 *stat = 1;
1628 return 0;
1629}
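
The split point above is worth restating: the new right block takes half of
the left block's records, rounded down, with one extra when the old count is
odd and the pending insert lands in the low half; that way both blocks end
up equally full once the insert completes. A standalone model of just that
arithmetic (cursor is a 1-based record index; a sketch, not kernel code):

#include <assert.h>

static int split_count_model(int nleft, int cursor)
{
	int nright = nleft / 2;

	if ((nleft & 1) && cursor <= nright + 1)
		nright++;
	return nright;
}

int main(void)
{
	assert(split_count_model(7, 2) == 4);	/* insert in low half */
	assert(split_count_model(7, 6) == 3);	/* insert in high half */
	assert(split_count_model(8, 5) == 4);	/* even count: plain half */
	return 0;
}
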
1630
1631/*
1632 * Update keys at all levels from here to the root along the cursor's path.
1633 */
1634STATIC int /* error */
1635xfs_inobt_updkey(
1636 xfs_btree_cur_t *cur, /* btree cursor */
1637 xfs_inobt_key_t *keyp, /* new key value to update to */
1638 int level) /* starting level for update */
1639{
1640 int ptr; /* index of key in block */
1641
1642 /*
1643 * Go up the tree from this level toward the root.
1644 * At each level, update the key value to the value input.
1645 * Stop when we reach a level where the cursor isn't pointing
1646 * at the first entry in the block.
1647 */
1648 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1649 xfs_buf_t *bp; /* buffer for block */
1650 xfs_inobt_block_t *block; /* btree block */
1651#ifdef DEBUG
1652 int error; /* error return value */
1653#endif
1654 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1655
1656 bp = cur->bc_bufs[level];
1657 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1658#ifdef DEBUG
1659 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1660 return error;
1661#endif
1662 ptr = cur->bc_ptrs[level];
1663 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1664 *kp = *keyp;
1665 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1666 }
1667 return 0;
1668}
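
The updkey loop only climbs while the cursor sits on entry 1: a parent's key
for a child equals the child's first key, so a change is visible upward only
at a block's first entry, and the walk stops after the first block where the
cursor is elsewhere. A toy count of how many levels get their key rewritten
(ptrs[] plays the role of cur->bc_ptrs; hypothetical, not kernel code):

#include <assert.h>

static int updkey_visits_model(const int *ptrs, int nlevels, int level)
{
	int visits = 0;

	for (int ptr = 1; ptr == 1 && level < nlevels; level++) {
		ptr = ptrs[level];	/* key at this slot gets rewritten */
		visits++;
	}
	return visits;
}

int main(void)
{
	int ptrs[] = { 1, 1, 1, 3, 2 };

	/* Starting at level 1: levels 1 and 2 are on entry 1, level 3 is
	 * on entry 3, so keys at levels 1, 2 and 3 are rewritten. */
	assert(updkey_visits_model(ptrs, 5, 1) == 3);
	return 0;
}
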
1669
1670/*
1671 * Externally visible routines.
1672 */
1673
1674/*
1675 * Decrement cursor by one record at the level.
1676 * For nonzero levels the leaf-ward information is untouched.
1677 */
1678int /* error */
1679xfs_inobt_decrement(
1680 xfs_btree_cur_t *cur, /* btree cursor */
1681 int level, /* level in btree, 0 is leaf */
1682 int *stat) /* success/failure */
1683{
1684 xfs_inobt_block_t *block; /* btree block */
1685 int error;
1686 int lev; /* btree level */
1687
1688 ASSERT(level < cur->bc_nlevels);
1689 /*
1690 * Read-ahead to the left at this level.
1691 */
1692 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1693 /*
1694 * Decrement the ptr at this level. If we're still in the block
1695 * then we're done.
1696 */
1697 if (--cur->bc_ptrs[level] > 0) {
1698 *stat = 1;
1699 return 0;
1700 }
1701 /*
1702 * Get a pointer to the btree block.
1703 */
1704 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1705#ifdef DEBUG
1706 if ((error = xfs_btree_check_sblock(cur, block, level,
1707 cur->bc_bufs[level])))
1708 return error;
1709#endif
1710 /*
1711 * If we just went off the left edge of the tree, return failure.
1712 */
1713 if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
1714 *stat = 0;
1715 return 0;
1716 }
1717 /*
1718 * March up the tree decrementing pointers.
1719 * Stop when we don't go off the left edge of a block.
1720 */
1721 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1722 if (--cur->bc_ptrs[lev] > 0)
1723 break;
1724 /*
1725 * Read-ahead the left block, we're going to read it
1726 * in the next loop.
1727 */
1728 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1729 }
1730 /*
1731 * If we went off the root then we are seriously confused.
1732 */
1733 ASSERT(lev < cur->bc_nlevels);
1734 /*
1735 * Now walk back down the tree, fixing up the cursor's buffer
1736 * pointers and key numbers.
1737 */
1738 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1739 xfs_agblock_t agbno; /* block number of btree block */
1740 xfs_buf_t *bp; /* buffer containing btree block */
1741
1742 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1743 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1744 cur->bc_private.i.agno, agbno, 0, &bp,
1745 XFS_INO_BTREE_REF)))
1746 return error;
1747 lev--;
1748 xfs_btree_setbuf(cur, lev, bp);
1749 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1750 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1751 return error;
1752 cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
1753 }
1754 *stat = 1;
1755 return 0;
1756}
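
The up-then-down walk in xfs_inobt_decrement() can be modelled with two
small loops over per-level 1-based positions: climb while a level's pointer
underflows, then descend again pointing each lower level at its new block's
last entry. In the sketch below numrecs[] is fixed up front; in the real
code it is read from each newly loaded left-sibling block.

#include <assert.h>

#define NLEVELS 3

int main(void)
{
	int ptrs[NLEVELS]    = { 1, 1, 2 };	/* leaf first, root last */
	int numrecs[NLEVELS] = { 4, 5, 2 };	/* sizes along the new path */
	int lev;

	/* March up the tree decrementing pointers. */
	for (lev = 0; lev < NLEVELS; lev++)
		if (--ptrs[lev] > 0)
			break;
	assert(lev == 2 && ptrs[2] == 1);

	/* Walk back down, landing on each block's last entry. */
	for (lev--; lev >= 0; lev--)
		ptrs[lev] = numrecs[lev];
	assert(ptrs[0] == 4 && ptrs[1] == 5);
	return 0;
}
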
1757
1758/*
1759 * Delete the record pointed to by cur.
1760 * The cursor refers to the place where the record was (could be inserted)
1761 * when the operation returns.
1762 */
1763int /* error */
1764xfs_inobt_delete(
1765 xfs_btree_cur_t *cur, /* btree cursor */
1766 int *stat) /* success/failure */
1767{
1768 int error;
1769 int i; /* result code */
1770 int level; /* btree level */
1771
1772 /*
1773 * Go up the tree, starting at leaf level.
1774 * If 2 is returned then a join was done; go to the next level.
1775 * Otherwise we are done.
1776 */
1777 for (level = 0, i = 2; i == 2; level++) {
1778 if ((error = xfs_inobt_delrec(cur, level, &i)))
1779 return error;
1780 }
1781 if (i == 0) {
1782 for (level = 1; level < cur->bc_nlevels; level++) {
1783 if (cur->bc_ptrs[level] == 0) {
1784 if ((error = xfs_inobt_decrement(cur, level, &i)))
1785 return error;
1786 break;
1787 }
1788 }
1789 }
1790 *stat = i;
1791 return 0;
1792}
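
xfs_inobt_delrec()'s tri-state result drives this loop: 2 means two blocks
were joined and the delete must be repeated one level up, 1 means the level
absorbed the delete, 0 means nothing was deleted. A toy trace of that
control flow with made-up per-level results (a sketch, not kernel code):

#include <assert.h>

int main(void)
{
	int results[] = { 2, 2, 1 };	/* joins at levels 0 and 1 */
	int level, i;

	for (level = 0, i = 2; i == 2; level++)
		i = results[level];
	assert(level == 3 && i == 1);	/* stopped above the last join */
	return 0;
}
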
1793
1794
1795/*
1796 * Get the data from the pointed-to record.
1797 */
1798int /* error */
1799xfs_inobt_get_rec(
1800 xfs_btree_cur_t *cur, /* btree cursor */
1801 xfs_agino_t *ino, /* output: starting inode of chunk */
1802 __int32_t *fcnt, /* output: number of free inodes */
1803 xfs_inofree_t *free, /* output: free inode mask */
1804 int *stat) /* output: success/failure */
1805{
1806 xfs_inobt_block_t *block; /* btree block */
1807 xfs_buf_t *bp; /* buffer containing btree block */
1808#ifdef DEBUG
1809 int error; /* error return value */
1810#endif
1811 int ptr; /* record number */
1812 xfs_inobt_rec_t *rec; /* record data */
1813
1814 bp = cur->bc_bufs[0];
1815 ptr = cur->bc_ptrs[0];
1816 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1817#ifdef DEBUG
1818 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1819 return error;
1820#endif
1821 /*
1822 * Off the right end or left end, return failure.
1823 */
1824 if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
1825 *stat = 0;
1826 return 0;
1827 }
1828 /*
1829 * Point to the record and extract its data.
1830 */
1831 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1832 *ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
1833 *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
1834 *free = INT_GET(rec->ir_free, ARCH_CONVERT);
1835 *stat = 1;
1836 return 0;
1837}
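
A record's free count and free mask are redundant on purpose: ir_freecount
must always equal the number of set bits in ir_free, which is what lets
callers trust either field. A standalone check of that invariant with
hypothetical values (not kernel code):

#include <assert.h>
#include <stdint.h>

static int popcount64(uint64_t x)
{
	int n = 0;

	for (; x != 0; x &= x - 1)	/* clear the lowest set bit */
		n++;
	return n;
}

int main(void)
{
	uint64_t ir_free      = 0xf0f0f0f0f0f0f0f0ULL;
	int32_t  ir_freecount = 32;

	assert(popcount64(ir_free) == ir_freecount);
	return 0;
}
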
1838
1839/*
1840 * Increment cursor by one record at the level.
1841 * For nonzero levels the leaf-ward information is untouched.
1842 */
1843int /* error */
1844xfs_inobt_increment(
1845 xfs_btree_cur_t *cur, /* btree cursor */
1846 int level, /* level in btree, 0 is leaf */
1847 int *stat) /* success/failure */
1848{
1849 xfs_inobt_block_t *block; /* btree block */
1850 xfs_buf_t *bp; /* buffer containing btree block */
1851 int error; /* error return value */
1852 int lev; /* btree level */
1853
1854 ASSERT(level < cur->bc_nlevels);
1855 /*
1856 * Read-ahead to the right at this level.
1857 */
1858 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1859 /*
1860 * Get a pointer to the btree block.
1861 */
1862 bp = cur->bc_bufs[level];
1863 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1864#ifdef DEBUG
1865 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1866 return error;
1867#endif
1868 /*
1869 * Increment the ptr at this level. If we're still in the block
1870 * then we're done.
1871 */
1872 if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
1873 *stat = 1;
1874 return 0;
1875 }
1876 /*
1877 * If we just went off the right edge of the tree, return failure.
1878 */
1879 if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
1880 *stat = 0;
1881 return 0;
1882 }
1883 /*
1884 * March up the tree incrementing pointers.
1885 * Stop when we don't go off the right edge of a block.
1886 */
1887 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1888 bp = cur->bc_bufs[lev];
1889 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1890#ifdef DEBUG
1891 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1892 return error;
1893#endif
1894 if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
1895 break;
1896 /*
1897 * Read-ahead the right block, we're going to read it
1898 * in the next loop.
1899 */
1900 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1901 }
1902 /*
1903 * If we went off the root then we are seriously confused.
1904 */
1905 ASSERT(lev < cur->bc_nlevels);
1906 /*
1907 * Now walk back down the tree, fixing up the cursor's buffer
1908 * pointers and key numbers.
1909 */
1910 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1911 lev > level; ) {
1912 xfs_agblock_t agbno; /* block number of btree block */
1913
1914 agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
1915 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1916 cur->bc_private.i.agno, agbno, 0, &bp,
1917 XFS_INO_BTREE_REF)))
1918 return error;
1919 lev--;
1920 xfs_btree_setbuf(cur, lev, bp);
1921 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1922 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1923 return error;
1924 cur->bc_ptrs[lev] = 1;
1925 }
1926 *stat = 1;
1927 return 0;
1928}
1929
1930/*
1931 * Insert the current record at the point referenced by cur.
1932 * The cursor may be inconsistent on return if splits have been done.
1933 */
1934int /* error */
1935xfs_inobt_insert(
1936 xfs_btree_cur_t *cur, /* btree cursor */
1937 int *stat) /* success/failure */
1938{
1939 int error; /* error return value */
1940 int i; /* result value, 0 for failure */
1941 int level; /* current level number in btree */
1942 xfs_agblock_t nbno; /* new block number (split result) */
1943 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1944 xfs_inobt_rec_t nrec; /* record being inserted this level */
1945 xfs_btree_cur_t *pcur; /* previous level's cursor */
1946
1947 level = 0;
1948 nbno = NULLAGBLOCK;
1949 INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
1950 INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
1951 INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
1952 ncur = (xfs_btree_cur_t *)0;
1953 pcur = cur;
1954 /*
1955 * Loop going up the tree, starting at the leaf level.
1956 * Stop when we don't get a split block, that must mean that
1957 * the insert is finished with this level.
1958 */
1959 do {
1960 /*
1961 * Insert nrec/nbno into this level of the tree.
1962 * Note if we fail, nbno will be null.
1963 */
1964 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1965 &i))) {
1966 if (pcur != cur)
1967 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1968 return error;
1969 }
1970 /*
1971 * See if the cursor we just used is trash.
1972 * Can't trash the caller's cursor, but otherwise we should
1973 * if ncur is a new cursor or we're about to be done.
1974 */
1975 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1976 cur->bc_nlevels = pcur->bc_nlevels;
1977 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1978 }
1979 /*
1980 * If we got a new cursor, switch to it.
1981 */
1982 if (ncur) {
1983 pcur = ncur;
1984 ncur = (xfs_btree_cur_t *)0;
1985 }
1986 } while (nbno != NULLAGBLOCK);
1987 *stat = i;
1988 return 0;
1989}
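
The do/while above ascends one level per split and stops at the first level
that absorbs the record. A toy version of that control flow, where
insrec_model() pretends every level below full_levels has to split (names
are invented; illustrative only):

#include <stdio.h>
#include <stdbool.h>

static bool insrec_model(int level, int full_levels)
{
	return level < full_levels;	/* true: this level split */
}

int main(void)
{
	int full_levels = 2, level = 0;

	do {
		printf("inserting at level %d\n", level);
	} while (insrec_model(level++, full_levels));
	/* Prints levels 0, 1 and 2: two splits, then a clean insert. */
	return 0;
}
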
1990
1991/*
1992 * Lookup the record equal to ino in the btree given by cur.
1993 */
1994int /* error */
1995xfs_inobt_lookup_eq(
1996 xfs_btree_cur_t *cur, /* btree cursor */
1997 xfs_agino_t ino, /* starting inode of chunk */
1998 __int32_t fcnt, /* free inode count */
1999 xfs_inofree_t free, /* free inode mask */
2000 int *stat) /* success/failure */
2001{
2002 cur->bc_rec.i.ir_startino = ino;
2003 cur->bc_rec.i.ir_freecount = fcnt;
2004 cur->bc_rec.i.ir_free = free;
2005 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
2006}
2007
2008/*
2009 * Lookup the first record greater than or equal to ino
2010 * in the btree given by cur.
2011 */
2012int /* error */
2013xfs_inobt_lookup_ge(
2014 xfs_btree_cur_t *cur, /* btree cursor */
2015 xfs_agino_t ino, /* starting inode of chunk */
2016 __int32_t fcnt, /* free inode count */
2017 xfs_inofree_t free, /* free inode mask */
2018 int *stat) /* success/failure */
2019{
2020 cur->bc_rec.i.ir_startino = ino;
2021 cur->bc_rec.i.ir_freecount = fcnt;
2022 cur->bc_rec.i.ir_free = free;
2023 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2024}
2025
2026/*
2027 * Lookup the first record less than or equal to ino
2028 * in the btree given by cur.
2029 */
2030int /* error */
2031xfs_inobt_lookup_le(
2032 xfs_btree_cur_t *cur, /* btree cursor */
2033 xfs_agino_t ino, /* starting inode of chunk */
2034 __int32_t fcnt, /* free inode count */
2035 xfs_inofree_t free, /* free inode mask */
2036 int *stat) /* success/failure */
2037{
2038 cur->bc_rec.i.ir_startino = ino;
2039 cur->bc_rec.i.ir_freecount = fcnt;
2040 cur->bc_rec.i.ir_free = free;
2041 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2042}
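
The three wrappers differ only in the comparison mode handed down to
xfs_inobt_lookup(). A standalone model of the EQ/LE/GE semantics over a
sorted key array, returning a 1-based position like the btree cursor and 0
for failure to mirror the *stat convention (a linear-scan sketch; the real
lookup is a binary search):

#include <assert.h>

enum lookup_mode { LOOKUP_EQ, LOOKUP_LE, LOOKUP_GE };

static int lookup_model(const int *keys, int n, int want,
			enum lookup_mode how)
{
	for (int i = 0; i < n; i++) {
		if (how == LOOKUP_GE && keys[i] >= want)
			return i + 1;
		if (how == LOOKUP_EQ && keys[i] == want)
			return i + 1;
		if (how == LOOKUP_LE && keys[i] > want)
			return i;	/* previous entry; 0 if none */
	}
	return how == LOOKUP_LE ? n : 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30 };

	assert(lookup_model(keys, 3, 20, LOOKUP_EQ) == 2);
	assert(lookup_model(keys, 3, 25, LOOKUP_GE) == 3);
	assert(lookup_model(keys, 3, 25, LOOKUP_LE) == 2);
	assert(lookup_model(keys, 3,  5, LOOKUP_LE) == 0);
	assert(lookup_model(keys, 3, 35, LOOKUP_GE) == 0);
	return 0;
}
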
2043
2044/*
2045 * Update the record referred to by cur, to the value given
2046 * by [ino, fcnt, free].
2047 * This either works (return 0) or gets an EFSCORRUPTED error.
2048 */
2049int /* error */
2050xfs_inobt_update(
2051 xfs_btree_cur_t *cur, /* btree cursor */
2052 xfs_agino_t ino, /* starting inode of chunk */
2053 __int32_t fcnt, /* free inode count */
2054 xfs_inofree_t free) /* free inode mask */
2055{
2056 xfs_inobt_block_t *block; /* btree block to update */
2057 xfs_buf_t *bp; /* buffer containing btree block */
2058 int error; /* error return value */
2059 int ptr; /* current record number (updating) */
2060 xfs_inobt_rec_t *rp; /* pointer to updated record */
2061
2062 /*
2063 * Pick up the current block.
2064 */
2065 bp = cur->bc_bufs[0];
2066 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2067#ifdef DEBUG
2068 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2069 return error;
2070#endif
2071 /*
2072 * Get the address of the rec to be updated.
2073 */
2074 ptr = cur->bc_ptrs[0];
2075 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2076 /*
2077 * Fill in the new contents and log them.
2078 */
2079 INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
2080 INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
2081 INT_SET(rp->ir_free, ARCH_CONVERT, free);
2082 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2083 /*
2084 * Updating first record in leaf. Pass new key value up to our parent.
2085 */
2086 if (ptr == 1) {
2087 xfs_inobt_key_t key; /* key containing [ino] */
2088
2089 INT_SET(key.ir_startino, ARCH_CONVERT, ino);
2090 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2091 return error;
2092 }
2093 return 0;
2094}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000000..803c4d17a057
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -0,0 +1,314 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IALLOC_BTREE_H__
33#define __XFS_IALLOC_BTREE_H__
34
35/*
36 * Inode map on-disk structures
37 */
38
39struct xfs_buf;
40struct xfs_btree_cur;
41struct xfs_btree_sblock;
42struct xfs_mount;
43
44/*
45 * There is a btree for the inode map per allocation group.
46 */
47#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
48
49typedef __uint64_t xfs_inofree_t;
50#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
51#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
52#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
53
54#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASKN)
55xfs_inofree_t xfs_inobt_maskn(int i, int n);
56#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n)
57#else
58#define XFS_INOBT_MASKN(i,n) \
59 ((((n) >= XFS_INODES_PER_CHUNK ? \
60 (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i))
61#endif
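
XFS_INOBT_MASKN(i,n) builds a run of n set bits starting at bit i; the
n >= XFS_INODES_PER_CHUNK guard matters because a full-width shift would be
undefined behavior, while (xfs_inofree_t)0 - 1 wraps to all ones. A
standalone check of the macro's arithmetic under a local name, assuming the
usual 64 inodes per chunk (illustrative, not the kernel header):

#include <assert.h>
#include <stdint.h>

typedef uint64_t xfs_inofree_model_t;
#define INODES_PER_CHUNK 64

#define INOBT_MASKN_MODEL(i, n) \
	((((n) >= INODES_PER_CHUNK ? \
	(xfs_inofree_model_t)0 : ((xfs_inofree_model_t)1 << (n))) - 1) << (i))

int main(void)
{
	assert(INOBT_MASKN_MODEL(0, 4) == 0xfULL);
	assert(INOBT_MASKN_MODEL(4, 4) == 0xf0ULL);
	/* n == 64: 0 - 1 wraps to all-ones without a 64-bit shift. */
	assert(INOBT_MASKN_MODEL(0, 64) == ~0ULL);
	return 0;
}
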
62
63/*
64 * Data record structure
65 */
66typedef struct xfs_inobt_rec
67{
68 xfs_agino_t ir_startino; /* starting inode number */
69 __int32_t ir_freecount; /* count of free inodes (set bits) */
70 xfs_inofree_t ir_free; /* free inode mask */
71} xfs_inobt_rec_t;
72
73/*
74 * Key structure
75 */
76typedef struct xfs_inobt_key
77{
78 xfs_agino_t ir_startino; /* starting inode number */
79} xfs_inobt_key_t;
80
81typedef xfs_agblock_t xfs_inobt_ptr_t; /* btree pointer type */
82 /* btree block header type */
83typedef struct xfs_btree_sblock xfs_inobt_block_t;
84
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
86xfs_inobt_block_t *xfs_buf_to_inobt_block(struct xfs_buf *bp);
87#define XFS_BUF_TO_INOBT_BLOCK(bp) xfs_buf_to_inobt_block(bp)
88#else
89#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)(XFS_BUF_PTR(bp)))
90#endif
91
92/*
93 * Bit manipulations for ir_free.
94 */
95#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASK)
96xfs_inofree_t xfs_inobt_mask(int i);
97#define XFS_INOBT_MASK(i) xfs_inobt_mask(i)
98#else
99#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
100#endif
101#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE)
102int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i);
103#define XFS_INOBT_IS_FREE(rp,i) xfs_inobt_is_free(rp,i)
104#else
105#define XFS_INOBT_IS_FREE(rp,i) (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
106#endif
107#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE)
108void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i);
109#define XFS_INOBT_SET_FREE(rp,i) xfs_inobt_set_free(rp,i)
110#else
111#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i))
112#endif
113#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_CLR_FREE)
114void xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i);
115#define XFS_INOBT_CLR_FREE(rp,i) xfs_inobt_clr_free(rp,i)
116#else
117#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
118#endif
119
120/*
121 * Real block structures have a size equal to the disk block size.
122 */
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_SIZE)
124int xfs_inobt_block_size(int lev, struct xfs_btree_cur *cur);
125#define XFS_INOBT_BLOCK_SIZE(lev,cur) xfs_inobt_block_size(lev,cur)
126#else
127#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
128#endif
129
130#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
131int xfs_inobt_block_maxrecs(int lev, struct xfs_btree_cur *cur);
132#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) xfs_inobt_block_maxrecs(lev,cur)
133#else
134#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) \
135 ((cur)->bc_mp->m_inobt_mxr[lev != 0])
136#endif
137#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MINRECS)
138int xfs_inobt_block_minrecs(int lev, struct xfs_btree_cur *cur);
139#define XFS_INOBT_BLOCK_MINRECS(lev,cur) xfs_inobt_block_minrecs(lev,cur)
140#else
141#define XFS_INOBT_BLOCK_MINRECS(lev,cur) \
142 ((cur)->bc_mp->m_inobt_mnr[lev != 0])
143#endif
144
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_LAST_REC)
146int xfs_inobt_is_last_rec(struct xfs_btree_cur *cur);
147#define XFS_INOBT_IS_LAST_REC(cur) xfs_inobt_is_last_rec(cur)
148#else
149#define XFS_INOBT_IS_LAST_REC(cur) \
150 ((cur)->bc_ptrs[0] == \
151 INT_GET(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs, ARCH_CONVERT))
152#endif
153
154/*
155 * Maximum number of inode btree levels.
156 */
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IN_MAXLEVELS)
158int xfs_in_maxlevels(struct xfs_mount *mp);
159#define XFS_IN_MAXLEVELS(mp) xfs_in_maxlevels(mp)
160#else
161#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
162#endif
163
164/*
165 * Block numbers in the AG.
166 */
167#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IBT_BLOCK)
168xfs_agblock_t xfs_ibt_block(struct xfs_mount *mp);
169#define XFS_IBT_BLOCK(mp) xfs_ibt_block(mp)
170#else
171#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
172#endif
173#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_PREALLOC_BLOCKS)
174xfs_agblock_t xfs_prealloc_blocks(struct xfs_mount *mp);
175#define XFS_PREALLOC_BLOCKS(mp) xfs_prealloc_blocks(mp)
176#else
177#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
178#endif
179
180/*
181 * Record, key, and pointer address macros for btree blocks.
182 */
183#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_REC_ADDR)
184xfs_inobt_rec_t *
185xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
186#define XFS_INOBT_REC_ADDR(bb,i,cur) xfs_inobt_rec_addr(bb,i,cur)
187#else
188#define XFS_INOBT_REC_ADDR(bb,i,cur) \
189 XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, i, \
190 XFS_INOBT_BLOCK_MAXRECS(0, cur))
191#endif
192
193#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_KEY_ADDR)
194xfs_inobt_key_t *
195xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
196#define XFS_INOBT_KEY_ADDR(bb,i,cur) xfs_inobt_key_addr(bb,i,cur)
197#else
198#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
199 XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
200 XFS_INOBT_BLOCK_MAXRECS(1, cur))
201#endif
202
203#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_PTR_ADDR)
204xfs_inobt_ptr_t *
205xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
206#define XFS_INOBT_PTR_ADDR(bb,i,cur) xfs_inobt_ptr_addr(bb,i,cur)
207#else
208#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
209 XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
210 XFS_INOBT_BLOCK_MAXRECS(1, cur))
211#endif
212
213/*
214 * Prototypes for externally visible routines.
215 */
216
217/*
218 * Decrement cursor by one record at the level.
219 * For nonzero levels the leaf-ward information is untouched.
220 */
221int /* error */
222xfs_inobt_decrement(
223 struct xfs_btree_cur *cur, /* btree cursor */
224 int level, /* level in btree, 0 is leaf */
225 int *stat); /* success/failure */
226
227/*
228 * Delete the record pointed to by cur.
229 * The cursor refers to the place where the record was (could be inserted)
230 * when the operation returns.
231 */
232int /* error */
233xfs_inobt_delete(
234 struct xfs_btree_cur *cur, /* btree cursor */
235 int *stat); /* success/failure */
236
237/*
238 * Get the data from the pointed-to record.
239 */
240int /* error */
241xfs_inobt_get_rec(
242 struct xfs_btree_cur *cur, /* btree cursor */
243 xfs_agino_t *ino, /* output: starting inode of chunk */
244 __int32_t *fcnt, /* output: number of free inodes */
245 xfs_inofree_t *free, /* output: free inode mask */
246 int *stat); /* output: success/failure */
247
248/*
249 * Increment cursor by one record at the level.
250 * For nonzero levels the leaf-ward information is untouched.
251 */
252int /* error */
253xfs_inobt_increment(
254 struct xfs_btree_cur *cur, /* btree cursor */
255 int level, /* level in btree, 0 is leaf */
256 int *stat); /* success/failure */
257
258/*
259 * Insert the current record at the point referenced by cur.
260 * The cursor may be inconsistent on return if splits have been done.
261 */
262int /* error */
263xfs_inobt_insert(
264 struct xfs_btree_cur *cur, /* btree cursor */
265 int *stat); /* success/failure */
266
267/*
268 * Lookup the record equal to ino in the btree given by cur.
269 */
270int /* error */
271xfs_inobt_lookup_eq(
272 struct xfs_btree_cur *cur, /* btree cursor */
273 xfs_agino_t ino, /* starting inode of chunk */
274 __int32_t fcnt, /* free inode count */
275 xfs_inofree_t free, /* free inode mask */
276 int *stat); /* success/failure */
277
278/*
279 * Lookup the first record greater than or equal to ino
280 * in the btree given by cur.
281 */
282int /* error */
283xfs_inobt_lookup_ge(
284 struct xfs_btree_cur *cur, /* btree cursor */
285 xfs_agino_t ino, /* starting inode of chunk */
286 __int32_t fcnt, /* free inode count */
287 xfs_inofree_t free, /* free inode mask */
288 int *stat); /* success/failure */
289
290/*
291 * Lookup the first record less than or equal to ino
292 * in the btree given by cur.
293 */
294int /* error */
295xfs_inobt_lookup_le(
296 struct xfs_btree_cur *cur, /* btree cursor */
297 xfs_agino_t ino, /* starting inode of chunk */
298 __int32_t fcnt, /* free inode count */
299 xfs_inofree_t free, /* free inode mask */
300 int *stat); /* success/failure */
301
302/*
303 * Update the record referred to by cur, to the value given
304 * by [ino, fcnt, free].
305 * This either works (return 0) or gets an EFSCORRUPTED error.
306 */
307int /* error */
308xfs_inobt_update(
309 struct xfs_btree_cur *cur, /* btree cursor */
310 xfs_agino_t ino, /* starting inode of chunk */
311 __int32_t fcnt, /* free inode count */
312 xfs_inofree_t free); /* free inode mask */
313
314#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
new file mode 100644
index 000000000000..3a0ba1dfd0e8
--- /dev/null
+++ b/fs/xfs/xfs_iget.c
@@ -0,0 +1,1022 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_macros.h"
36#include "xfs_types.h"
37#include "xfs_inum.h"
38#include "xfs_log.h"
39#include "xfs_trans.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode.h"
56#include "xfs_quota.h"
57#include "xfs_utils.h"
58#include "xfs_bit.h"
59
60/*
61 * Initialize the inode hash table for the newly mounted file system.
 62 * Choose an initial table size based on the user-specified value,
 63 * else derive it from the maximum number of inodes as an indicator
 64 * for table size, and clamp it between one and some large number
 65 * of pages.
66 */
67void
68xfs_ihash_init(xfs_mount_t *mp)
69{
70 __uint64_t icount;
71 uint i, flags = KM_SLEEP | KM_MAYFAIL;
72
73 if (!mp->m_ihsize) {
74 icount = mp->m_maxicount ? mp->m_maxicount :
75 (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
76 mp->m_ihsize = 1 << max_t(uint, 8,
77 (xfs_highbit64(icount) + 1) / 2);
78 mp->m_ihsize = min_t(uint, mp->m_ihsize,
79 (64 * NBPP) / sizeof(xfs_ihash_t));
80 }
81
82 while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize *
83 sizeof(xfs_ihash_t), flags))) {
84 if ((mp->m_ihsize >>= 1) <= NBPP)
85 flags = KM_SLEEP;
86 }
87 for (i = 0; i < mp->m_ihsize; i++) {
88 rwlock_init(&(mp->m_ihash[i].ih_lock));
89 }
90}
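
The sizing heuristic above amounts to 2^max(8, (log2(icount)+1)/2) buckets,
i.e. roughly sqrt(icount) with a 256-bucket floor. A standalone model of
that arithmetic; the NBPP-based clamp and the allocation-retry loop are
omitted, and the names are invented for illustration:

#include <assert.h>
#include <stdint.h>

static int highbit64(uint64_t v)	/* index of the highest set bit */
{
	int i = -1;

	for (; v != 0; v >>= 1)
		i++;
	return i;
}

static uint32_t ihash_size_model(uint64_t icount)
{
	int shift = (highbit64(icount) + 1) / 2;

	if (shift < 8)
		shift = 8;
	return 1u << shift;
}

int main(void)
{
	assert(ihash_size_model(1ULL << 20) == 1u << 10);	/* ~sqrt */
	assert(ihash_size_model(1000) == 256);			/* floor */
	return 0;
}
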
91
92/*
93 * Free up structures allocated by xfs_ihash_init, at unmount time.
94 */
95void
96xfs_ihash_free(xfs_mount_t *mp)
97{
98 kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
99 mp->m_ihash = NULL;
100}
101
102/*
103 * Initialize the inode cluster hash table for the newly mounted file system.
104 * Its size is derived from the ihash table size.
105 */
106void
107xfs_chash_init(xfs_mount_t *mp)
108{
109 uint i;
110
111 mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
112 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
113 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
114 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
115 * sizeof(xfs_chash_t),
116 KM_SLEEP);
117 for (i = 0; i < mp->m_chsize; i++) {
118 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
119 }
120}
121
122/*
123 * Free up structures allocated by xfs_chash_init, at unmount time.
124 */
125void
126xfs_chash_free(xfs_mount_t *mp)
127{
128 int i;
129
130 for (i = 0; i < mp->m_chsize; i++) {
131 spinlock_destroy(&mp->m_chash[i].ch_lock);
132 }
133
134 kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
135 mp->m_chash = NULL;
136}
137
138/*
139 * Look up an inode by number in the given file system.
140 * The inode is looked up in the hash table for the file system
141 * represented by the mount point parameter mp. Each bucket of
142 * the hash table is guarded by an individual semaphore.
143 *
144 * If the inode is found in the hash table, its corresponding vnode
145 * is obtained with a call to vn_get(). This call takes care of
146 * coordination with the reclamation of the inode and vnode. Note
147 * that the vmap structure is filled in while holding the hash lock.
148 * This gives us the state of the inode/vnode when we found it and
149 * is used for coordination in vn_get().
150 *
151 * If it is not in core, read it in from the file system's device and
152 * add the inode into the hash table.
153 *
154 * The inode is locked according to the value of the lock_flags parameter.
155 * This flag parameter indicates how and if the inode's IO lock and inode lock
156 * should be taken.
157 *
158 * mp -- the mount point structure for the current file system. It points
159 * to the inode hash table.
160 * tp -- a pointer to the current transaction if there is one. This is
161 * simply passed through to the xfs_iread() call.
162 * ino -- the number of the inode desired. This is the unique identifier
163 * within the file system for the inode being requested.
164 * lock_flags -- flags indicating how to lock the inode. See the comment
165 * for xfs_ilock() for a list of valid values.
166 * bno -- the block number starting the buffer containing the inode,
167 * if known (as by bulkstat), else 0.
168 */
169STATIC int
170xfs_iget_core(
171 vnode_t *vp,
172 xfs_mount_t *mp,
173 xfs_trans_t *tp,
174 xfs_ino_t ino,
175 uint flags,
176 uint lock_flags,
177 xfs_inode_t **ipp,
178 xfs_daddr_t bno)
179{
180 xfs_ihash_t *ih;
181 xfs_inode_t *ip;
182 xfs_inode_t *iq;
183 vnode_t *inode_vp;
184 ulong version;
185 int error;
186 /* REFERENCED */
187 xfs_chash_t *ch;
188 xfs_chashlist_t *chl, *chlnew;
189 SPLDECL(s);
190
191
192 ih = XFS_IHASH(mp, ino);
193
194again:
195 read_lock(&ih->ih_lock);
196
197 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
198 if (ip->i_ino == ino) {
199 /*
 200			 * If INEW is set, this inode is being set up;
 201			 * we need to pause and try again.
202 */
203 if (ip->i_flags & XFS_INEW) {
204 read_unlock(&ih->ih_lock);
205 delay(1);
206 XFS_STATS_INC(xs_ig_frecycle);
207
208 goto again;
209 }
210
211 inode_vp = XFS_ITOV_NULL(ip);
212 if (inode_vp == NULL) {
213 /*
 214				 * If IRECLAIM is set, this inode is
 215				 * on its way out of the system;
 216				 * we need to pause and try again.
217 */
218 if (ip->i_flags & XFS_IRECLAIM) {
219 read_unlock(&ih->ih_lock);
220 delay(1);
221 XFS_STATS_INC(xs_ig_frecycle);
222
223 goto again;
224 }
225
226 vn_trace_exit(vp, "xfs_iget.alloc",
227 (inst_t *)__return_address);
228
229 XFS_STATS_INC(xs_ig_found);
230
231 ip->i_flags &= ~XFS_IRECLAIMABLE;
232 read_unlock(&ih->ih_lock);
233
234 XFS_MOUNT_ILOCK(mp);
235 list_del_init(&ip->i_reclaim);
236 XFS_MOUNT_IUNLOCK(mp);
237
238 goto finish_inode;
239
240 } else if (vp != inode_vp) {
241 struct inode *inode = LINVFS_GET_IP(inode_vp);
242
243 /* The inode is being torn down, pause and
244 * try again.
245 */
246 if (inode->i_state & (I_FREEING | I_CLEAR)) {
247 read_unlock(&ih->ih_lock);
248 delay(1);
249 XFS_STATS_INC(xs_ig_frecycle);
250
251 goto again;
252 }
253/* Chances are the other vnode (the one in the inode) is being torn
254 * down right now, and we landed on top of it. Question is, what do
255 * we do? Unhook the old inode and hook up the new one?
256 */
257 cmn_err(CE_PANIC,
258 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
259 inode_vp, vp);
260 }
261
262 read_unlock(&ih->ih_lock);
263
264 XFS_STATS_INC(xs_ig_found);
265
266finish_inode:
267 if (ip->i_d.di_mode == 0) {
268 if (!(flags & IGET_CREATE))
269 return ENOENT;
270 xfs_iocore_inode_reinit(ip);
271 }
272
273 if (lock_flags != 0)
274 xfs_ilock(ip, lock_flags);
275
276 ip->i_flags &= ~XFS_ISTALE;
277
278 vn_trace_exit(vp, "xfs_iget.found",
279 (inst_t *)__return_address);
280 goto return_ip;
281 }
282 }
283
284 /*
285 * Inode cache miss: save the hash chain version stamp and unlock
286 * the chain, so we don't deadlock in vn_alloc.
287 */
288 XFS_STATS_INC(xs_ig_missed);
289
290 version = ih->ih_version;
291
292 read_unlock(&ih->ih_lock);
293
294 /*
295 * Read the disk inode attributes into a new inode structure and get
296 * a new vnode for it. This should also initialize i_ino and i_mount.
297 */
298 error = xfs_iread(mp, tp, ino, &ip, bno);
299 if (error) {
300 return error;
301 }
302
303 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
304
305 xfs_inode_lock_init(ip, vp);
306 xfs_iocore_inode_init(ip);
307
308 if (lock_flags != 0) {
309 xfs_ilock(ip, lock_flags);
310 }
311
312 if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) {
313 xfs_idestroy(ip);
314 return ENOENT;
315 }
316
317 /*
318 * Put ip on its hash chain, unless someone else hashed a duplicate
319 * after we released the hash lock.
320 */
321 write_lock(&ih->ih_lock);
322
323 if (ih->ih_version != version) {
324 for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
325 if (iq->i_ino == ino) {
326 write_unlock(&ih->ih_lock);
327 xfs_idestroy(ip);
328
329 XFS_STATS_INC(xs_ig_dup);
330 goto again;
331 }
332 }
333 }
334
335 /*
336 * These values _must_ be set before releasing ihlock!
337 */
338 ip->i_hash = ih;
339 if ((iq = ih->ih_next)) {
340 iq->i_prevp = &ip->i_next;
341 }
342 ip->i_next = iq;
343 ip->i_prevp = &ih->ih_next;
344 ih->ih_next = ip;
345 ip->i_udquot = ip->i_gdquot = NULL;
346 ih->ih_version++;
347 ip->i_flags |= XFS_INEW;
348
349 write_unlock(&ih->ih_lock);
350
351 /*
352 * put ip on its cluster's hash chain
353 */
354 ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
355 ip->i_cnext == NULL);
356
357 chlnew = NULL;
358 ch = XFS_CHASH(mp, ip->i_blkno);
359 chlredo:
360 s = mutex_spinlock(&ch->ch_lock);
361 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
362 if (chl->chl_blkno == ip->i_blkno) {
363
364 /* insert this inode into the doubly-linked list
365 * where chl points */
366 if ((iq = chl->chl_ip)) {
367 ip->i_cprev = iq->i_cprev;
368 iq->i_cprev->i_cnext = ip;
369 iq->i_cprev = ip;
370 ip->i_cnext = iq;
371 } else {
372 ip->i_cnext = ip;
373 ip->i_cprev = ip;
374 }
375 chl->chl_ip = ip;
376 ip->i_chash = chl;
377 break;
378 }
379 }
380
381 /* no hash list found for this block; add a new hash list */
382 if (chl == NULL) {
383 if (chlnew == NULL) {
384 mutex_spinunlock(&ch->ch_lock, s);
385 ASSERT(xfs_chashlist_zone != NULL);
386 chlnew = (xfs_chashlist_t *)
387 kmem_zone_alloc(xfs_chashlist_zone,
388 KM_SLEEP);
389 ASSERT(chlnew != NULL);
390 goto chlredo;
391 } else {
392 ip->i_cnext = ip;
393 ip->i_cprev = ip;
394 ip->i_chash = chlnew;
395 chlnew->chl_ip = ip;
396 chlnew->chl_blkno = ip->i_blkno;
397 chlnew->chl_next = ch->ch_list;
398 ch->ch_list = chlnew;
399 chlnew = NULL;
400 }
401 } else {
402 if (chlnew != NULL) {
403 kmem_zone_free(xfs_chashlist_zone, chlnew);
404 }
405 }
406
407 mutex_spinunlock(&ch->ch_lock, s);
408
409
410 /*
411 * Link ip to its mount and thread it on the mount's inode list.
412 */
413 XFS_MOUNT_ILOCK(mp);
414 if ((iq = mp->m_inodes)) {
415 ASSERT(iq->i_mprev->i_mnext == iq);
416 ip->i_mprev = iq->i_mprev;
417 iq->i_mprev->i_mnext = ip;
418 iq->i_mprev = ip;
419 ip->i_mnext = iq;
420 } else {
421 ip->i_mnext = ip;
422 ip->i_mprev = ip;
423 }
424 mp->m_inodes = ip;
425
426 XFS_MOUNT_IUNLOCK(mp);
427
428 return_ip:
429 ASSERT(ip->i_df.if_ext_max ==
430 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
431
432 ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
433 ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
434
435 *ipp = ip;
436
437 /*
438 * If we have a real type for an on-disk inode, we can set ops(&unlock)
439 * now. If it's a new inode being created, xfs_ialloc will handle it.
440 */
441 VFS_INIT_VNODE(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1);
442
443 return 0;
444}
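
The ih_version stamp used above turns "did anyone insert while we slept?"
into a single integer compare after relocking: every insert bumps the
counter, so a mismatch forces a re-walk of the chain looking for a
duplicate. A minimal sequential illustration of the pattern, with
hypothetical names and no real locking:

#include <assert.h>

struct chain_model {
	unsigned long version;	/* bumped on every insert */
};

static void chain_insert_model(struct chain_model *c)
{
	c->version++;
}

int main(void)
{
	struct chain_model c = { 0 };
	unsigned long saved = c.version;	/* read under the lock */

	chain_insert_model(&c);			/* racing insert elsewhere */
	/* On relock: a mismatch means we must re-check for duplicates. */
	assert(c.version != saved);
	return 0;
}
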
445
446
447/*
448 * The 'normal' internal xfs_iget; if needed it will
449 * 'allocate', or 'get', the vnode.
450 */
451int
452xfs_iget(
453 xfs_mount_t *mp,
454 xfs_trans_t *tp,
455 xfs_ino_t ino,
456 uint flags,
457 uint lock_flags,
458 xfs_inode_t **ipp,
459 xfs_daddr_t bno)
460{
461 struct inode *inode;
462 vnode_t *vp = NULL;
463 int error;
464
465retry:
466 XFS_STATS_INC(xs_ig_attempts);
467
468 if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
469 bhv_desc_t *bdp;
470 xfs_inode_t *ip;
471 int newnode;
472
473 vp = LINVFS_GET_VP(inode);
474 if (inode->i_state & I_NEW) {
475inode_allocate:
476 vn_initialize(inode);
477 error = xfs_iget_core(vp, mp, tp, ino, flags,
478 lock_flags, ipp, bno);
479 if (error) {
480 vn_mark_bad(vp);
481 if (inode->i_state & I_NEW)
482 unlock_new_inode(inode);
483 iput(inode);
484 }
485 } else {
 486			/* These are set if the inode is being inactivated or
 487			 * reclaimed. The linux inode is about to go away;
 488			 * wait for that path to finish, and try again.
489 */
490 if (vp->v_flag & (VINACT | VRECLM)) {
491 vn_wait(vp);
492 iput(inode);
493 goto retry;
494 }
495
496 if (is_bad_inode(inode)) {
497 iput(inode);
498 return EIO;
499 }
500
501 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
502 if (bdp == NULL) {
503 XFS_STATS_INC(xs_ig_dup);
504 goto inode_allocate;
505 }
506 ip = XFS_BHVTOI(bdp);
507 if (lock_flags != 0)
508 xfs_ilock(ip, lock_flags);
509 newnode = (ip->i_d.di_mode == 0);
510 if (newnode)
511 xfs_iocore_inode_reinit(ip);
512 XFS_STATS_INC(xs_ig_found);
513 *ipp = ip;
514 error = 0;
515 }
516 } else
517 error = ENOMEM; /* If we got no inode we are out of memory */
518
519 return error;
520}
521
522/*
523 * Do the setup for the various locks within the incore inode.
524 */
525void
526xfs_inode_lock_init(
527 xfs_inode_t *ip,
528 vnode_t *vp)
529{
530 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
531 "xfsino", (long)vp->v_number);
532 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
533 init_waitqueue_head(&ip->i_ipin_wait);
534 atomic_set(&ip->i_pincount, 0);
535 init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
536}
537
538/*
539 * Look for the inode corresponding to the given ino in the hash table.
540 * If it is there and its i_transp pointer matches tp, return it.
541 * Otherwise, return NULL.
542 */
543xfs_inode_t *
544xfs_inode_incore(xfs_mount_t *mp,
545 xfs_ino_t ino,
546 xfs_trans_t *tp)
547{
548 xfs_ihash_t *ih;
549 xfs_inode_t *ip;
550
551 ih = XFS_IHASH(mp, ino);
552 read_lock(&ih->ih_lock);
553 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
554 if (ip->i_ino == ino) {
555 /*
556 * If we find it and tp matches, return it.
557 * Otherwise break from the loop and return
558 * NULL.
559 */
560 if (ip->i_transp == tp) {
561 read_unlock(&ih->ih_lock);
562 return (ip);
563 }
564 break;
565 }
566 }
567 read_unlock(&ih->ih_lock);
568 return (NULL);
569}
570
571/*
572 * Decrement reference count of an inode structure and unlock it.
573 *
574 * ip -- the inode being released
 575 * lock_flags -- this parameter indicates the inode's locks
 576 * 	to be released. See the comment on xfs_iunlock() for a list
577 * of valid values.
578 */
579void
580xfs_iput(xfs_inode_t *ip,
581 uint lock_flags)
582{
583 vnode_t *vp = XFS_ITOV(ip);
584
585 vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
586
587 xfs_iunlock(ip, lock_flags);
588
589 VN_RELE(vp);
590}
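
A minimal sketch of the common get/put pairing (the flag choices are illustrative): xfs_iget() returns the inode with the requested locks held, and xfs_iput() undoes both the lock and the vnode reference in one call.

    error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
    if (error)
            return error;
    /* ... read ip->i_d fields under the shared inode lock ... */
    xfs_iput(ip, XFS_ILOCK_SHARED);     /* unlock and drop the vnode ref */
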
591
592/*
593 * Special iput for brand-new inodes that are still locked
594 */
595void
596xfs_iput_new(xfs_inode_t *ip,
597 uint lock_flags)
598{
599 vnode_t *vp = XFS_ITOV(ip);
600 struct inode *inode = LINVFS_GET_IP(vp);
601
602 vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
603
604 if ((ip->i_d.di_mode == 0)) {
605 ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE));
606 vn_mark_bad(vp);
607 }
608 if (inode->i_state & I_NEW)
609 unlock_new_inode(inode);
610 if (lock_flags)
611 xfs_iunlock(ip, lock_flags);
612 VN_RELE(vp);
613}
614
615
616/*
617 * This routine embodies the part of the reclaim code that pulls
618 * the inode from the inode hash table and the mount structure's
619 * inode list.
620 * This should only be called from xfs_reclaim().
621 */
622void
623xfs_ireclaim(xfs_inode_t *ip)
624{
625 vnode_t *vp;
626
627 /*
628 * Remove from old hash list and mount list.
629 */
630 XFS_STATS_INC(xs_ig_reclaims);
631
632 xfs_iextract(ip);
633
634 /*
635 * Here we do a spurious inode lock in order to coordinate with
636 * xfs_sync(). This is because xfs_sync() references the inodes
637 * in the mount list without taking references on the corresponding
638 * vnodes. We make that OK here by ensuring that we wait until
639 * the inode is unlocked in xfs_sync() before we go ahead and
640 * free it. We get both the regular lock and the io lock because
641 * the xfs_sync() code may need to drop the regular one but will
642 * still hold the io lock.
643 */
644 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
645
646 /*
647 * Release dquots (and their references) if any. An inode may escape
648 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
649 */
650 XFS_QM_DQDETACH(ip->i_mount, ip);
651
652 /*
653 * Pull our behavior descriptor from the vnode chain.
654 */
655 vp = XFS_ITOV_NULL(ip);
656 if (vp) {
657 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
658 }
659
660 /*
661 * Free all memory associated with the inode.
662 */
663 xfs_idestroy(ip);
664}
665
666/*
667 * This routine removes an about-to-be-destroyed inode from
668 * all of the lists in which it is located with the exception
669 * of the behavior chain.
670 */
671void
672xfs_iextract(
673 xfs_inode_t *ip)
674{
675 xfs_ihash_t *ih;
676 xfs_inode_t *iq;
677 xfs_mount_t *mp;
678 xfs_chash_t *ch;
679 xfs_chashlist_t *chl, *chm;
680 SPLDECL(s);
681
682 ih = ip->i_hash;
683 write_lock(&ih->ih_lock);
684 if ((iq = ip->i_next)) {
685 iq->i_prevp = ip->i_prevp;
686 }
687 *ip->i_prevp = iq;
688 write_unlock(&ih->ih_lock);
689
690 /*
691 * Remove from cluster hash list
692 * 1) delete the chashlist if this is the last inode on the chashlist
693 * 2) unchain from list of inodes
694 * 3) point chashlist->chl_ip to the next inode if it points to this inode.
695 */
696 mp = ip->i_mount;
697 ch = XFS_CHASH(mp, ip->i_blkno);
698 s = mutex_spinlock(&ch->ch_lock);
699
700 if (ip->i_cnext == ip) {
701 /* Last inode on chashlist */
702 ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
703 ASSERT(ip->i_chash != NULL);
704 chm=NULL;
705 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
706 if (chl->chl_blkno == ip->i_blkno) {
707 if (chm == NULL) {
708 /* first item on the list */
709 ch->ch_list = chl->chl_next;
710 } else {
711 chm->chl_next = chl->chl_next;
712 }
713 kmem_zone_free(xfs_chashlist_zone, chl);
714 break;
715 } else {
716 ASSERT(chl->chl_ip != ip);
717 chm = chl;
718 }
719 }
720 ASSERT_ALWAYS(chl != NULL);
721 } else {
722 /* delete one inode from a non-empty list */
723 iq = ip->i_cnext;
724 iq->i_cprev = ip->i_cprev;
725 ip->i_cprev->i_cnext = iq;
726 if (ip->i_chash->chl_ip == ip) {
727 ip->i_chash->chl_ip = iq;
728 }
729 ip->i_chash = __return_address;
730 ip->i_cprev = __return_address;
731 ip->i_cnext = __return_address;
732 }
733 mutex_spinunlock(&ch->ch_lock, s);
734
735 /*
736 * Remove from mount's inode list.
737 */
738 XFS_MOUNT_ILOCK(mp);
739 ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
740 iq = ip->i_mnext;
741 iq->i_mprev = ip->i_mprev;
742 ip->i_mprev->i_mnext = iq;
743
744 /*
745 * Fix up the head pointer if it points to the inode being deleted.
746 */
747 if (mp->m_inodes == ip) {
748 if (ip == iq) {
749 mp->m_inodes = NULL;
750 } else {
751 mp->m_inodes = iq;
752 }
753 }
754
755 /* Deal with the deleted inodes list */
756 list_del_init(&ip->i_reclaim);
757
758 mp->m_ireclaims++;
759 XFS_MOUNT_IUNLOCK(mp);
760}
761
762/*
763 * This is a wrapper routine around the xfs_ilock() routine
764 * used to centralize some grungy code. It is used in places
765 * that wish to lock the inode solely for reading the extents.
766 * The reason these places can't just call xfs_ilock(SHARED)
767 * is that the inode lock also guards the bringing in of the
768 * extents from disk for a file in b-tree format. If the inode
769 * is in b-tree format, then we need to lock the inode exclusively
770 * until the extents are read in. Locking it exclusively all
771 * the time would limit our parallelism unnecessarily, though.
772 * What we do instead is check to see if the extents have been
773 * read in yet, and only lock the inode exclusively if they
774 * have not.
775 *
776 * The function returns a value which should be given to the
777 * corresponding xfs_iunlock_map_shared(). This value is
778 * the mode in which the lock was actually taken.
779 */
780uint
781xfs_ilock_map_shared(
782 xfs_inode_t *ip)
783{
784 uint lock_mode;
785
786 if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
787 ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
788 lock_mode = XFS_ILOCK_EXCL;
789 } else {
790 lock_mode = XFS_ILOCK_SHARED;
791 }
792
793 xfs_ilock(ip, lock_mode);
794
795 return lock_mode;
796}
797
798/*
799 * This is simply the unlock routine to go with xfs_ilock_map_shared().
800 * All it does is call xfs_iunlock() with the given lock_mode.
801 */
802void
803xfs_iunlock_map_shared(
804 xfs_inode_t *ip,
805 unsigned int lock_mode)
806{
807 xfs_iunlock(ip, lock_mode);
808}
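
As a sketch of how these two routines pair up, a caller that only needs to read the extent list (the walking body is hypothetical) saves the returned mode and hands it back on unlock:

    uint lock_mode;

    lock_mode = xfs_ilock_map_shared(ip);   /* EXCL only if extents not read in */
    /* ... walk the data fork extent records ... */
    xfs_iunlock_map_shared(ip, lock_mode);  /* drop whichever mode was taken */
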
809
810/*
811 * The xfs inode contains 2 locks: a multi-reader lock called the
812 * i_iolock and a multi-reader lock called the i_lock. This routine
813 * allows either or both of the locks to be obtained.
814 *
815 * The 2 locks should always be ordered so that the IO lock is
816 * obtained first in order to prevent deadlock.
817 *
818 * ip -- the inode being locked
819 * lock_flags -- this parameter indicates the inode's locks
820 * to be locked. It can be:
821 * XFS_IOLOCK_SHARED,
822 * XFS_IOLOCK_EXCL,
823 * XFS_ILOCK_SHARED,
824 * XFS_ILOCK_EXCL,
825 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
826 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
827 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
828 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
829 */
830void
831xfs_ilock(xfs_inode_t *ip,
832 uint lock_flags)
833{
834 /*
835 * You can't set both SHARED and EXCL for the same lock,
836 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
837 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
838 */
839 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
840 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
841 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
842 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
843 ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
844
845 if (lock_flags & XFS_IOLOCK_EXCL) {
846 mrupdate(&ip->i_iolock);
847 } else if (lock_flags & XFS_IOLOCK_SHARED) {
848 mraccess(&ip->i_iolock);
849 }
850 if (lock_flags & XFS_ILOCK_EXCL) {
851 mrupdate(&ip->i_lock);
852 } else if (lock_flags & XFS_ILOCK_SHARED) {
853 mraccess(&ip->i_lock);
854 }
855 xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address);
856}
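
For illustration, a caller needing both locks takes them in a single call, which also encodes the IO-lock-before-inode-lock ordering described above (the critical section is hypothetical):

    xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
    /* ... file data and inode metadata are both protected here ... */
    xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
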
857
858/*
859 * This is just like xfs_ilock(), except that the caller
860 * is guaranteed not to sleep. It returns 1 if it gets
861 * the requested locks and 0 otherwise. If the IO lock is
862 * obtained but the inode lock cannot be, then the IO lock
863 * is dropped before returning.
864 *
865 * ip -- the inode being locked
866 * lock_flags -- this parameter indicates the inode's locks to
867 * be locked. See the comment for xfs_ilock() for a list
868 * of valid values.
869 *
870 */
871int
872xfs_ilock_nowait(xfs_inode_t *ip,
873 uint lock_flags)
874{
875 int iolocked;
876 int ilocked;
877
878 /*
879 * You can't set both SHARED and EXCL for the same lock,
880 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
881 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
882 */
883 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
884 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
885 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
886 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
887 ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
888
889 iolocked = 0;
890 if (lock_flags & XFS_IOLOCK_EXCL) {
891 iolocked = mrtryupdate(&ip->i_iolock);
892 if (!iolocked) {
893 return 0;
894 }
895 } else if (lock_flags & XFS_IOLOCK_SHARED) {
896 iolocked = mrtryaccess(&ip->i_iolock);
897 if (!iolocked) {
898 return 0;
899 }
900 }
901 if (lock_flags & XFS_ILOCK_EXCL) {
902 ilocked = mrtryupdate(&ip->i_lock);
903 if (!ilocked) {
904 if (iolocked) {
905 mrunlock(&ip->i_iolock);
906 }
907 return 0;
908 }
909 } else if (lock_flags & XFS_ILOCK_SHARED) {
910 ilocked = mrtryaccess(&ip->i_lock);
911 if (!ilocked) {
912 if (iolocked) {
913 mrunlock(&ip->i_iolock);
914 }
915 return 0;
916 }
917 }
918 xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
919 return 1;
920}
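
A hedged sketch of the try-lock pattern this enables; the fall-back-to-blocking policy is hypothetical, and callers that must not sleep would skip the inode instead:

    if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED)) {
            /* could not get both locks without sleeping */
            xfs_ilock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);
    }
    /* ... */
    xfs_iunlock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);
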
921
922/*
923 * xfs_iunlock() is used to drop the inode locks acquired with
924 * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
925 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
926 * that we know which locks to drop.
927 *
928 * ip -- the inode being unlocked
929 * lock_flags -- this parameter indicates the inode's locks to
930 * be unlocked. See the comment for xfs_ilock() for a list
931 * of valid values for this parameter.
932 *
933 */
934void
935xfs_iunlock(xfs_inode_t *ip,
936 uint lock_flags)
937{
938 /*
939 * You can't set both SHARED and EXCL for the same lock,
940 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
941 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
942 */
943 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
944 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
945 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
946 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
947 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
948 ASSERT(lock_flags != 0);
949
950 if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
951 ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) ||
952 (ismrlocked(&ip->i_iolock, MR_ACCESS)));
953 ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) ||
954 (ismrlocked(&ip->i_iolock, MR_UPDATE)));
955 mrunlock(&ip->i_iolock);
956 }
957
958 if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
959 ASSERT(!(lock_flags & XFS_ILOCK_SHARED) ||
960 (ismrlocked(&ip->i_lock, MR_ACCESS)));
961 ASSERT(!(lock_flags & XFS_ILOCK_EXCL) ||
962 (ismrlocked(&ip->i_lock, MR_UPDATE)));
963 mrunlock(&ip->i_lock);
964
965 /*
966 * Let the AIL know that this item has been unlocked in case
967 * it is in the AIL and anyone is waiting on it. Don't do
968 * this if the caller has asked us not to.
969 */
970 if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) &&
971 ip->i_itemp != NULL) {
972 xfs_trans_unlocked_item(ip->i_mount,
973 (xfs_log_item_t*)(ip->i_itemp));
974 }
975 }
976 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
977}
978
979/*
980 * Give up write locks. The I/O lock cannot be held nested
981 * if it is being demoted.
982 */
983void
984xfs_ilock_demote(xfs_inode_t *ip,
985 uint lock_flags)
986{
987 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
988 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
989
990 if (lock_flags & XFS_ILOCK_EXCL) {
991 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
992 mrdemote(&ip->i_lock);
993 }
994 if (lock_flags & XFS_IOLOCK_EXCL) {
995 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
996 mrdemote(&ip->i_iolock);
997 }
998}
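
As a sketch, demotion lets a writer finish its exclusive work and admit readers without an unlock/relock window in which the inode is unprotected:

    xfs_ilock(ip, XFS_ILOCK_EXCL);
    /* ... exclusive update ... */
    xfs_ilock_demote(ip, XFS_ILOCK_EXCL);   /* now held shared */
    /* ... read-only work; other readers may now enter ... */
    xfs_iunlock(ip, XFS_ILOCK_SHARED);
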
999
1000/*
1001 * The following three routines simply manage the i_flock
1002 * semaphore embedded in the inode. This semaphore synchronizes
1003 * processes attempting to flush the in-core inode back to disk.
1004 */
1005void
1006xfs_iflock(xfs_inode_t *ip)
1007{
1008 psema(&(ip->i_flock), PINOD|PLTWAIT);
1009}
1010
1011int
1012xfs_iflock_nowait(xfs_inode_t *ip)
1013{
1014 return (cpsema(&(ip->i_flock)));
1015}
1016
1017void
1018xfs_ifunlock(xfs_inode_t *ip)
1019{
1020 ASSERT(valusema(&(ip->i_flock)) <= 0);
1021 vsema(&(ip->i_flock));
1022}
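
A brief sketch of both acquisition styles (the flush body is hypothetical; the real flush paths release the lock via xfs_ifunlock() once the I/O completes):

    xfs_iflock(ip);                 /* blocking: serialize with other flushers */
    /* ... write the in-core inode back to its buffer ... */
    xfs_ifunlock(ip);

    if (xfs_iflock_nowait(ip)) {    /* opportunistic: skip if already flushing */
            /* ... */
            xfs_ifunlock(ip);
    }
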
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
new file mode 100644
index 000000000000..e385064a066a
--- /dev/null
+++ b/fs/xfs/xfs_imap.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_IMAP_H__
33#define __XFS_IMAP_H__
34
35/*
36 * This is the structure passed to xfs_imap() to map
37 * an inode number to its on disk location.
38 */
39typedef struct xfs_imap {
40 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
41 uint im_len; /* length in BBs of inode chunk */
42 xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
43 ushort im_ioffset; /* inode offset in block in "inodes" */
44 ushort im_boffset; /* inode offset in block in bytes */
45} xfs_imap_t;
46
47#ifdef __KERNEL__
48struct xfs_mount;
49struct xfs_trans;
50int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
51 xfs_imap_t *, uint);
52#endif
53
54#endif /* __XFS_IMAP_H__ */
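
For illustration, a lookup fills this structure in and the caller then knows exactly which disk blocks to read and where the inode sits inside them (mp, tp, and ino are assumed; compare the use in xfs_inotobp() below):

    xfs_imap_t  imap;
    int         error;

    imap.im_blkno = 0;      /* no hint: force a full lookup */
    error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
    if (error)
            return error;
    /* the chunk spans [im_blkno, im_blkno + im_len) in basic blocks;
     * the inode itself starts im_boffset bytes into that buffer */
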
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
new file mode 100644
index 000000000000..43c632ab86ad
--- /dev/null
+++ b/fs/xfs/xfs_inode.c
@@ -0,0 +1,3876 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_trans_priv.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_imap.h"
51#include "xfs_alloc.h"
52#include "xfs_ialloc.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_buf_item.h"
61#include "xfs_rw.h"
62#include "xfs_error.h"
63#include "xfs_bit.h"
64#include "xfs_utils.h"
65#include "xfs_dir2_trace.h"
66#include "xfs_quota.h"
67#include "xfs_mac.h"
68#include "xfs_acl.h"
69
70
71kmem_zone_t *xfs_ifork_zone;
72kmem_zone_t *xfs_inode_zone;
73kmem_zone_t *xfs_chashlist_zone;
74
75/*
76 * Used in xfs_itruncate(). This is the maximum number of extents
77 * freed from a file in a single transaction.
78 */
79#define XFS_ITRUNC_MAX_EXTENTS 2
80
81STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
82STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
83STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
84STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
85
86
87#ifdef DEBUG
88/*
89 * Make sure that the extents in the given memory buffer
90 * are valid.
91 */
92STATIC void
93xfs_validate_extents(
94 xfs_bmbt_rec_t *ep,
95 int nrecs,
96 int disk,
97 xfs_exntfmt_t fmt)
98{
99 xfs_bmbt_irec_t irec;
100 xfs_bmbt_rec_t rec;
101 int i;
102
103 for (i = 0; i < nrecs; i++) {
104 rec.l0 = get_unaligned((__uint64_t*)&ep->l0);
105 rec.l1 = get_unaligned((__uint64_t*)&ep->l1);
106 if (disk)
107 xfs_bmbt_disk_get_all(&rec, &irec);
108 else
109 xfs_bmbt_get_all(&rec, &irec);
110 if (fmt == XFS_EXTFMT_NOSTATE)
111 ASSERT(irec.br_state == XFS_EXT_NORM);
112 ep++;
113 }
114}
115#else /* DEBUG */
116#define xfs_validate_extents(ep, nrecs, disk, fmt)
117#endif /* DEBUG */
118
119/*
120 * Check that none of the inodes in the buffer have a next
121 * unlinked field of 0.
122 */
123#if defined(DEBUG)
124void
125xfs_inobp_check(
126 xfs_mount_t *mp,
127 xfs_buf_t *bp)
128{
129 int i;
130 int j;
131 xfs_dinode_t *dip;
132
133 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
134
135 for (i = 0; i < j; i++) {
136 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
137 i * mp->m_sb.sb_inodesize);
138 if (!dip->di_next_unlinked) {
139 xfs_fs_cmn_err(CE_ALERT, mp,
140 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.",
141 bp);
142 ASSERT(dip->di_next_unlinked);
143 }
144 }
145}
146#endif
147
148/*
149 * Called from bwrite on XFS inode buffers.
150 */
151void
152xfs_inobp_bwcheck(xfs_buf_t *bp)
153{
154 xfs_mount_t *mp;
155 int i;
156 int j;
157 xfs_dinode_t *dip;
158
159 ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL);
160
161 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
162
163
164 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
165
166 for (i = 0; i < j; i++) {
167 dip = (xfs_dinode_t *) xfs_buf_offset(bp,
168 i * mp->m_sb.sb_inodesize);
169 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
170 cmn_err(CE_WARN,
171"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x",
172 INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
173 (__uint64_t)(__psunsigned_t) bp,
174 (__int64_t) XFS_BUF_ADDR(bp),
175 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
176 xfs_fs_cmn_err(CE_WARN, mp,
177 "corrupt, unmount and run xfs_repair");
178 }
179 if (!dip->di_next_unlinked) {
180 cmn_err(CE_WARN,
181"Bad next_unlinked field (0) in XFS inode buffer 0x%p, starting blockno %Ld, offset 0x%x",
182 (__uint64_t)(__psunsigned_t) bp,
183 (__int64_t) XFS_BUF_ADDR(bp),
184 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
185 xfs_fs_cmn_err(CE_WARN, mp,
186 "corrupt, unmount and run xfs_repair");
187 }
188 }
189
190 return;
191}
192
193/*
194 * This routine is called to map an inode number within a file
195 * system to the buffer containing the on-disk version of the
196 * inode. It returns a pointer to the buffer containing the
197 * on-disk inode in the bpp parameter, and in the dipp parameter
198 * it returns a pointer to the on-disk inode within that buffer.
199 *
200 * If a non-zero error is returned, then the contents of bpp and
201 * dipp are undefined.
202 *
203 * Use xfs_imap() to determine the size and location of the
204 * buffer to read from disk.
205 */
206int
207xfs_inotobp(
208 xfs_mount_t *mp,
209 xfs_trans_t *tp,
210 xfs_ino_t ino,
211 xfs_dinode_t **dipp,
212 xfs_buf_t **bpp,
213 int *offset)
214{
215 int di_ok;
216 xfs_imap_t imap;
217 xfs_buf_t *bp;
218 int error;
219 xfs_dinode_t *dip;
220
221 /*
222 * Call the space management code to find the location of the
223 * inode on disk.
224 */
225 imap.im_blkno = 0;
226 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
227 if (error != 0) {
228 cmn_err(CE_WARN,
229 "xfs_inotobp: xfs_imap() returned an "
230 "error %d on %s. Returning error.", error, mp->m_fsname);
231 return error;
232 }
233
234 /*
235 * If the inode number maps to a block outside the bounds of the
236 * file system then return an error rather than calling read_buf
237 * and panicking when we get an error from the driver.
238 */
239 if ((imap.im_blkno + imap.im_len) >
240 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
241 cmn_err(CE_WARN,
242 "xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds "
243 "of the file system %s. Returning EINVAL.",
244 imap.im_blkno, imap.im_len,mp->m_fsname);
245 return XFS_ERROR(EINVAL);
246 }
247
248 /*
249 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
250 * default to just a read_buf() call.
251 */
252 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
253 (int)imap.im_len, XFS_BUF_LOCK, &bp);
254
255 if (error) {
256 cmn_err(CE_WARN,
257 "xfs_inotobp: xfs_trans_read_buf() returned an "
258 "error %d on %s. Returning error.", error, mp->m_fsname);
259 return error;
260 }
261 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
262 di_ok =
263 INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
264 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
265 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
266 XFS_RANDOM_ITOBP_INOTOBP))) {
267 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
268 xfs_trans_brelse(tp, bp);
269 cmn_err(CE_WARN,
270 "xfs_inotobp: XFS_TEST_ERROR() returned an "
271 "error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
272 return XFS_ERROR(EFSCORRUPTED);
273 }
274
275 xfs_inobp_check(mp, bp);
276
277 /*
278 * Set *dipp to point to the on-disk inode in the buffer.
279 */
280 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
281 *bpp = bp;
282 *offset = imap.im_boffset;
283 return 0;
284}
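
A minimal caller sketch (variable names hypothetical): map the inode number, inspect the on-disk inode, and release the buffer the same way this file does elsewhere:

    xfs_dinode_t  *dip;
    xfs_buf_t     *bp;
    int           offset, error;

    error = xfs_inotobp(mp, tp, ino, &dip, &bp, &offset);
    if (error)
            return error;
    /* ... examine dip->di_core fields, converting with INT_GET ... */
    xfs_trans_brelse(tp, bp);       /* plain brelse() when tp == NULL */
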
285
286
287/*
288 * This routine is called to map an inode to the buffer containing
289 * the on-disk version of the inode. It returns a pointer to the
290 * buffer containing the on-disk inode in the bpp parameter, and in
291 * the dipp parameter it returns a pointer to the on-disk inode within
292 * that buffer.
293 *
294 * If a non-zero error is returned, then the contents of bpp and
295 * dipp are undefined.
296 *
297 * If the inode is new and has not yet been initialized, use xfs_imap()
298 * to determine the size and location of the buffer to read from disk.
299 * If the inode has already been mapped to its buffer and read in once,
300 * then use the mapping information stored in the inode rather than
301 * calling xfs_imap(). This allows us to avoid the overhead of looking
302 * at the inode btree for small block file systems (see xfs_dilocate()).
303 * We can tell whether the inode has been mapped in before by comparing
304 * its disk block address to 0. Only uninitialized inodes will have
305 * 0 for the disk block address.
306 */
307int
308xfs_itobp(
309 xfs_mount_t *mp,
310 xfs_trans_t *tp,
311 xfs_inode_t *ip,
312 xfs_dinode_t **dipp,
313 xfs_buf_t **bpp,
314 xfs_daddr_t bno)
315{
316 xfs_buf_t *bp;
317 int error;
318 xfs_imap_t imap;
319#ifdef __KERNEL__
320 int i;
321 int ni;
322#endif
323
324 if (ip->i_blkno == (xfs_daddr_t)0) {
325 /*
326 * Call the space management code to find the location of the
327 * inode on disk.
328 */
329 imap.im_blkno = bno;
330 error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
331 if (error != 0) {
332 return error;
333 }
334
335 /*
336 * If the inode number maps to a block outside the bounds
337 * of the file system then return an error rather than calling
338 * read_buf and panicking when we get an error from the
339 * driver.
340 */
341 if ((imap.im_blkno + imap.im_len) >
342 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
343#ifdef DEBUG
344 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
345 "(imap.im_blkno (0x%llx) "
346 "+ imap.im_len (0x%llx)) > "
347 " XFS_FSB_TO_BB(mp, "
348 "mp->m_sb.sb_dblocks) (0x%llx)",
349 (unsigned long long) imap.im_blkno,
350 (unsigned long long) imap.im_len,
351 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
352#endif /* DEBUG */
353 return XFS_ERROR(EINVAL);
354 }
355
356 /*
357 * Fill in the fields in the inode that will be used to
358 * map the inode to its buffer from now on.
359 */
360 ip->i_blkno = imap.im_blkno;
361 ip->i_len = imap.im_len;
362 ip->i_boffset = imap.im_boffset;
363 } else {
364 /*
365 * We've already mapped the inode once, so just use the
366 * mapping that we saved the first time.
367 */
368 imap.im_blkno = ip->i_blkno;
369 imap.im_len = ip->i_len;
370 imap.im_boffset = ip->i_boffset;
371 }
372 ASSERT(bno == 0 || bno == imap.im_blkno);
373
374 /*
375 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
376 * default to just a read_buf() call.
377 */
378 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
379 (int)imap.im_len, XFS_BUF_LOCK, &bp);
380
381 if (error) {
382#ifdef DEBUG
383 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
384 "xfs_trans_read_buf() returned error %d, "
385 "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
386 error, (unsigned long long) imap.im_blkno,
387 (unsigned long long) imap.im_len);
388#endif /* DEBUG */
389 return error;
390 }
391#ifdef __KERNEL__
392 /*
393 * Validate the magic number and version of every inode in the buffer
394 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
395 */
396#ifdef DEBUG
397 ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
398#else
399 ni = 1;
400#endif
401 for (i = 0; i < ni; i++) {
402 int di_ok;
403 xfs_dinode_t *dip;
404
405 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
406 (i << mp->m_sb.sb_inodelog));
407 di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
408 XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
409 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
410 XFS_RANDOM_ITOBP_INOTOBP))) {
411#ifdef DEBUG
412 prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
413 mp->m_ddev_targp,
414 (unsigned long long)imap.im_blkno, i,
415 INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
416#endif
417 XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
418 mp, dip);
419 xfs_trans_brelse(tp, bp);
420 return XFS_ERROR(EFSCORRUPTED);
421 }
422 }
423#endif /* __KERNEL__ */
424
425 xfs_inobp_check(mp, bp);
426
427 /*
428 * Mark the buffer as an inode buffer now that it looks good
429 */
430 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
431
432 /*
433 * Set *dipp to point to the on-disk inode in the buffer.
434 */
435 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
436 *bpp = bp;
437 return 0;
438}
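
By contrast with xfs_inotobp() above, this routine starts from an in-core inode and reuses the cached mapping on every call after the first. A hedged usage sketch:

    error = xfs_itobp(mp, tp, ip, &dip, &bp, 0);   /* bno 0: no block hint */
    if (error)
            return error;
    /* ... copy fields between ip->i_d and dip, log or write bp ... */
    xfs_trans_brelse(tp, bp);
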
439
440/*
441 * Move inode type and inode format specific information from the
442 * on-disk inode to the in-core inode. For fifos, devs, and sockets
443 * this means set if_rdev to the proper value. For files, directories,
444 * and symlinks this means to bring in the in-line data or extent
445 * pointers. For a file in B-tree format, only the root is immediately
446 * brought in-core. The rest will be in-lined in if_extents when it
447 * is first referenced (see xfs_iread_extents()).
448 */
449STATIC int
450xfs_iformat(
451 xfs_inode_t *ip,
452 xfs_dinode_t *dip)
453{
454 xfs_attr_shortform_t *atp;
455 int size;
456 int error;
457 xfs_fsize_t di_size;
458 ip->i_df.if_ext_max =
459 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
460 error = 0;
461
462 if (unlikely(
463 INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
464 INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
465 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
466 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
467 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
468 " Unmount and run xfs_repair.",
469 (unsigned long long)ip->i_ino,
470 (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
471 + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
472 (unsigned long long)
473 INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
474 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
475 ip->i_mount, dip);
476 return XFS_ERROR(EFSCORRUPTED);
477 }
478
479 if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
480 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
481 "corrupt dinode %Lu, forkoff = 0x%x."
482 " Unmount and run xfs_repair.",
483 (unsigned long long)ip->i_ino,
484 (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
485 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
486 ip->i_mount, dip);
487 return XFS_ERROR(EFSCORRUPTED);
488 }
489
490 switch (ip->i_d.di_mode & S_IFMT) {
491 case S_IFIFO:
492 case S_IFCHR:
493 case S_IFBLK:
494 case S_IFSOCK:
495 if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
496 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
497 ip->i_mount, dip);
498 return XFS_ERROR(EFSCORRUPTED);
499 }
500 ip->i_d.di_size = 0;
501 ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
502 break;
503
504 case S_IFREG:
505 case S_IFLNK:
506 case S_IFDIR:
507 switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
508 case XFS_DINODE_FMT_LOCAL:
509 /*
510 * no local regular files yet
511 */
512 if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
513 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
514 "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.",
515 (unsigned long long) ip->i_ino);
516 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
517 XFS_ERRLEVEL_LOW,
518 ip->i_mount, dip);
519 return XFS_ERROR(EFSCORRUPTED);
520 }
521
522 di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
523 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
524 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
525 "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.",
526 (unsigned long long) ip->i_ino,
527 (long long) di_size);
528 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
529 XFS_ERRLEVEL_LOW,
530 ip->i_mount, dip);
531 return XFS_ERROR(EFSCORRUPTED);
532 }
533
534 size = (int)di_size;
535 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
536 break;
537 case XFS_DINODE_FMT_EXTENTS:
538 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
539 break;
540 case XFS_DINODE_FMT_BTREE:
541 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
542 break;
543 default:
544 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
545 ip->i_mount);
546 return XFS_ERROR(EFSCORRUPTED);
547 }
548 break;
549
550 default:
551 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
552 return XFS_ERROR(EFSCORRUPTED);
553 }
554 if (error) {
555 return error;
556 }
557 if (!XFS_DFORK_Q(dip))
558 return 0;
559 ASSERT(ip->i_afp == NULL);
560 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
561 ip->i_afp->if_ext_max =
562 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
563 switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
564 case XFS_DINODE_FMT_LOCAL:
565 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
566 size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
567 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
568 break;
569 case XFS_DINODE_FMT_EXTENTS:
570 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
571 break;
572 case XFS_DINODE_FMT_BTREE:
573 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
574 break;
575 default:
576 error = XFS_ERROR(EFSCORRUPTED);
577 break;
578 }
579 if (error) {
580 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
581 ip->i_afp = NULL;
582 xfs_idestroy_fork(ip, XFS_DATA_FORK);
583 }
584 return error;
585}
586
587/*
588 * The file is in-lined in the on-disk inode.
589 * If it fits into if_inline_data, then copy
590 * it there, otherwise allocate a buffer for it
591 * and copy the data there. Either way, set
592 * if_data to point at the data.
593 * If we allocate a buffer for the data, make
594 * sure that its size is a multiple of 4 and
595 * record the real size in if_real_bytes.
596 */
597STATIC int
598xfs_iformat_local(
599 xfs_inode_t *ip,
600 xfs_dinode_t *dip,
601 int whichfork,
602 int size)
603{
604 xfs_ifork_t *ifp;
605 int real_size;
606
607 /*
608 * If the size is unreasonable, then something
609 * is wrong and we just bail out rather than crash in
610 * kmem_alloc() or memcpy() below.
611 */
612 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
613 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
614 "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.",
615 (unsigned long long) ip->i_ino, size,
616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
617 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
618 ip->i_mount, dip);
619 return XFS_ERROR(EFSCORRUPTED);
620 }
621 ifp = XFS_IFORK_PTR(ip, whichfork);
622 real_size = 0;
623 if (size == 0)
624 ifp->if_u1.if_data = NULL;
625 else if (size <= sizeof(ifp->if_u2.if_inline_data))
626 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
627 else {
628 real_size = roundup(size, 4);
629 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
630 }
631 ifp->if_bytes = size;
632 ifp->if_real_bytes = real_size;
633 if (size)
634 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
635 ifp->if_flags &= ~XFS_IFEXTENTS;
636 ifp->if_flags |= XFS_IFINLINE;
637 return 0;
638}
639
640/*
641 * The file consists of a set of extents all
642 * of which fit into the on-disk inode.
643 * If there are few enough extents to fit into
644 * the if_inline_ext, then copy them there.
645 * Otherwise allocate a buffer for them and copy
646 * them into it. Either way, set if_extents
647 * to point at the extents.
648 */
649STATIC int
650xfs_iformat_extents(
651 xfs_inode_t *ip,
652 xfs_dinode_t *dip,
653 int whichfork)
654{
655 xfs_bmbt_rec_t *ep, *dp;
656 xfs_ifork_t *ifp;
657 int nex;
658 int real_size;
659 int size;
660 int i;
661
662 ifp = XFS_IFORK_PTR(ip, whichfork);
663 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
664 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
665
666 /*
667 * If the number of extents is unreasonable, then something
668 * is wrong and we just bail out rather than crash in
669 * kmem_alloc() or memcpy() below.
670 */
671 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
672 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
673 "corrupt inode %Lu ((a)extents = %d). Unmount and run xfs_repair.",
674 (unsigned long long) ip->i_ino, nex);
675 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
676 ip->i_mount, dip);
677 return XFS_ERROR(EFSCORRUPTED);
678 }
679
680 real_size = 0;
681 if (nex == 0)
682 ifp->if_u1.if_extents = NULL;
683 else if (nex <= XFS_INLINE_EXTS)
684 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
685 else {
686 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
687 ASSERT(ifp->if_u1.if_extents != NULL);
688 real_size = size;
689 }
690 ifp->if_bytes = size;
691 ifp->if_real_bytes = real_size;
692 if (size) {
693 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
694 xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
695 ep = ifp->if_u1.if_extents;
696 for (i = 0; i < nex; i++, ep++, dp++) {
697 ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
698 ARCH_CONVERT);
699 ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
700 ARCH_CONVERT);
701 }
702 xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
703 whichfork);
704 if (whichfork != XFS_DATA_FORK ||
705 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
706 if (unlikely(xfs_check_nostate_extents(
707 ifp->if_u1.if_extents, nex))) {
708 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
709 XFS_ERRLEVEL_LOW,
710 ip->i_mount);
711 return XFS_ERROR(EFSCORRUPTED);
712 }
713 }
714 ifp->if_flags |= XFS_IFEXTENTS;
715 return 0;
716}
717
718/*
719 * The file has too many extents to fit into
720 * the inode, so they are in B-tree format.
721 * Allocate a buffer for the root of the B-tree
722 * and copy the root into it. The i_extents
723 * field will remain NULL until all of the
724 * extents are read in (when they are needed).
725 */
726STATIC int
727xfs_iformat_btree(
728 xfs_inode_t *ip,
729 xfs_dinode_t *dip,
730 int whichfork)
731{
732 xfs_bmdr_block_t *dfp;
733 xfs_ifork_t *ifp;
734 /* REFERENCED */
735 int nrecs;
736 int size;
737
738 ifp = XFS_IFORK_PTR(ip, whichfork);
739 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
740 size = XFS_BMAP_BROOT_SPACE(dfp);
741 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
742
743 /*
744 * blow out if -- fork has fewer extents than can fit in
745 * fork (fork shouldn't be a btree format), root btree
746 * block has more records than can fit into the fork,
747 * or the number of extents is greater than the number of
748 * blocks.
749 */
750 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
751 || XFS_BMDR_SPACE_CALC(nrecs) >
752 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
753 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
754 xfs_fs_cmn_err(CE_WARN, ip->i_mount,
755 "corrupt inode %Lu (btree). Unmount and run xfs_repair.",
756 (unsigned long long) ip->i_ino);
757 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
758 ip->i_mount);
759 return XFS_ERROR(EFSCORRUPTED);
760 }
761
762 ifp->if_broot_bytes = size;
763 ifp->if_broot = kmem_alloc(size, KM_SLEEP);
764 ASSERT(ifp->if_broot != NULL);
765 /*
766 * Copy and convert from the on-disk structure
767 * to the in-memory structure.
768 */
769 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
770 ifp->if_broot, size);
771 ifp->if_flags &= ~XFS_IFEXTENTS;
772 ifp->if_flags |= XFS_IFBROOT;
773
774 return 0;
775}
776
777/*
778 * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
779 * and native format
780 *
781 * buf = on-disk representation
782 * dip = native representation
783 * dir = direction - +ve -> disk to native
784 * -ve -> native to disk
785 */
786void
787xfs_xlate_dinode_core(
788 xfs_caddr_t buf,
789 xfs_dinode_core_t *dip,
790 int dir)
791{
792 xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf;
793 xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip;
794 xfs_arch_t arch = ARCH_CONVERT;
795
796 ASSERT(dir);
797
798 INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
799 INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
800 INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch);
801 INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
802 INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
803 INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
804 INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
805 INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
806 INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);
807
808 if (dir > 0) {
809 memcpy(mem_core->di_pad, buf_core->di_pad,
810 sizeof(buf_core->di_pad));
811 } else {
812 memcpy(buf_core->di_pad, mem_core->di_pad,
813 sizeof(buf_core->di_pad));
814 }
815
816 INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);
817
818 INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
819 dir, arch);
820 INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
821 dir, arch);
822 INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
823 dir, arch);
824 INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
825 dir, arch);
826 INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
827 dir, arch);
828 INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
829 dir, arch);
830 INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
831 INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
832 INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
833 INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
834 INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
835 INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
836 INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
837 INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
838 INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
839 INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
840 INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
841}
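
The sign convention works out to one call per direction; the first form is exactly what xfs_iread() below does, and the second is the inverse used when flushing the inode back to its buffer:

    /* disk -> native (dir > 0) */
    xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, &ip->i_d, 1);

    /* native -> disk (dir < 0) */
    xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, &ip->i_d, -1);
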
842
843STATIC uint
844_xfs_dic2xflags(
845 xfs_dinode_core_t *dic,
846 __uint16_t di_flags)
847{
848 uint flags = 0;
849
850 if (di_flags & XFS_DIFLAG_ANY) {
851 if (di_flags & XFS_DIFLAG_REALTIME)
852 flags |= XFS_XFLAG_REALTIME;
853 if (di_flags & XFS_DIFLAG_PREALLOC)
854 flags |= XFS_XFLAG_PREALLOC;
855 if (di_flags & XFS_DIFLAG_IMMUTABLE)
856 flags |= XFS_XFLAG_IMMUTABLE;
857 if (di_flags & XFS_DIFLAG_APPEND)
858 flags |= XFS_XFLAG_APPEND;
859 if (di_flags & XFS_DIFLAG_SYNC)
860 flags |= XFS_XFLAG_SYNC;
861 if (di_flags & XFS_DIFLAG_NOATIME)
862 flags |= XFS_XFLAG_NOATIME;
863 if (di_flags & XFS_DIFLAG_NODUMP)
864 flags |= XFS_XFLAG_NODUMP;
865 if (di_flags & XFS_DIFLAG_RTINHERIT)
866 flags |= XFS_XFLAG_RTINHERIT;
867 if (di_flags & XFS_DIFLAG_PROJINHERIT)
868 flags |= XFS_XFLAG_PROJINHERIT;
869 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
870 flags |= XFS_XFLAG_NOSYMLINKS;
871 }
872
873 return flags;
874}
875
876uint
877xfs_ip2xflags(
878 xfs_inode_t *ip)
879{
880 xfs_dinode_core_t *dic = &ip->i_d;
881
882 return _xfs_dic2xflags(dic, dic->di_flags) |
883 (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
884}
885
886uint
887xfs_dic2xflags(
888 xfs_dinode_core_t *dic)
889{
890 return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
891 (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
892}
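
As a sketch, callers test the translated XFS_XFLAG_* bits rather than poking at di_flags directly; the policy shown is hypothetical:

    uint xflags = xfs_ip2xflags(ip);

    if (xflags & XFS_XFLAG_IMMUTABLE)
            return XFS_ERROR(EPERM);    /* refuse to modify an immutable file */
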
893
894/*
895 * Given a mount structure and an inode number, return a pointer
896 * to a newly allocated in-core inode corresponding to the given
897 * inode number.
898 *
899 * Initialize the inode's attributes and extent pointers if it
900 * already has them (it will not if the inode has no links).
901 */
902int
903xfs_iread(
904 xfs_mount_t *mp,
905 xfs_trans_t *tp,
906 xfs_ino_t ino,
907 xfs_inode_t **ipp,
908 xfs_daddr_t bno)
909{
910 xfs_buf_t *bp;
911 xfs_dinode_t *dip;
912 xfs_inode_t *ip;
913 int error;
914
915 ASSERT(xfs_inode_zone != NULL);
916
917 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
918 ip->i_ino = ino;
919 ip->i_mount = mp;
920
921 /*
922 * Get pointers to the on-disk inode and the buffer containing it.
923 * If the inode number refers to a block outside the file system
924 * then xfs_itobp() will return an error. In this case we should
925 * return the error as well. Set i_blkno to 0 so that xfs_itobp()
926 * will know that this is a new incore inode.
927 */
928 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
929
930 if (error != 0) {
931 kmem_zone_free(xfs_inode_zone, ip);
932 return error;
933 }
934
935 /*
936 * Initialize inode's trace buffers.
937 * Do this before xfs_iformat in case it adds entries.
938 */
939#ifdef XFS_BMAP_TRACE
940 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
941#endif
942#ifdef XFS_BMBT_TRACE
943 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
944#endif
945#ifdef XFS_RW_TRACE
946 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
947#endif
948#ifdef XFS_ILOCK_TRACE
949 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
950#endif
951#ifdef XFS_DIR2_TRACE
952 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
953#endif
954
955 /*
956 * If we got something that isn't an inode it means someone
957 * (nfs or dmi) has a stale handle.
958 */
959 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
960 kmem_zone_free(xfs_inode_zone, ip);
961 xfs_trans_brelse(tp, bp);
962#ifdef DEBUG
963 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
964 "dip->di_core.di_magic (0x%x) != "
965 "XFS_DINODE_MAGIC (0x%x)",
966 INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
967 XFS_DINODE_MAGIC);
968#endif /* DEBUG */
969 return XFS_ERROR(EINVAL);
970 }
971
972 /*
973 * If the on-disk inode is already linked to a directory
974 * entry, copy all of the inode into the in-core inode.
975 * xfs_iformat() handles copying in the inode format
976 * specific information.
977 * Otherwise, just get the truly permanent information.
978 */
979 if (dip->di_core.di_mode) {
980 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
981 &(ip->i_d), 1);
982 error = xfs_iformat(ip, dip);
983 if (error) {
984 kmem_zone_free(xfs_inode_zone, ip);
985 xfs_trans_brelse(tp, bp);
986#ifdef DEBUG
987 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
988 "xfs_iformat() returned error %d",
989 error);
990#endif /* DEBUG */
991 return error;
992 }
993 } else {
994 ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
995 ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
996 ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
997 ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
998 /*
999 * Make sure to pull in the mode here as well in
1000 * case the inode is released without being used.
1001 * This ensures that xfs_inactive() will see that
1002 * the inode is already free and not try to mess
1003 * with the uninitialized part of it.
1004 */
1005 ip->i_d.di_mode = 0;
1006 /*
1007 * Initialize the per-fork minima and maxima for a new
1008 * inode here. xfs_iformat will do it for old inodes.
1009 */
1010 ip->i_df.if_ext_max =
1011 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1012 }
1013
1014 INIT_LIST_HEAD(&ip->i_reclaim);
1015
1016 /*
1017 * The inode format changed when we moved the link count and
1018 * made it 32 bits long. If this is an old format inode,
1019 * convert it in memory to look like a new one. If it gets
1020 * flushed to disk we will convert back before flushing or
1021 * logging it. We zero out the new projid field and the old link
1022 * count field. We'll handle clearing the pad field (the remains
1023 * of the old uuid field) when we actually convert the inode to
1024 * the new format. We don't change the version number so that we
1025 * can distinguish this from a real new format inode.
1026 */
1027 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1028 ip->i_d.di_nlink = ip->i_d.di_onlink;
1029 ip->i_d.di_onlink = 0;
1030 ip->i_d.di_projid = 0;
1031 }
1032
1033 ip->i_delayed_blks = 0;
1034
1035 /*
1036 * Mark the buffer containing the inode as something to keep
1037 * around for a while. This helps to keep recently accessed
1038 * meta-data in-core longer.
1039 */
1040 XFS_BUF_SET_REF(bp, XFS_INO_REF);
1041
1042 /*
1043 * Use xfs_trans_brelse() to release the buffer containing the
1044 * on-disk inode, because it was acquired with xfs_trans_read_buf()
1045 * in xfs_itobp() above. If tp is NULL, this is just a normal
1046 * brelse(). If we're within a transaction, then xfs_trans_brelse()
1047 * will only release the buffer if it is not dirty within the
1048 * transaction. It will be OK to release the buffer in this case,
1049 * because inodes on disk are never destroyed and we will be
1050 * locking the new in-core inode before putting it in the hash
1051 * table where other processes can find it. Thus we don't have
1052 * to worry about the inode being changed just because we released
1053 * the buffer.
1054 */
1055 xfs_trans_brelse(tp, bp);
1056 *ipp = ip;
1057 return 0;
1058}
1059
1060/*
1061 * Read in extents from a btree-format inode.
1062 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
1063 */
1064int
1065xfs_iread_extents(
1066 xfs_trans_t *tp,
1067 xfs_inode_t *ip,
1068 int whichfork)
1069{
1070 int error;
1071 xfs_ifork_t *ifp;
1072 size_t size;
1073
1074 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1075 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1076 ip->i_mount);
1077 return XFS_ERROR(EFSCORRUPTED);
1078 }
1079 size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
1080 ifp = XFS_IFORK_PTR(ip, whichfork);
1081 /*
1082 * We know that the size is valid (it's checked in iformat_btree)
1083 */
1084 ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
1085 ASSERT(ifp->if_u1.if_extents != NULL);
1086 ifp->if_lastex = NULLEXTNUM;
1087 ifp->if_bytes = ifp->if_real_bytes = (int)size;
1088 ifp->if_flags |= XFS_IFEXTENTS;
1089 error = xfs_bmap_read_extents(tp, ip, whichfork);
1090 if (error) {
1091 kmem_free(ifp->if_u1.if_extents, size);
1092 ifp->if_u1.if_extents = NULL;
1093 ifp->if_bytes = ifp->if_real_bytes = 0;
1094 ifp->if_flags &= ~XFS_IFEXTENTS;
1095 return error;
1096 }
1097 xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
1098 XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
1099 return 0;
1100}
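
The canonical caller pattern reads the extents in lazily, only for btree-format forks whose extents are not yet in core (this is the idiom the bmap code uses):

    if (!(ifp->if_flags & XFS_IFEXTENTS) &&
        (error = xfs_iread_extents(tp, ip, whichfork)))
            return error;
    /* ifp->if_u1.if_extents is now valid */
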
1101
1102/*
1103 * Allocate an inode on disk and return a copy of its in-core version.
1104 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
1105 * appropriately within the inode. The uid and gid for the inode are
1106 * set according to the contents of the given cred structure.
1107 *
1108 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1109 * has a free inode available, call xfs_iget()
1110 * to obtain the in-core version of the allocated inode. Finally,
1111 * fill in the inode and log its initial contents. In this case,
1112 * ialloc_context would be set to NULL and call_again set to false.
1113 *
1114 * If xfs_dialloc() does not have an available inode,
1115 * it will replenish its supply by doing an allocation. Since we can
1116 * only do one allocation within a transaction without deadlocks, we
1117 * must commit the current transaction before returning the inode itself.
1118 * In this case, therefore, we will set call_again to true and return.
1119 * The caller should then commit the current transaction, start a new
1120 * transaction, and call xfs_ialloc() again to actually get the inode.
1121 *
1122 * To ensure that some other process does not grab the inode that
1123 * was allocated during the first call to xfs_ialloc(), this routine
1124 * also returns the [locked] bp pointing to the head of the freelist
1125 * as ialloc_context. The caller should hold this buffer across
1126 * the commit and pass it back into this routine on the second call.
1127 */
1128int
1129xfs_ialloc(
1130 xfs_trans_t *tp,
1131 xfs_inode_t *pip,
1132 mode_t mode,
1133 nlink_t nlink,
1134 xfs_dev_t rdev,
1135 cred_t *cr,
1136 xfs_prid_t prid,
1137 int okalloc,
1138 xfs_buf_t **ialloc_context,
1139 boolean_t *call_again,
1140 xfs_inode_t **ipp)
1141{
1142 xfs_ino_t ino;
1143 xfs_inode_t *ip;
1144 vnode_t *vp;
1145 uint flags;
1146 int error;
1147
1148 /*
1149 * Call the space management code to pick
1150 * the on-disk inode to be allocated.
1151 */
1152 error = xfs_dialloc(tp, pip->i_ino, mode, okalloc,
1153 ialloc_context, call_again, &ino);
1154 if (error != 0) {
1155 return error;
1156 }
1157 if (*call_again || ino == NULLFSINO) {
1158 *ipp = NULL;
1159 return 0;
1160 }
1161 ASSERT(*ialloc_context == NULL);
1162
1163 /*
1164 * Get the in-core inode with the lock held exclusively.
1165 * This is because we're setting fields here that we need
1166 * to prevent others from looking at until we're done.
1167 */
1168 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1169 IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1170 if (error != 0) {
1171 return error;
1172 }
1173 ASSERT(ip != NULL);
1174
1175 vp = XFS_ITOV(ip);
1176 vp->v_type = IFTOVT(mode);
1177 ip->i_d.di_mode = (__uint16_t)mode;
1178 ip->i_d.di_onlink = 0;
1179 ip->i_d.di_nlink = nlink;
1180 ASSERT(ip->i_d.di_nlink == nlink);
1181 ip->i_d.di_uid = current_fsuid(cr);
1182 ip->i_d.di_gid = current_fsgid(cr);
1183 ip->i_d.di_projid = prid;
1184 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1185
1186 /*
1187 * If the superblock version is up to where we support new format
1188 * inodes and this is currently an old format inode, then change
1189 * the inode version number now. This way we only do the conversion
1190 * here rather than here and in the flush/logging code.
1191 */
1192 if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
1193 ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1194 ip->i_d.di_version = XFS_DINODE_VERSION_2;
1195 /*
1196 * We've already zeroed the old link count, the projid field,
1197 * and the pad field.
1198 */
1199 }
1200
1201 /*
1202 * Project ids won't be stored on disk if we are using a version 1 inode.
1203 */
1204 if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1205 xfs_bump_ino_vers2(tp, ip);
1206
1207 if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
1208 ip->i_d.di_gid = pip->i_d.di_gid;
1209 if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1210 ip->i_d.di_mode |= S_ISGID;
1211 }
1212 }
1213
1214 /*
1215 * If the group ID of the new file does not match the effective group
1216 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared,
1217 * but only if the irix_sgid_inherit compatibility variable is set.
1218 */
1219 if ((irix_sgid_inherit) &&
1220 (ip->i_d.di_mode & S_ISGID) &&
1221 (!in_group_p((gid_t)ip->i_d.di_gid))) {
1222 ip->i_d.di_mode &= ~S_ISGID;
1223 }
1224
1225 ip->i_d.di_size = 0;
1226 ip->i_d.di_nextents = 0;
1227 ASSERT(ip->i_d.di_nblocks == 0);
1228 xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
1229 /*
1230 * di_gen will have been taken care of in xfs_iread.
1231 */
1232 ip->i_d.di_extsize = 0;
1233 ip->i_d.di_dmevmask = 0;
1234 ip->i_d.di_dmstate = 0;
1235 ip->i_d.di_flags = 0;
1236 flags = XFS_ILOG_CORE;
1237 switch (mode & S_IFMT) {
1238 case S_IFIFO:
1239 case S_IFCHR:
1240 case S_IFBLK:
1241 case S_IFSOCK:
1242 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1243 ip->i_df.if_u2.if_rdev = rdev;
1244 ip->i_df.if_flags = 0;
1245 flags |= XFS_ILOG_DEV;
1246 break;
1247 case S_IFREG:
1248 case S_IFDIR:
1249 if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1250 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
1251 if ((mode & S_IFMT) == S_IFDIR) {
1252 ip->i_d.di_flags |= XFS_DIFLAG_RTINHERIT;
1253 } else {
1254 ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
1255 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
1256 }
1257 }
1258 if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1259 xfs_inherit_noatime)
1260 ip->i_d.di_flags |= XFS_DIFLAG_NOATIME;
1261 if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1262 xfs_inherit_nodump)
1263 ip->i_d.di_flags |= XFS_DIFLAG_NODUMP;
1264 if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1265 xfs_inherit_sync)
1266 ip->i_d.di_flags |= XFS_DIFLAG_SYNC;
1267 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1268 xfs_inherit_nosymlinks)
1269 ip->i_d.di_flags |= XFS_DIFLAG_NOSYMLINKS;
1270 }
1271 /* FALLTHROUGH */
1272 case S_IFLNK:
1273 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1274 ip->i_df.if_flags = XFS_IFEXTENTS;
1275 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1276 ip->i_df.if_u1.if_extents = NULL;
1277 break;
1278 default:
1279 ASSERT(0);
1280 }
1281 /*
1282 * Attribute fork settings for new inode.
1283 */
1284 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1285 ip->i_d.di_anextents = 0;
1286
1287 /*
1288 * Log the new values stuffed into the inode.
1289 */
1290 xfs_trans_log_inode(tp, ip, flags);
1291
1292 /* now that we have a v_type we can set Linux inode ops (& unlock) */
1293 VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
1294
1295 *ipp = ip;
1296 return 0;
1297}
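
A hedged sketch of the two-phase protocol described above, with the transaction commit/reserve step elided (xfs_dir_ialloc() in xfs_utils.c wraps a loop of this shape):

    xfs_buf_t   *ialloc_context = NULL;
    boolean_t   call_again = B_FALSE;
    xfs_inode_t *ip;

    error = xfs_ialloc(tp, dp, mode, nlink, rdev, cr, prid, okalloc,
                       &ialloc_context, &call_again, &ip);
    if (!error && call_again) {
            xfs_trans_bhold(tp, ialloc_context);  /* keep freelist bp locked */
            /* ... commit tp, start and reserve a new tp ... */
            error = xfs_ialloc(tp, dp, mode, nlink, rdev, cr, prid, okalloc,
                               &ialloc_context, &call_again, &ip);
    }
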
1298
1299/*
1300 * Check to make sure that there are no blocks allocated to the
1301 * file beyond the size of the file. We don't check this for
1302 * files with fixed size extents or real time extents, but we
1303 * at least do it for regular files.
1304 */
1305#ifdef DEBUG
1306void
1307xfs_isize_check(
1308 xfs_mount_t *mp,
1309 xfs_inode_t *ip,
1310 xfs_fsize_t isize)
1311{
1312 xfs_fileoff_t map_first;
1313 int nimaps;
1314 xfs_bmbt_irec_t imaps[2];
1315
1316 if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
1317 return;
1318
1319	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1320 return;
1321
1322 nimaps = 2;
1323 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1324 /*
1325 * The filesystem could be shutting down, so bmapi may return
1326 * an error.
1327 */
1328 if (xfs_bmapi(NULL, ip, map_first,
1329 (XFS_B_TO_FSB(mp,
1330 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) -
1331 map_first),
1332 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
1333 NULL))
1334 return;
1335 ASSERT(nimaps == 1);
1336 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1337}
1338#endif /* DEBUG */
1339
1340/*
1341 * Calculate the last possible buffered byte in a file. This must
1342 * include data that was buffered beyond the EOF by the write code.
1343 * This also needs to deal with overflowing the xfs_fsize_t type
1344 * which can happen for sizes near the limit.
1345 *
1346 * We also need to take into account any blocks beyond the EOF. It
1347 * may be the case that they were buffered by a write which failed.
1348 * In that case the pages will still be in memory, but the inode size
1349 * will never have been updated.
1350 */
1351xfs_fsize_t
1352xfs_file_last_byte(
1353 xfs_inode_t *ip)
1354{
1355 xfs_mount_t *mp;
1356 xfs_fsize_t last_byte;
1357 xfs_fileoff_t last_block;
1358 xfs_fileoff_t size_last_block;
1359 int error;
1360
1361 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
1362
1363 mp = ip->i_mount;
1364 /*
1365 * Only check for blocks beyond the EOF if the extents have
1366 * been read in. This eliminates the need for the inode lock,
1367 * and it also saves us from looking when it really isn't
1368 * necessary.
1369 */
1370 if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1371 error = xfs_bmap_last_offset(NULL, ip, &last_block,
1372 XFS_DATA_FORK);
1373 if (error) {
1374 last_block = 0;
1375 }
1376 } else {
1377 last_block = 0;
1378 }
1379 size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
1380 last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1381
1382 last_byte = XFS_FSB_TO_B(mp, last_block);
1383 if (last_byte < 0) {
1384 return XFS_MAXIOFFSET(mp);
1385 }
1386 last_byte += (1 << mp->m_writeio_log);
1387 if (last_byte < 0) {
1388 return XFS_MAXIOFFSET(mp);
1389 }
1390 return last_byte;
1391}
1392
1393#if defined(XFS_RW_TRACE)
1394STATIC void
1395xfs_itrunc_trace(
1396 int tag,
1397 xfs_inode_t *ip,
1398 int flag,
1399 xfs_fsize_t new_size,
1400 xfs_off_t toss_start,
1401 xfs_off_t toss_finish)
1402{
1403 if (ip->i_rwtrace == NULL) {
1404 return;
1405 }
1406
1407 ktrace_enter(ip->i_rwtrace,
1408 (void*)((long)tag),
1409 (void*)ip,
1410 (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff),
1411 (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff),
1412 (void*)((long)flag),
1413 (void*)(unsigned long)((new_size >> 32) & 0xffffffff),
1414 (void*)(unsigned long)(new_size & 0xffffffff),
1415 (void*)(unsigned long)((toss_start >> 32) & 0xffffffff),
1416 (void*)(unsigned long)(toss_start & 0xffffffff),
1417 (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff),
1418 (void*)(unsigned long)(toss_finish & 0xffffffff),
1419 (void*)(unsigned long)current_cpu(),
1420 (void*)0,
1421 (void*)0,
1422 (void*)0,
1423 (void*)0);
1424}
1425#else
1426#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
1427#endif
1428
1429/*
1430 * Start the truncation of the file to new_size. The new size
1431 * must be smaller than the current size. This routine will
1432 * clear the buffer and page caches of file data in the removed
1433 * range, and xfs_itruncate_finish() will remove the underlying
1434 * disk blocks.
1435 *
1436 * The inode must have its I/O lock locked EXCLUSIVELY, and it
1437 * must NOT have the inode lock held at all. This is because we're
1438 * calling into the buffer/page cache code and we can't hold the
1439 * inode lock when we do so.
1440 *
1441 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1442 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
1443 * in the case that the caller is locking things out of order and
1444 * may not be able to call xfs_itruncate_finish() with the inode lock
1445 * held without dropping the I/O lock. If the caller must drop the
1446 * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1447 * must be called again with all the same restrictions as the initial
1448 * call.
1449 */
1450void
1451xfs_itruncate_start(
1452 xfs_inode_t *ip,
1453 uint flags,
1454 xfs_fsize_t new_size)
1455{
1456 xfs_fsize_t last_byte;
1457 xfs_off_t toss_start;
1458 xfs_mount_t *mp;
1459 vnode_t *vp;
1460
1461 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1462 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1463 ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1464 (flags == XFS_ITRUNC_MAYBE));
1465
1466 mp = ip->i_mount;
1467 vp = XFS_ITOV(ip);
1468 /*
1469	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of
1470	 * pages and buffers overlapping the region being removed.  We
1471	 * have to use the less efficient VOP_FLUSHINVAL_PAGES() in the
1472	 * case that the caller may not be able to finish the truncate
1473	 * without dropping the inode's I/O lock.  Make sure
1474 * to catch any pages brought in by buffers overlapping
1475 * the EOF by searching out beyond the isize by our
1476 * block size. We round new_size up to a block boundary
1477 * so that we don't toss things on the same block as
1478 * new_size but before it.
1479 *
1480 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
1481 * call remapf() over the same region if the file is mapped.
1482 * This frees up mapped file references to the pages in the
1483 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
1484 * that we get the latest mapped changes flushed out.
1485 */
1486 toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1487 toss_start = XFS_FSB_TO_B(mp, toss_start);
1488 if (toss_start < 0) {
1489 /*
1490 * The place to start tossing is beyond our maximum
1491 * file size, so there is no way that the data extended
1492 * out there.
1493 */
1494 return;
1495 }
1496 last_byte = xfs_file_last_byte(ip);
1497 xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
1498 last_byte);
1499 if (last_byte > toss_start) {
1500 if (flags & XFS_ITRUNC_DEFINITE) {
1501 VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1502 } else {
1503 VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1504 }
1505 }
1506
1507#ifdef DEBUG
1508 if (new_size == 0) {
1509 ASSERT(VN_CACHED(vp) == 0);
1510 }
1511#endif
1512}
1513
1514/*
1515 * Shrink the file to the given new_size. The new
1516 * size must be smaller than the current size.
1517 * This will free up the underlying blocks
1518 * in the removed range after a call to xfs_itruncate_start()
1519 * or xfs_atruncate_start().
1520 *
1521 * The transaction passed to this routine must have made
1522 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
1523 * This routine may commit the given transaction and
1524 * start new ones, so make sure everything involved in
1525 * the transaction is tidy before calling here.
1526 * Some transaction will be returned to the caller to be
1527 * committed. The incoming transaction must already include
1528 * the inode, and both inode locks must be held exclusively.
1529 * The inode must also be "held" within the transaction. On
1530 * return the inode will be "held" within the returned transaction.
1531 * This routine does NOT require any disk space to be reserved
1532 * for it within the transaction.
1533 *
1534	 * The fork parameter must be either XFS_ATTR_FORK or XFS_DATA_FORK,
1535 * and it indicates the fork which is to be truncated. For the
1536 * attribute fork we only support truncation to size 0.
1537 *
1538 * We use the sync parameter to indicate whether or not the first
1539 * transaction we perform might have to be synchronous. For the attr fork,
1540 * it needs to be so if the unlink of the inode is not yet known to be
1541 * permanent in the log. This keeps us from freeing and reusing the
1542 * blocks of the attribute fork before the unlink of the inode becomes
1543 * permanent.
1544 *
1545 * For the data fork, we normally have to run synchronously if we're
1546 * being called out of the inactive path or we're being called
1547 * out of the create path where we're truncating an existing file.
1548 * Either way, the truncate needs to be sync so blocks don't reappear
1549 * in the file with altered data in case of a crash. wsync filesystems
1550 * can run the first case async because anything that shrinks the inode
1551 * has to run sync so by the time we're called here from inactive, the
1552 * inode size is permanently set to 0.
1553 *
1554 * Calls from the truncate path always need to be sync unless we're
1555 * in a wsync filesystem and the file has already been unlinked.
1556 *
1557 * The caller is responsible for correctly setting the sync parameter.
1558 * It gets too hard for us to guess here which path we're being called
1559 * out of just based on inode state.
1560 */
1561int
1562xfs_itruncate_finish(
1563 xfs_trans_t **tp,
1564 xfs_inode_t *ip,
1565 xfs_fsize_t new_size,
1566 int fork,
1567 int sync)
1568{
1569 xfs_fsblock_t first_block;
1570 xfs_fileoff_t first_unmap_block;
1571 xfs_fileoff_t last_block;
1572	xfs_filblks_t	unmap_len = 0;
1573 xfs_mount_t *mp;
1574 xfs_trans_t *ntp;
1575 int done;
1576 int committed;
1577 xfs_bmap_free_t free_list;
1578 int error;
1579
1580 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1581 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
1582 ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1583 ASSERT(*tp != NULL);
1584 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1585 ASSERT(ip->i_transp == *tp);
1586 ASSERT(ip->i_itemp != NULL);
1587 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1588
1589
1590 ntp = *tp;
1591	mp = ntp->t_mountp;
1592	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1593
1594 /*
1595 * We only support truncating the entire attribute fork.
1596 */
1597 if (fork == XFS_ATTR_FORK) {
1598 new_size = 0LL;
1599 }
1600 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1601 xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1602 /*
1603 * The first thing we do is set the size to new_size permanently
1604 * on disk. This way we don't have to worry about anyone ever
1605 * being able to look at the data being freed even in the face
1606 * of a crash. What we're getting around here is the case where
1607 * we free a block, it is allocated to another file, it is written
1608 * to, and then we crash. If the new data gets written to the
1609 * file but the log buffers containing the free and reallocation
1610 * don't, then we'd end up with garbage in the blocks being freed.
1611 * As long as we make the new_size permanent before actually
1612	 * freeing any blocks it doesn't matter if they get written to.
1613 *
1614 * The callers must signal into us whether or not the size
1615 * setting here must be synchronous. There are a few cases
1616 * where it doesn't have to be synchronous. Those cases
1617 * occur if the file is unlinked and we know the unlink is
1618 * permanent or if the blocks being truncated are guaranteed
1619 * to be beyond the inode eof (regardless of the link count)
1620 * and the eof value is permanent. Both of these cases occur
1621 * only on wsync-mounted filesystems. In those cases, we're
1622 * guaranteed that no user will ever see the data in the blocks
1623 * that are being truncated so the truncate can run async.
1624 * In the free beyond eof case, the file may wind up with
1625 * more blocks allocated to it than it needs if we crash
1626 * and that won't get fixed until the next time the file
1627 * is re-opened and closed but that's ok as that shouldn't
1628 * be too many blocks.
1629 *
1630 * However, we can't just make all wsync xactions run async
1631 * because there's one call out of the create path that needs
1632 * to run sync where it's truncating an existing file to size
1633 * 0 whose size is > 0.
1634 *
1635 * It's probably possible to come up with a test in this
1636 * routine that would correctly distinguish all the above
1637 * cases from the values of the function parameters and the
1638 * inode state but for sanity's sake, I've decided to let the
1639 * layers above just tell us. It's simpler to correctly figure
1640 * out in the layer above exactly under what conditions we
1641	 * can run async, and I think it's easier for others to read and
1642 * follow the logic in case something has to be changed.
1643 * cscope is your friend -- rcc.
1644 *
1645 * The attribute fork is much simpler.
1646 *
1647 * For the attribute fork we allow the caller to tell us whether
1648 * the unlink of the inode that led to this call is yet permanent
1649 * in the on disk log. If it is not and we will be freeing extents
1650 * in this inode then we make the first transaction synchronous
1651 * to make sure that the unlink is permanent by the time we free
1652 * the blocks.
1653 */
1654 if (fork == XFS_DATA_FORK) {
1655 if (ip->i_d.di_nextents > 0) {
1656 ip->i_d.di_size = new_size;
1657 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1658 }
1659 } else if (sync) {
1660 ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1661 if (ip->i_d.di_anextents > 0)
1662 xfs_trans_set_sync(ntp);
1663 }
1664 ASSERT(fork == XFS_DATA_FORK ||
1665 (fork == XFS_ATTR_FORK &&
1666 ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1667 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1668
1669 /*
1670 * Since it is possible for space to become allocated beyond
1671 * the end of the file (in a crash where the space is allocated
1672 * but the inode size is not yet updated), simply remove any
1673 * blocks which show up between the new EOF and the maximum
1674 * possible file size. If the first block to be removed is
1675 * beyond the maximum file size (ie it is the same as last_block),
1676 * then there is nothing to do.
1677 */
1678 last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1679 ASSERT(first_unmap_block <= last_block);
1680 done = 0;
1681 if (last_block == first_unmap_block) {
1682 done = 1;
1683 } else {
1684 unmap_len = last_block - first_unmap_block + 1;
1685 }
1686 while (!done) {
1687 /*
1688		 * Free up to XFS_ITRUNC_MAX_EXTENTS extents.  xfs_bunmapi()
1689 * will tell us whether it freed the entire range or
1690 * not. If this is a synchronous mount (wsync),
1691 * then we can tell bunmapi to keep all the
1692 * transactions asynchronous since the unlink
1693 * transaction that made this inode inactive has
1694 * already hit the disk. There's no danger of
1695 * the freed blocks being reused, there being a
1696 * crash, and the reused blocks suddenly reappearing
1697 * in this file with garbage in them once recovery
1698 * runs.
1699 */
1700 XFS_BMAP_INIT(&free_list, &first_block);
1701 error = xfs_bunmapi(ntp, ip, first_unmap_block,
1702 unmap_len,
1703 XFS_BMAPI_AFLAG(fork) |
1704 (sync ? 0 : XFS_BMAPI_ASYNC),
1705 XFS_ITRUNC_MAX_EXTENTS,
1706 &first_block, &free_list, &done);
1707 if (error) {
1708 /*
1709 * If the bunmapi call encounters an error,
1710 * return to the caller where the transaction
1711 * can be properly aborted. We just need to
1712 * make sure we're not holding any resources
1713 * that we were not when we came in.
1714 */
1715 xfs_bmap_cancel(&free_list);
1716 return error;
1717 }
1718
1719 /*
1720 * Duplicate the transaction that has the permanent
1721 * reservation and commit the old transaction.
1722 */
1723 error = xfs_bmap_finish(tp, &free_list, first_block,
1724 &committed);
1725 ntp = *tp;
1726 if (error) {
1727 /*
1728 * If the bmap finish call encounters an error,
1729 * return to the caller where the transaction
1730 * can be properly aborted. We just need to
1731 * make sure we're not holding any resources
1732 * that we were not when we came in.
1733 *
1734 * Aborting from this point might lose some
1735 * blocks in the file system, but oh well.
1736 */
1737 xfs_bmap_cancel(&free_list);
1738 if (committed) {
1739 /*
1740 * If the passed in transaction committed
1741 * in xfs_bmap_finish(), then we want to
1742 * add the inode to this one before returning.
1743 * This keeps things simple for the higher
1744 * level code, because it always knows that
1745 * the inode is locked and held in the
1746 * transaction that returns to it whether
1747 * errors occur or not. We don't mark the
1748 * inode dirty so that this transaction can
1749 * be easily aborted if possible.
1750 */
1751 xfs_trans_ijoin(ntp, ip,
1752 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1753 xfs_trans_ihold(ntp, ip);
1754 }
1755 return error;
1756 }
1757
1758 if (committed) {
1759 /*
1760 * The first xact was committed,
1761 * so add the inode to the new one.
1762 * Mark it dirty so it will be logged
1763 * and moved forward in the log as
1764 * part of every commit.
1765 */
1766 xfs_trans_ijoin(ntp, ip,
1767 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1768 xfs_trans_ihold(ntp, ip);
1769 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1770 }
1771 ntp = xfs_trans_dup(ntp);
1772 (void) xfs_trans_commit(*tp, 0, NULL);
1773 *tp = ntp;
1774 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1775 XFS_TRANS_PERM_LOG_RES,
1776 XFS_ITRUNCATE_LOG_COUNT);
1777 /*
1778 * Add the inode being truncated to the next chained
1779 * transaction.
1780 */
1781 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1782 xfs_trans_ihold(ntp, ip);
1783 if (error)
1784			return error;
1785 }
1786 /*
1787 * Only update the size in the case of the data fork, but
1788 * always re-log the inode so that our permanent transaction
1789 * can keep on rolling it forward in the log.
1790 */
1791 if (fork == XFS_DATA_FORK) {
1792 xfs_isize_check(mp, ip, new_size);
1793 ip->i_d.di_size = new_size;
1794 }
1795 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1796 ASSERT((new_size != 0) ||
1797 (fork == XFS_ATTR_FORK) ||
1798 (ip->i_delayed_blks == 0));
1799 ASSERT((new_size != 0) ||
1800 (fork == XFS_ATTR_FORK) ||
1801 (ip->i_d.di_nextents == 0));
1802 xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1803 return 0;
1804}
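
/*
 * Illustrative sketch of the calling convention described above: a
 * hypothetical truncate-to-zero caller, modelled loosely on the
 * inactive path.  The transaction type and the elided error handling
 * are assumptions for illustration, not a verbatim copy of any caller:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
 *	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
 *	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 *				  XFS_TRANS_PERM_LOG_RES,
 *				  XFS_ITRUNCATE_LOG_COUNT);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, sync);
 *	... commit or cancel tp, then drop both locks ...
 */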
1805
1806
1807/*
1808 * xfs_igrow_start
1809 *
1810 * Do the first part of growing a file: zero any data in the last
1811 * block that is beyond the old EOF. We need to do this before
1812 * the inode is joined to the transaction to modify the i_size.
1813 * That way we can drop the inode lock and call into the buffer
1814 * cache to get the buffer mapping the EOF.
1815 */
1816int
1817xfs_igrow_start(
1818 xfs_inode_t *ip,
1819 xfs_fsize_t new_size,
1820 cred_t *credp)
1821{
1822 xfs_fsize_t isize;
1823 int error;
1824
1825 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1826 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1827 ASSERT(new_size > ip->i_d.di_size);
1828
1829 error = 0;
1830 isize = ip->i_d.di_size;
1831 /*
1832 * Zero any pages that may have been created by
1833 * xfs_write_file() beyond the end of the file
1834 * and any blocks between the old and new file sizes.
1835 */
1836 error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
1837 new_size);
1838 return error;
1839}
1840
1841/*
1842 * xfs_igrow_finish
1843 *
1844 * This routine is called to extend the size of a file.
1845 * The inode must have both the iolock and the ilock locked
1846 * for update and it must be a part of the current transaction.
1847 * The xfs_igrow_start() function must have been called previously.
1848 * If the change_flag is not zero, the inode change timestamp will
1849 * be updated.
1850 */
1851void
1852xfs_igrow_finish(
1853 xfs_trans_t *tp,
1854 xfs_inode_t *ip,
1855 xfs_fsize_t new_size,
1856 int change_flag)
1857{
1858 ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1859 ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1860 ASSERT(ip->i_transp == tp);
1861 ASSERT(new_size > ip->i_d.di_size);
1862
1863 /*
1864 * Update the file size. Update the inode change timestamp
1865 * if change_flag set.
1866 */
1867 ip->i_d.di_size = new_size;
1868 if (change_flag)
1869 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1870 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1871
1872}
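
/*
 * Illustrative sketch of the grow protocol the two routines above
 * implement.  Lock ordering follows the ASSERTs above; the transaction
 * setup in the middle is an assumption for illustration:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	error = xfs_igrow_start(ip, new_size, credp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	... allocate a transaction and reserve log space ...
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	xfs_igrow_finish(tp, ip, new_size, 1);
 *	... commit tp and unlock ...
 */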
1873
1874
1875/*
1876 * This is called when the inode's link count goes to 0.
1877 * We place the on-disk inode on a list in the AGI. It
1878 * will be pulled from this list when the inode is freed.
1879 */
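/*
 * A sketch of the on-disk structure being maintained here (illustrative
 * only; the bucket is selected by agino % XFS_AGI_UNLINKED_BUCKETS):
 *
 *	agi_unlinked[bucket] --> ino A --> ino B --> ... --> NULLAGINO
 *
 * Each arrow is a di_next_unlinked field, and new inodes are inserted
 * at the head of the bucket's list.
 */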
1880int
1881xfs_iunlink(
1882 xfs_trans_t *tp,
1883 xfs_inode_t *ip)
1884{
1885 xfs_mount_t *mp;
1886 xfs_agi_t *agi;
1887 xfs_dinode_t *dip;
1888 xfs_buf_t *agibp;
1889 xfs_buf_t *ibp;
1890 xfs_agnumber_t agno;
1891 xfs_daddr_t agdaddr;
1892 xfs_agino_t agino;
1893 short bucket_index;
1894 int offset;
1895 int error;
1896 int agi_ok;
1897
1898 ASSERT(ip->i_d.di_nlink == 0);
1899 ASSERT(ip->i_d.di_mode != 0);
1900 ASSERT(ip->i_transp == tp);
1901
1902 mp = tp->t_mountp;
1903
1904 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1905 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1906
1907 /*
1908 * Get the agi buffer first. It ensures lock ordering
1909 * on the list.
1910 */
1911 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1912 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1913 if (error) {
1914 return error;
1915 }
1916 /*
1917 * Validate the magic number of the agi block.
1918 */
1919 agi = XFS_BUF_TO_AGI(agibp);
1920 agi_ok =
1921 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
1922 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
1923 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1924 XFS_RANDOM_IUNLINK))) {
1925 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1926 xfs_trans_brelse(tp, agibp);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /*
1930 * Get the index into the agi hash table for the
1931 * list this inode will go on.
1932 */
1933 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1934 ASSERT(agino != 0);
1935 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1936 ASSERT(agi->agi_unlinked[bucket_index]);
1937 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino);
1938
1939 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) {
1940 /*
1941 * There is already another inode in the bucket we need
1942 * to add ourselves to. Add us at the front of the list.
1943 * Here we put the head pointer into our next pointer,
1944 * and then we fall through to point the head at us.
1945 */
1946 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1947 if (error) {
1948 return error;
1949 }
1950 ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
1951 ASSERT(dip->di_next_unlinked);
1952 /* both on-disk, don't endian flip twice */
1953 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1954 offset = ip->i_boffset +
1955 offsetof(xfs_dinode_t, di_next_unlinked);
1956 xfs_trans_inode_buf(tp, ibp);
1957 xfs_trans_log_buf(tp, ibp, offset,
1958 (offset + sizeof(xfs_agino_t) - 1));
1959 xfs_inobp_check(mp, ibp);
1960 }
1961
1962 /*
1963 * Point the bucket head pointer at the inode being inserted.
1964 */
1965 ASSERT(agino != 0);
1966 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino);
1967 offset = offsetof(xfs_agi_t, agi_unlinked) +
1968 (sizeof(xfs_agino_t) * bucket_index);
1969 xfs_trans_log_buf(tp, agibp, offset,
1970 (offset + sizeof(xfs_agino_t) - 1));
1971 return 0;
1972}
1973
1974/*
1975 * Pull the on-disk inode from the AGI unlinked list.
1976 */
1977STATIC int
1978xfs_iunlink_remove(
1979 xfs_trans_t *tp,
1980 xfs_inode_t *ip)
1981{
1982 xfs_ino_t next_ino;
1983 xfs_mount_t *mp;
1984 xfs_agi_t *agi;
1985 xfs_dinode_t *dip;
1986 xfs_buf_t *agibp;
1987 xfs_buf_t *ibp;
1988 xfs_agnumber_t agno;
1989 xfs_daddr_t agdaddr;
1990 xfs_agino_t agino;
1991 xfs_agino_t next_agino;
1992 xfs_buf_t *last_ibp;
1993 xfs_dinode_t *last_dip;
1994 short bucket_index;
1995 int offset, last_offset;
1996 int error;
1997 int agi_ok;
1998
1999 /*
2000 * First pull the on-disk inode from the AGI unlinked list.
2001 */
2002 mp = tp->t_mountp;
2003
2004 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2005 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
2006
2007 /*
2008 * Get the agi buffer first. It ensures lock ordering
2009 * on the list.
2010 */
2011 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
2012 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
2013 if (error) {
2014 cmn_err(CE_WARN,
2015 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
2016 error, mp->m_fsname);
2017 return error;
2018 }
2019 /*
2020 * Validate the magic number of the agi block.
2021 */
2022 agi = XFS_BUF_TO_AGI(agibp);
2023 agi_ok =
2024 INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
2025 XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
2026 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
2027 XFS_RANDOM_IUNLINK_REMOVE))) {
2028 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
2029 mp, agi);
2030 xfs_trans_brelse(tp, agibp);
2031 cmn_err(CE_WARN,
2032 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
2033 mp->m_fsname);
2034 return XFS_ERROR(EFSCORRUPTED);
2035 }
2036 /*
2037 * Get the index into the agi hash table for the
2038 * list this inode will go on.
2039 */
2040 agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2041 ASSERT(agino != 0);
2042 bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2043 ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO);
2044 ASSERT(agi->agi_unlinked[bucket_index]);
2045
2046 if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) {
2047 /*
2048 * We're at the head of the list. Get the inode's
2049 * on-disk buffer to see if there is anyone after us
2050 * on the list. Only modify our next pointer if it
2051 * is not already NULLAGINO. This saves us the overhead
2052 * of dealing with the buffer when there is no need to
2053 * change it.
2054 */
2055 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
2056 if (error) {
2057 cmn_err(CE_WARN,
2058 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
2059 error, mp->m_fsname);
2060 return error;
2061 }
2062 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
2063 ASSERT(next_agino != 0);
2064 if (next_agino != NULLAGINO) {
2065 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
2066 offset = ip->i_boffset +
2067 offsetof(xfs_dinode_t, di_next_unlinked);
2068 xfs_trans_inode_buf(tp, ibp);
2069 xfs_trans_log_buf(tp, ibp, offset,
2070 (offset + sizeof(xfs_agino_t) - 1));
2071 xfs_inobp_check(mp, ibp);
2072 } else {
2073 xfs_trans_brelse(tp, ibp);
2074 }
2075 /*
2076 * Point the bucket head pointer at the next inode.
2077 */
2078 ASSERT(next_agino != 0);
2079 ASSERT(next_agino != agino);
2080 INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino);
2081 offset = offsetof(xfs_agi_t, agi_unlinked) +
2082 (sizeof(xfs_agino_t) * bucket_index);
2083 xfs_trans_log_buf(tp, agibp, offset,
2084 (offset + sizeof(xfs_agino_t) - 1));
2085 } else {
2086 /*
2087 * We need to search the list for the inode being freed.
2088 */
2089 next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT);
2090 last_ibp = NULL;
2091 while (next_agino != agino) {
2092 /*
2093 * If the last inode wasn't the one pointing to
2094 * us, then release its buffer since we're not
2095 * going to do anything with it.
2096 */
2097 if (last_ibp != NULL) {
2098 xfs_trans_brelse(tp, last_ibp);
2099 }
2100 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2101 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
2102 &last_ibp, &last_offset);
2103 if (error) {
2104 cmn_err(CE_WARN,
2105 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
2106 error, mp->m_fsname);
2107 return error;
2108 }
2109 next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
2110 ASSERT(next_agino != NULLAGINO);
2111 ASSERT(next_agino != 0);
2112 }
2113 /*
2114 * Now last_ibp points to the buffer previous to us on
2115 * the unlinked list. Pull us from the list.
2116 */
2117 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
2118 if (error) {
2119 cmn_err(CE_WARN,
2120 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
2121 error, mp->m_fsname);
2122 return error;
2123 }
2124 next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
2125 ASSERT(next_agino != 0);
2126 ASSERT(next_agino != agino);
2127 if (next_agino != NULLAGINO) {
2128 INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
2129 offset = ip->i_boffset +
2130 offsetof(xfs_dinode_t, di_next_unlinked);
2131 xfs_trans_inode_buf(tp, ibp);
2132 xfs_trans_log_buf(tp, ibp, offset,
2133 (offset + sizeof(xfs_agino_t) - 1));
2134 xfs_inobp_check(mp, ibp);
2135 } else {
2136 xfs_trans_brelse(tp, ibp);
2137 }
2138 /*
2139 * Point the previous inode on the list to the next inode.
2140 */
2141 INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
2142 ASSERT(next_agino != 0);
2143 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2144 xfs_trans_inode_buf(tp, last_ibp);
2145 xfs_trans_log_buf(tp, last_ibp, offset,
2146 (offset + sizeof(xfs_agino_t) - 1));
2147 xfs_inobp_check(mp, last_ibp);
2148 }
2149 return 0;
2150}
2151
2152static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
2153{
2154 return (((ip->i_itemp == NULL) ||
2155 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
2156 (ip->i_update_core == 0));
2157}
2158
2159void
2160xfs_ifree_cluster(
2161 xfs_inode_t *free_ip,
2162 xfs_trans_t *tp,
2163 xfs_ino_t inum)
2164{
2165 xfs_mount_t *mp = free_ip->i_mount;
2166 int blks_per_cluster;
2167 int nbufs;
2168 int ninodes;
2169 int i, j, found, pre_flushed;
2170 xfs_daddr_t blkno;
2171 xfs_buf_t *bp;
2172 xfs_ihash_t *ih;
2173 xfs_inode_t *ip, **ip_found;
2174 xfs_inode_log_item_t *iip;
2175 xfs_log_item_t *lip;
2176 SPLDECL(s);
2177
2178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2179 blks_per_cluster = 1;
2180 ninodes = mp->m_sb.sb_inopblock;
2181 nbufs = XFS_IALLOC_BLOCKS(mp);
2182 } else {
2183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2184 mp->m_sb.sb_blocksize;
2185 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2186 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2187 }
2188
2189 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
2190
2191 for (j = 0; j < nbufs; j++, inum += ninodes) {
2192 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2193 XFS_INO_TO_AGBNO(mp, inum));
2194
2195
2196 /*
2197		 * Look for each inode in memory and attempt to lock it;
2198		 * we can be racing with flush and tail pushing here.
2199		 * Any inode we get the locks on is added to an array of
2200		 * inode items to process later.
2201		 *
2202		 * To get the buffer lock, we could beat a flush
2203		 * or tail pushing thread to the lock here, in which
2204		 * case they will go looking for the inode buffer
2205		 * and fail; we need some other form of interlock
2206		 * here.
2207 */
2208 found = 0;
2209 for (i = 0; i < ninodes; i++) {
2210 ih = XFS_IHASH(mp, inum + i);
2211 read_lock(&ih->ih_lock);
2212 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
2213 if (ip->i_ino == inum + i)
2214 break;
2215 }
2216
2217 /* Inode not in memory or we found it already,
2218 * nothing to do
2219 */
2220 if (!ip || (ip->i_flags & XFS_ISTALE)) {
2221 read_unlock(&ih->ih_lock);
2222 continue;
2223 }
2224
2225 if (xfs_inode_clean(ip)) {
2226 read_unlock(&ih->ih_lock);
2227 continue;
2228 }
2229
2230 /* If we can get the locks then add it to the
2231 * list, otherwise by the time we get the bp lock
2232 * below it will already be attached to the
2233 * inode buffer.
2234 */
2235
2236			/* This inode will already be locked - by us; let's
2237 * keep it that way.
2238 */
2239
2240 if (ip == free_ip) {
2241 if (xfs_iflock_nowait(ip)) {
2242 ip->i_flags |= XFS_ISTALE;
2243
2244 if (xfs_inode_clean(ip)) {
2245 xfs_ifunlock(ip);
2246 } else {
2247 ip_found[found++] = ip;
2248 }
2249 }
2250 read_unlock(&ih->ih_lock);
2251 continue;
2252 }
2253
2254 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2255 if (xfs_iflock_nowait(ip)) {
2256 ip->i_flags |= XFS_ISTALE;
2257
2258 if (xfs_inode_clean(ip)) {
2259 xfs_ifunlock(ip);
2260 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2261 } else {
2262 ip_found[found++] = ip;
2263 }
2264 } else {
2265 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2266 }
2267 }
2268
2269 read_unlock(&ih->ih_lock);
2270 }
2271
2272 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2273 mp->m_bsize * blks_per_cluster,
2274 XFS_BUF_LOCK);
2275
2276 pre_flushed = 0;
2277 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
2278 while (lip) {
2279 if (lip->li_type == XFS_LI_INODE) {
2280 iip = (xfs_inode_log_item_t *)lip;
2281 ASSERT(iip->ili_logged == 1);
2282 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2283 AIL_LOCK(mp,s);
2284 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2285 AIL_UNLOCK(mp, s);
2286 iip->ili_inode->i_flags |= XFS_ISTALE;
2287 pre_flushed++;
2288 }
2289 lip = lip->li_bio_list;
2290 }
2291
2292 for (i = 0; i < found; i++) {
2293 ip = ip_found[i];
2294 iip = ip->i_itemp;
2295
2296 if (!iip) {
2297 ip->i_update_core = 0;
2298 xfs_ifunlock(ip);
2299 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2300 continue;
2301 }
2302
2303 iip->ili_last_fields = iip->ili_format.ilf_fields;
2304 iip->ili_format.ilf_fields = 0;
2305 iip->ili_logged = 1;
2306 AIL_LOCK(mp,s);
2307 iip->ili_flush_lsn = iip->ili_item.li_lsn;
2308 AIL_UNLOCK(mp, s);
2309
2310 xfs_buf_attach_iodone(bp,
2311 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2312 xfs_istale_done, (xfs_log_item_t *)iip);
2313 if (ip != free_ip) {
2314 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2315 }
2316 }
2317
2318 if (found || pre_flushed)
2319 xfs_trans_stale_inode_buf(tp, bp);
2320 xfs_trans_binval(tp, bp);
2321 }
2322
2323 kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
2324}
2325
2326/*
2327 * This is called to return an inode to the inode free list.
2328 * The inode should already be truncated to 0 length and have
2329 * no pages associated with it. This routine also assumes that
2330 * the inode is already a part of the transaction.
2331 *
2332 * The on-disk copy of the inode will have been added to the list
2333 * of unlinked inodes in the AGI. We need to remove the inode from
2334 * that list atomically with respect to freeing it here.
2335 */
2336int
2337xfs_ifree(
2338 xfs_trans_t *tp,
2339 xfs_inode_t *ip,
2340 xfs_bmap_free_t *flist)
2341{
2342 int error;
2343 int delete;
2344 xfs_ino_t first_ino;
2345
2346 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2347 ASSERT(ip->i_transp == tp);
2348 ASSERT(ip->i_d.di_nlink == 0);
2349 ASSERT(ip->i_d.di_nextents == 0);
2350 ASSERT(ip->i_d.di_anextents == 0);
2351 ASSERT((ip->i_d.di_size == 0) ||
2352 ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2353 ASSERT(ip->i_d.di_nblocks == 0);
2354
2355 /*
2356 * Pull the on-disk inode from the AGI unlinked list.
2357 */
2358 error = xfs_iunlink_remove(tp, ip);
2359 if (error != 0) {
2360 return error;
2361 }
2362
2363 error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2364 if (error != 0) {
2365 return error;
2366 }
2367 ip->i_d.di_mode = 0; /* mark incore inode as free */
2368 ip->i_d.di_flags = 0;
2369 ip->i_d.di_dmevmask = 0;
2370 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
2371 ip->i_df.if_ext_max =
2372 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2373 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2374 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2375 /*
2376 * Bump the generation count so no one will be confused
2377 * by reincarnations of this inode.
2378 */
2379 ip->i_d.di_gen++;
2380 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2381
2382 if (delete) {
2383 xfs_ifree_cluster(ip, tp, first_ino);
2384 }
2385
2386 return 0;
2387}
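
/*
 * Illustrative sketch of how a caller drives xfs_ifree(); the shape of
 * the surrounding transaction handling is an assumption for
 * illustration:
 *
 *	XFS_BMAP_INIT(&free_list, &first_block);
 *	error = xfs_ifree(tp, ip, &free_list);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &free_list, first_block,
 *					&committed);
 */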
2388
2389/*
2390 * Reallocate the space for if_broot based on the number of records
2391 * being added or deleted as indicated in rec_diff. Move the records
2392 * and pointers in if_broot to fit the new size. When shrinking this
2393 * will eliminate holes between the records and pointers created by
2394 * the caller. When growing this will create holes to be filled in
2395 * by the caller.
2396 *
2397 * The caller must not request to add more records than would fit in
2398 * the on-disk inode root. If the if_broot is currently NULL, then
2399	 * if we are adding records, one will be allocated.  The caller must also
2400 * not request that the number of records go below zero, although
2401 * it can go to zero.
2402 *
2403 * ip -- the inode whose if_broot area is changing
2404	 * rec_diff -- the change in the number of records, positive or negative,
2405	 *	 requested for the if_broot array.
2406 */
2407void
2408xfs_iroot_realloc(
2409 xfs_inode_t *ip,
2410 int rec_diff,
2411 int whichfork)
2412{
2413 int cur_max;
2414 xfs_ifork_t *ifp;
2415 xfs_bmbt_block_t *new_broot;
2416 int new_max;
2417 size_t new_size;
2418 char *np;
2419 char *op;
2420
2421 /*
2422 * Handle the degenerate case quietly.
2423 */
2424 if (rec_diff == 0) {
2425 return;
2426 }
2427
2428 ifp = XFS_IFORK_PTR(ip, whichfork);
2429 if (rec_diff > 0) {
2430 /*
2431 * If there wasn't any memory allocated before, just
2432 * allocate it now and get out.
2433 */
2434 if (ifp->if_broot_bytes == 0) {
2435 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2436 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2437 KM_SLEEP);
2438 ifp->if_broot_bytes = (int)new_size;
2439 return;
2440 }
2441
2442 /*
2443 * If there is already an existing if_broot, then we need
2444 * to realloc() it and shift the pointers to their new
2445 * location. The records don't change location because
2446 * they are kept butted up against the btree block header.
2447 */
2448 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2449 new_max = cur_max + rec_diff;
2450 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2451 ifp->if_broot = (xfs_bmbt_block_t *)
2452 kmem_realloc(ifp->if_broot,
2453 new_size,
2454 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2455 KM_SLEEP);
2456 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2457 ifp->if_broot_bytes);
2458 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2459 (int)new_size);
2460 ifp->if_broot_bytes = (int)new_size;
2461 ASSERT(ifp->if_broot_bytes <=
2462 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2463 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2464 return;
2465 }
2466
2467 /*
2468 * rec_diff is less than 0. In this case, we are shrinking the
2469 * if_broot buffer. It must already exist. If we go to zero
2470 * records, just get rid of the root and clear the status bit.
2471 */
2472 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2473 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2474 new_max = cur_max + rec_diff;
2475 ASSERT(new_max >= 0);
2476 if (new_max > 0)
2477 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2478 else
2479 new_size = 0;
2480 if (new_size > 0) {
2481 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
2482 /*
2483 * First copy over the btree block header.
2484 */
2485 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
2486 } else {
2487 new_broot = NULL;
2488 ifp->if_flags &= ~XFS_IFBROOT;
2489 }
2490
2491 /*
2492 * Only copy the records and pointers if there are any.
2493 */
2494 if (new_max > 0) {
2495 /*
2496 * First copy the records.
2497 */
2498 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
2499 ifp->if_broot_bytes);
2500 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2501 (int)new_size);
2502 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2503
2504 /*
2505 * Then copy the pointers.
2506 */
2507 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2508 ifp->if_broot_bytes);
2509 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
2510 (int)new_size);
2511 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2512 }
2513 kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2514 ifp->if_broot = new_broot;
2515 ifp->if_broot_bytes = (int)new_size;
2516 ASSERT(ifp->if_broot_bytes <=
2517 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2518 return;
2519}
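
/*
 * For example (hypothetical call sites, for illustration only): the
 * bmap btree code grows the incore root by one record with
 * xfs_iroot_realloc(ip, 1, whichfork) and shrinks it again with
 * xfs_iroot_realloc(ip, -1, whichfork).
 */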
2520
2521
2522/*
2523 * This is called when the amount of space needed for if_extents
2524 * is increased or decreased. The change in size is indicated by
2525 * the number of extents that need to be added or deleted in the
2526 * ext_diff parameter.
2527 *
2528 * If the amount of space needed has decreased below the size of the
2529 * inline buffer, then switch to using the inline buffer. Otherwise,
2530 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2531 * to what is needed.
2532 *
2533 * ip -- the inode whose if_extents area is changing
2534 * ext_diff -- the change in the number of extents, positive or negative,
2535 * requested for the if_extents array.
2536 */
2537void
2538xfs_iext_realloc(
2539 xfs_inode_t *ip,
2540 int ext_diff,
2541 int whichfork)
2542{
2543 int byte_diff;
2544 xfs_ifork_t *ifp;
2545 int new_size;
2546 uint rnew_size;
2547
2548 if (ext_diff == 0) {
2549 return;
2550 }
2551
2552 ifp = XFS_IFORK_PTR(ip, whichfork);
2553 byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
2554 new_size = (int)ifp->if_bytes + byte_diff;
2555 ASSERT(new_size >= 0);
2556
2557 if (new_size == 0) {
2558 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2559 ASSERT(ifp->if_real_bytes != 0);
2560 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2561 }
2562 ifp->if_u1.if_extents = NULL;
2563 rnew_size = 0;
2564 } else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
2565 /*
2566 * If the valid extents can fit in if_inline_ext,
2567 * copy them from the malloc'd vector and free it.
2568 */
2569 if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2570 /*
2571 * For now, empty files are format EXTENTS,
2572 * so the if_extents pointer is null.
2573 */
2574 if (ifp->if_u1.if_extents) {
2575 memcpy(ifp->if_u2.if_inline_ext,
2576 ifp->if_u1.if_extents, new_size);
2577 kmem_free(ifp->if_u1.if_extents,
2578 ifp->if_real_bytes);
2579 }
2580 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2581 }
2582 rnew_size = 0;
2583 } else {
2584 rnew_size = new_size;
2585 if ((rnew_size & (rnew_size - 1)) != 0)
2586 rnew_size = xfs_iroundup(rnew_size);
2587 /*
2588 * Stuck with malloc/realloc.
2589 */
2590 if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
2591 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2592 kmem_alloc(rnew_size, KM_SLEEP);
2593 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
2594 sizeof(ifp->if_u2.if_inline_ext));
2595 } else if (rnew_size != ifp->if_real_bytes) {
2596 ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2597 kmem_realloc(ifp->if_u1.if_extents,
2598 rnew_size,
2599 ifp->if_real_bytes,
2600 KM_NOFS);
2601 }
2602 }
2603 ifp->if_real_bytes = rnew_size;
2604 ifp->if_bytes = new_size;
2605}
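
/*
 * For example (hypothetical call site, for illustration only): before
 * inserting one new record into the extent list, a caller makes room
 * with xfs_iext_realloc(ip, 1, whichfork) and then shifts the existing
 * records to open a hole for the new one.
 */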
2606
2607
2608/*
2609 * This is called when the amount of space needed for if_data
2610 * is increased or decreased. The change in size is indicated by
2611 * the number of bytes that need to be added or deleted in the
2612 * byte_diff parameter.
2613 *
2614 * If the amount of space needed has decreased below the size of the
2615 * inline buffer, then switch to using the inline buffer. Otherwise,
2616 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2617 * to what is needed.
2618 *
2619 * ip -- the inode whose if_data area is changing
2620 * byte_diff -- the change in the number of bytes, positive or negative,
2621 * requested for the if_data array.
2622 */
2623void
2624xfs_idata_realloc(
2625 xfs_inode_t *ip,
2626 int byte_diff,
2627 int whichfork)
2628{
2629 xfs_ifork_t *ifp;
2630 int new_size;
2631 int real_size;
2632
2633 if (byte_diff == 0) {
2634 return;
2635 }
2636
2637 ifp = XFS_IFORK_PTR(ip, whichfork);
2638 new_size = (int)ifp->if_bytes + byte_diff;
2639 ASSERT(new_size >= 0);
2640
2641 if (new_size == 0) {
2642 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2643 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2644 }
2645 ifp->if_u1.if_data = NULL;
2646 real_size = 0;
2647 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2648 /*
2649		 * If the valid data can fit in if_inline_data,
2650		 * copy it from the malloc'd buffer and free that buffer.
2651 */
2652 if (ifp->if_u1.if_data == NULL) {
2653 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2654 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2655 ASSERT(ifp->if_real_bytes != 0);
2656 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2657 new_size);
2658 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2659 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2660 }
2661 real_size = 0;
2662 } else {
2663 /*
2664 * Stuck with malloc/realloc.
2665 * For inline data, the underlying buffer must be
2666 * a multiple of 4 bytes in size so that it can be
2667 * logged and stay on word boundaries. We enforce
2668 * that here.
2669 */
2670 real_size = roundup(new_size, 4);
2671 if (ifp->if_u1.if_data == NULL) {
2672 ASSERT(ifp->if_real_bytes == 0);
2673 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2674 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2675 /*
2676 * Only do the realloc if the underlying size
2677 * is really changing.
2678 */
2679 if (ifp->if_real_bytes != real_size) {
2680 ifp->if_u1.if_data =
2681 kmem_realloc(ifp->if_u1.if_data,
2682 real_size,
2683 ifp->if_real_bytes,
2684 KM_SLEEP);
2685 }
2686 } else {
2687 ASSERT(ifp->if_real_bytes == 0);
2688 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2689 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2690 ifp->if_bytes);
2691 }
2692 }
2693 ifp->if_real_bytes = real_size;
2694 ifp->if_bytes = new_size;
2695 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2696}
2697
2698
2699
2700
2701/*
2702 * Map inode to disk block and offset.
2703 *
2704 * mp -- the mount point structure for the current file system
2705 * tp -- the current transaction
2706 * ino -- the inode number of the inode to be located
2707 * imap -- this structure is filled in with the information necessary
2708 * to retrieve the given inode from disk
2709	 * flags -- flags to pass to xfs_dilocate, indicating whether or not
2710	 *	 lookups in the inode btree are OK
2711 */
2712int
2713xfs_imap(
2714 xfs_mount_t *mp,
2715 xfs_trans_t *tp,
2716 xfs_ino_t ino,
2717 xfs_imap_t *imap,
2718 uint flags)
2719{
2720 xfs_fsblock_t fsbno;
2721 int len;
2722 int off;
2723 int error;
2724
2725 fsbno = imap->im_blkno ?
2726 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2727 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2728 if (error != 0) {
2729 return error;
2730 }
2731 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2732 imap->im_len = XFS_FSB_TO_BB(mp, len);
2733 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2734 imap->im_ioffset = (ushort)off;
2735 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2736 return 0;
2737}
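
/*
 * Illustrative use (an assumed caller shape, e.g. mapping an inode to
 * its buffer): zero im_blkno to force a fresh lookup, then read the
 * mapped range.
 *
 *	imap.im_blkno = 0;
 *	error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
 *	if (!error)
 *		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 *					   imap.im_blkno, (int)imap.im_len,
 *					   0, &bp);
 */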
2738
2739void
2740xfs_idestroy_fork(
2741 xfs_inode_t *ip,
2742 int whichfork)
2743{
2744 xfs_ifork_t *ifp;
2745
2746 ifp = XFS_IFORK_PTR(ip, whichfork);
2747 if (ifp->if_broot != NULL) {
2748 kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2749 ifp->if_broot = NULL;
2750 }
2751
2752 /*
2753 * If the format is local, then we can't have an extents
2754 * array so just look for an inline data array. If we're
2755 * not local then we may or may not have an extents list,
2756 * so check and free it up if we do.
2757 */
2758 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2759 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2760 (ifp->if_u1.if_data != NULL)) {
2761 ASSERT(ifp->if_real_bytes != 0);
2762 kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2763 ifp->if_u1.if_data = NULL;
2764 ifp->if_real_bytes = 0;
2765 }
2766 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2767 (ifp->if_u1.if_extents != NULL) &&
2768 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
2769 ASSERT(ifp->if_real_bytes != 0);
2770 kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2771 ifp->if_u1.if_extents = NULL;
2772 ifp->if_real_bytes = 0;
2773 }
2774 ASSERT(ifp->if_u1.if_extents == NULL ||
2775 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2776 ASSERT(ifp->if_real_bytes == 0);
2777 if (whichfork == XFS_ATTR_FORK) {
2778 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2779 ip->i_afp = NULL;
2780 }
2781}
2782
2783/*
2784	 * This is called to free all the memory associated with an inode.
2785 * It must free the inode itself and any buffers allocated for
2786 * if_extents/if_data and if_broot. It must also free the lock
2787 * associated with the inode.
2788 */
2789void
2790xfs_idestroy(
2791 xfs_inode_t *ip)
2792{
2793
2794 switch (ip->i_d.di_mode & S_IFMT) {
2795 case S_IFREG:
2796 case S_IFDIR:
2797 case S_IFLNK:
2798 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2799 break;
2800 }
2801 if (ip->i_afp)
2802 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2803 mrfree(&ip->i_lock);
2804 mrfree(&ip->i_iolock);
2805 freesema(&ip->i_flock);
2806#ifdef XFS_BMAP_TRACE
2807 ktrace_free(ip->i_xtrace);
2808#endif
2809#ifdef XFS_BMBT_TRACE
2810 ktrace_free(ip->i_btrace);
2811#endif
2812#ifdef XFS_RW_TRACE
2813 ktrace_free(ip->i_rwtrace);
2814#endif
2815#ifdef XFS_ILOCK_TRACE
2816 ktrace_free(ip->i_lock_trace);
2817#endif
2818#ifdef XFS_DIR2_TRACE
2819 ktrace_free(ip->i_dir_trace);
2820#endif
2821 if (ip->i_itemp) {
2822 /* XXXdpd should be able to assert this but shutdown
2823 * is leaving the AIL behind. */
2824 ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
2825 XFS_FORCED_SHUTDOWN(ip->i_mount));
2826 xfs_inode_item_destroy(ip);
2827 }
2828 kmem_zone_free(xfs_inode_zone, ip);
2829}
2830
2831
2832/*
2833	 * Increment the pin count of the given inode.
2834	 * The count is manipulated atomically (see ip->i_pincount).
2835 */
2836void
2837xfs_ipin(
2838 xfs_inode_t *ip)
2839{
2840 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2841
2842 atomic_inc(&ip->i_pincount);
2843}
2844
2845/*
2846	 * Decrement the pin count of the given inode, and wake up
2847	 * anyone in xfs_iunpin_wait() if the count goes to 0.  The
2848	 * inode must have been previously pinned with a call to xfs_ipin().
2849 */
2850void
2851xfs_iunpin(
2852 xfs_inode_t *ip)
2853{
2854 ASSERT(atomic_read(&ip->i_pincount) > 0);
2855
2856 if (atomic_dec_and_test(&ip->i_pincount)) {
2857 vnode_t *vp = XFS_ITOV_NULL(ip);
2858
2859 /* make sync come back and flush this inode */
2860 if (vp) {
2861 struct inode *inode = LINVFS_GET_IP(vp);
2862
2863 if (!(inode->i_state & I_NEW))
2864 mark_inode_dirty_sync(inode);
2865 }
2866
2867 wake_up(&ip->i_ipin_wait);
2868 }
2869}
2870
2871/*
2872 * This is called to wait for the given inode to be unpinned.
2873 * It will sleep until this happens. The caller must have the
2874	 * inode locked in at least shared mode so that the inode cannot
2875	 * be subsequently pinned once someone is waiting for it to be
2876	 * unpinned.
2877 */
2878void
2879xfs_iunpin_wait(
2880 xfs_inode_t *ip)
2881{
2882 xfs_inode_log_item_t *iip;
2883 xfs_lsn_t lsn;
2884
2885 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2886
2887 if (atomic_read(&ip->i_pincount) == 0) {
2888 return;
2889 }
2890
2891 iip = ip->i_itemp;
2892 if (iip && iip->ili_last_lsn) {
2893 lsn = iip->ili_last_lsn;
2894 } else {
2895 lsn = (xfs_lsn_t)0;
2896 }
2897
2898 /*
2899 * Give the log a push so we don't wait here too long.
2900 */
2901 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
2902
2903 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2904}
2905
2906
2907/*
2908 * xfs_iextents_copy()
2909 *
2910 * This is called to copy the REAL extents (as opposed to the delayed
2911 * allocation extents) from the inode into the given buffer. It
2912 * returns the number of bytes copied into the buffer.
2913 *
2914	 * We examine each extent in turn and skip those which are
2915	 * delayed allocations, so that only the real extents end up
2916	 * in the buffer.
2917 */
2918int
2919xfs_iextents_copy(
2920 xfs_inode_t *ip,
2921 xfs_bmbt_rec_t *buffer,
2922 int whichfork)
2923{
2924 int copied;
2925 xfs_bmbt_rec_t *dest_ep;
2926 xfs_bmbt_rec_t *ep;
2927#ifdef XFS_BMAP_TRACE
2928 static char fname[] = "xfs_iextents_copy";
2929#endif
2930 int i;
2931 xfs_ifork_t *ifp;
2932 int nrecs;
2933 xfs_fsblock_t start_block;
2934
2935 ifp = XFS_IFORK_PTR(ip, whichfork);
2936 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
2937 ASSERT(ifp->if_bytes > 0);
2938
2939 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2940 xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
2941 ASSERT(nrecs > 0);
2942
2943 /*
2944	 * Copy the extents one at a time, skipping any delayed
2945	 * allocation extents.  When the loop finishes, at least
2946	 * one real (non-delayed) extent must have been copied
2947	 * into the buffer.
2948 */
2949 ep = ifp->if_u1.if_extents;
2950 dest_ep = buffer;
2951 copied = 0;
2952 for (i = 0; i < nrecs; i++) {
2953 start_block = xfs_bmbt_get_startblock(ep);
2954 if (ISNULLSTARTBLOCK(start_block)) {
2955 /*
2956 * It's a delayed allocation extent, so skip it.
2957 */
2958 ep++;
2959 continue;
2960 }
2961
2962 /* Translate to on disk format */
2963 put_unaligned(INT_GET(ep->l0, ARCH_CONVERT),
2964 (__uint64_t*)&dest_ep->l0);
2965 put_unaligned(INT_GET(ep->l1, ARCH_CONVERT),
2966 (__uint64_t*)&dest_ep->l1);
2967 dest_ep++;
2968 ep++;
2969 copied++;
2970 }
2971 ASSERT(copied != 0);
2972 xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip));
2973
2974 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2975}
2976
2977/*
2978 * Each of the following cases stores data into the same region
2979 * of the on-disk inode, so only one of them can be valid at
2980 * any given time. While it is possible to have conflicting formats
2981 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2982 * in EXTENTS format, this can only happen when the fork has
2983 * changed formats after being modified but before being flushed.
2984 * In these cases, the format always takes precedence, because the
2985 * format indicates the current state of the fork.
2986 */
2987/*ARGSUSED*/
2988STATIC int
2989xfs_iflush_fork(
2990 xfs_inode_t *ip,
2991 xfs_dinode_t *dip,
2992 xfs_inode_log_item_t *iip,
2993 int whichfork,
2994 xfs_buf_t *bp)
2995{
2996 char *cp;
2997 xfs_ifork_t *ifp;
2998 xfs_mount_t *mp;
2999#ifdef XFS_TRANS_DEBUG
3000 int first;
3001#endif
3002 static const short brootflag[2] =
3003 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
3004 static const short dataflag[2] =
3005 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
3006 static const short extflag[2] =
3007 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
3008
3009 if (iip == NULL)
3010 return 0;
3011 ifp = XFS_IFORK_PTR(ip, whichfork);
3012 /*
3013 * This can happen if we gave up in iformat in an error path,
3014 * for the attribute fork.
3015 */
3016 if (ifp == NULL) {
3017 ASSERT(whichfork == XFS_ATTR_FORK);
3018 return 0;
3019 }
3020 cp = XFS_DFORK_PTR(dip, whichfork);
3021 mp = ip->i_mount;
3022 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
3023 case XFS_DINODE_FMT_LOCAL:
3024 if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
3025 (ifp->if_bytes > 0)) {
3026 ASSERT(ifp->if_u1.if_data != NULL);
3027 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
3028 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
3029 }
3030 if (whichfork == XFS_DATA_FORK) {
3031 if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
3032 XFS_ERROR_REPORT("xfs_iflush_fork",
3033 XFS_ERRLEVEL_LOW, mp);
3034 return XFS_ERROR(EFSCORRUPTED);
3035 }
3036 }
3037 break;
3038
3039 case XFS_DINODE_FMT_EXTENTS:
3040 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
3041 !(iip->ili_format.ilf_fields & extflag[whichfork]));
3042 ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
3043 ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
3044 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
3045 (ifp->if_bytes > 0)) {
3046 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
3047 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
3048 whichfork);
3049 }
3050 break;
3051
3052 case XFS_DINODE_FMT_BTREE:
3053 if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
3054 (ifp->if_broot_bytes > 0)) {
3055 ASSERT(ifp->if_broot != NULL);
3056 ASSERT(ifp->if_broot_bytes <=
3057 (XFS_IFORK_SIZE(ip, whichfork) +
3058 XFS_BROOT_SIZE_ADJ));
3059 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
3060 (xfs_bmdr_block_t *)cp,
3061 XFS_DFORK_SIZE(dip, mp, whichfork));
3062 }
3063 break;
3064
3065 case XFS_DINODE_FMT_DEV:
3066 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
3067 ASSERT(whichfork == XFS_DATA_FORK);
3068 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
3069 }
3070 break;
3071
3072 case XFS_DINODE_FMT_UUID:
3073 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
3074 ASSERT(whichfork == XFS_DATA_FORK);
3075 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
3076 sizeof(uuid_t));
3077 }
3078 break;
3079
3080 default:
3081 ASSERT(0);
3082 break;
3083 }
3084
3085 return 0;
3086}
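
The comment before xfs_iflush_fork describes a tagged union: the format value selects which interpretation of the shared on-disk area is live, and the flush writes only that one. A minimal sketch of the pattern, with illustrative types rather than the XFS ones:

#include <stdint.h>
#include <string.h>

enum fmt { FMT_LOCAL, FMT_DEV, FMT_UUID };

/* The shared on-disk area: only one union member is live at a time. */
struct ondisk {
	union {
		char		data[32];	/* inline file data */
		uint32_t	rdev;		/* device number */
		unsigned char	uuid[16];	/* mount point uuid */
	} u;
};

struct incore {
	enum fmt	format;		/* says which member is live */
	const char	*data;
	size_t		bytes;
	uint32_t	rdev;
	unsigned char	uuid[16];
};

/* The format tag takes precedence: switch on it and write only the
 * matching interpretation of the shared area. */
int flush_fork(struct ondisk *d, const struct incore *i)
{
	switch (i->format) {
	case FMT_LOCAL:
		if (i->bytes > sizeof(d->u.data))
			return -1;	/* corrupt in-core state */
		memcpy(d->u.data, i->data, i->bytes);
		break;
	case FMT_DEV:
		d->u.rdev = i->rdev;
		break;
	case FMT_UUID:
		memcpy(d->u.uuid, i->uuid, sizeof(d->u.uuid));
		break;
	}
	return 0;
}
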
3087
3088/*
3089 * xfs_iflush() will write a modified inode's changes out to the
3090 * inode's on disk home. The caller must have the inode lock held
3091 * in at least shared mode and the inode flush semaphore must be
3092 * held as well. The inode lock will still be held upon return from
3093 * the call and the caller is free to unlock it.
3094 * The inode flush lock will be unlocked when the inode reaches the disk.
3095 * The flags indicate how the inode's buffer should be written out.
3096 */
3097int
3098xfs_iflush(
3099 xfs_inode_t *ip,
3100 uint flags)
3101{
3102 xfs_inode_log_item_t *iip;
3103 xfs_buf_t *bp;
3104 xfs_dinode_t *dip;
3105 xfs_mount_t *mp;
3106 int error;
3107 /* REFERENCED */
3108 xfs_chash_t *ch;
3109 xfs_inode_t *iq;
3110 int clcount; /* count of inodes clustered */
3111 int bufwasdelwri;
3112 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3113 SPLDECL(s);
3114
3115 XFS_STATS_INC(xs_iflush_count);
3116
3117 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3118 ASSERT(valusema(&ip->i_flock) <= 0);
3119 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3120 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3121
3122 iip = ip->i_itemp;
3123 mp = ip->i_mount;
3124
3125 /*
3126 * If the inode isn't dirty, then just release the inode
3127 * flush lock and do nothing.
3128 */
3129 if ((ip->i_update_core == 0) &&
3130 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3131 ASSERT((iip != NULL) ?
3132 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3133 xfs_ifunlock(ip);
3134 return 0;
3135 }
3136
3137 /*
3138 * We can't flush the inode until it is unpinned, so
3139	 * wait for it.  We know no one new can pin it, because
3140 * we are holding the inode lock shared and you need
3141 * to hold it exclusively to pin the inode.
3142 */
3143 xfs_iunpin_wait(ip);
3144
3145 /*
3146 * This may have been unpinned because the filesystem is shutting
3147 * down forcibly. If that's the case we must not write this inode
3148 * to disk, because the log record didn't make it to disk!
3149 */
3150 if (XFS_FORCED_SHUTDOWN(mp)) {
3151 ip->i_update_core = 0;
3152 if (iip)
3153 iip->ili_format.ilf_fields = 0;
3154 xfs_ifunlock(ip);
3155 return XFS_ERROR(EIO);
3156 }
3157
3158 /*
3159 * Get the buffer containing the on-disk inode.
3160 */
3161 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3162 if (error != 0) {
3163 xfs_ifunlock(ip);
3164 return error;
3165 }
3166
3167 /*
3168	 * Decide how the buffer will be flushed out.  This is done before
3169	 * the call to xfs_iflush_int because ilf_fields is zeroed by it.
3170 */
3171 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3172 /*
3173 * Flush out the inode buffer according to the directions
3174 * of the caller. In the cases where the caller has given
3175		 * us a choice, choose the non-delwri case.  This is because
3176 * the inode is in the AIL and we need to get it out soon.
3177 */
3178 switch (flags) {
3179 case XFS_IFLUSH_SYNC:
3180 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3181 flags = 0;
3182 break;
3183 case XFS_IFLUSH_ASYNC:
3184 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3185 flags = INT_ASYNC;
3186 break;
3187 case XFS_IFLUSH_DELWRI:
3188 flags = INT_DELWRI;
3189 break;
3190 default:
3191 ASSERT(0);
3192 flags = 0;
3193 break;
3194 }
3195 } else {
3196 switch (flags) {
3197 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3198 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3199 case XFS_IFLUSH_DELWRI:
3200 flags = INT_DELWRI;
3201 break;
3202 case XFS_IFLUSH_ASYNC:
3203 flags = INT_ASYNC;
3204 break;
3205 case XFS_IFLUSH_SYNC:
3206 flags = 0;
3207 break;
3208 default:
3209 ASSERT(0);
3210 flags = 0;
3211 break;
3212 }
3213 }
3214
3215 /*
3216 * First flush out the inode that xfs_iflush was called with.
3217 */
3218 error = xfs_iflush_int(ip, bp);
3219 if (error) {
3220 goto corrupt_out;
3221 }
3222
3223 /*
3224 * inode clustering:
3225 * see if other inodes can be gathered into this write
3226 */
3227
3228 ip->i_chash->chl_buf = bp;
3229
3230 ch = XFS_CHASH(mp, ip->i_blkno);
3231 s = mutex_spinlock(&ch->ch_lock);
3232
3233 clcount = 0;
3234 for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
3235 /*
3236		 * Do an unprotected check to see if the inode is dirty and
3237 * is a candidate for flushing. These checks will be repeated
3238 * later after the appropriate locks are acquired.
3239 */
3240 iip = iq->i_itemp;
3241 if ((iq->i_update_core == 0) &&
3242 ((iip == NULL) ||
3243 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
3244 xfs_ipincount(iq) == 0) {
3245 continue;
3246 }
3247
3248 /*
3249 * Try to get locks. If any are unavailable,
3250 * then this inode cannot be flushed and is skipped.
3251 */
3252
3253 /* get inode locks (just i_lock) */
3254 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
3255 /* get inode flush lock */
3256 if (xfs_iflock_nowait(iq)) {
3257 /* check if pinned */
3258 if (xfs_ipincount(iq) == 0) {
3259				/* Arriving here means that
3260				 * this inode can be flushed.
3261				 * First, re-check that it's
3262				 * dirty.
3263 */
3264 iip = iq->i_itemp;
3265				if ((iq->i_update_core != 0) ||
3266 ((iip != NULL) &&
3267 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3268 clcount++;
3269 error = xfs_iflush_int(iq, bp);
3270 if (error) {
3271 xfs_iunlock(iq,
3272 XFS_ILOCK_SHARED);
3273 goto cluster_corrupt_out;
3274 }
3275 } else {
3276 xfs_ifunlock(iq);
3277 }
3278 } else {
3279 xfs_ifunlock(iq);
3280 }
3281 }
3282 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3283 }
3284 }
3285 mutex_spinunlock(&ch->ch_lock, s);
3286
3287 if (clcount) {
3288 XFS_STATS_INC(xs_icluster_flushcnt);
3289 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3290 }
3291
3292 /*
3293 * If the buffer is pinned then push on the log so we won't
3294 * get stuck waiting in the write for too long.
3295 */
3296	if (XFS_BUF_ISPINNED(bp)) {
3297 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3298 }
3299
3300 if (flags & INT_DELWRI) {
3301 xfs_bdwrite(mp, bp);
3302 } else if (flags & INT_ASYNC) {
3303 xfs_bawrite(mp, bp);
3304 } else {
3305 error = xfs_bwrite(mp, bp);
3306 }
3307 return error;
3308
3309corrupt_out:
3310 xfs_buf_relse(bp);
3311 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3312 xfs_iflush_abort(ip);
3313 /*
3314 * Unlocks the flush lock
3315 */
3316 return XFS_ERROR(EFSCORRUPTED);
3317
3318cluster_corrupt_out:
3319 /* Corruption detected in the clustering loop. Invalidate the
3320 * inode buffer and shut down the filesystem.
3321 */
3322 mutex_spinunlock(&ch->ch_lock, s);
3323
3324 /*
3325 * Clean up the buffer. If it was B_DELWRI, just release it --
3326 * brelse can handle it with no problems. If not, shut down the
3327 * filesystem before releasing the buffer.
3328 */
3329	if ((bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp))) {
3330 xfs_buf_relse(bp);
3331 }
3332
3333 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3334
3335	if (!bufwasdelwri) {
3336 /*
3337 * Just like incore_relse: if we have b_iodone functions,
3338 * mark the buffer as an error and call them. Otherwise
3339 * mark it as stale and brelse.
3340 */
3341 if (XFS_BUF_IODONE_FUNC(bp)) {
3342 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3343 XFS_BUF_UNDONE(bp);
3344 XFS_BUF_STALE(bp);
3345 XFS_BUF_SHUT(bp);
3346			XFS_BUF_ERROR(bp, EIO);
3347 xfs_biodone(bp);
3348 } else {
3349 XFS_BUF_STALE(bp);
3350 xfs_buf_relse(bp);
3351 }
3352 }
3353
3354 xfs_iflush_abort(iq);
3355 /*
3356 * Unlocks the flush lock
3357 */
3358 return XFS_ERROR(EFSCORRUPTED);
3359}
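
For reference, the two flag-mapping switches inside xfs_iflush reduce to a small decision table: when the inode has logged dirty fields it is in the AIL, so the DELWRI_ELSE_* requests resolve to the non-delwri mode; otherwise they resolve to delwri. A sketch of that table as a pure function, with illustrative names:

enum req  { REQ_SYNC, REQ_ASYNC, REQ_DELWRI,
	    REQ_DELWRI_ELSE_SYNC, REQ_DELWRI_ELSE_ASYNC };
enum mode { MODE_SYNC, MODE_ASYNC, MODE_DELWRI };

/* inode_logged is true when ilf_fields != 0, i.e. the inode is in the
 * AIL; the "else" requests then take the non-delwri branch. */
static enum mode pick_write_mode(enum req req, int inode_logged)
{
	switch (req) {
	case REQ_SYNC:			return MODE_SYNC;
	case REQ_ASYNC:			return MODE_ASYNC;
	case REQ_DELWRI:		return MODE_DELWRI;
	case REQ_DELWRI_ELSE_SYNC:	return inode_logged ? MODE_SYNC
							    : MODE_DELWRI;
	case REQ_DELWRI_ELSE_ASYNC:	return inode_logged ? MODE_ASYNC
							    : MODE_DELWRI;
	}
	return MODE_SYNC;		/* unreachable for valid requests */
}
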
3360
3361
3362STATIC int
3363xfs_iflush_int(
3364 xfs_inode_t *ip,
3365 xfs_buf_t *bp)
3366{
3367 xfs_inode_log_item_t *iip;
3368 xfs_dinode_t *dip;
3369 xfs_mount_t *mp;
3370#ifdef XFS_TRANS_DEBUG
3371 int first;
3372#endif
3373 SPLDECL(s);
3374
3375 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3376 ASSERT(valusema(&ip->i_flock) <= 0);
3377 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3378 ip->i_d.di_nextents > ip->i_df.if_ext_max);
3379
3380 iip = ip->i_itemp;
3381 mp = ip->i_mount;
3382
3383
3384 /*
3385 * If the inode isn't dirty, then just release the inode
3386 * flush lock and do nothing.
3387 */
3388 if ((ip->i_update_core == 0) &&
3389 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3390 xfs_ifunlock(ip);
3391 return 0;
3392 }
3393
3394	/* Point dip at the inode's location within the buffer. */
3395 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
3396
3397 /*
3398 * Clear i_update_core before copying out the data.
3399 * This is for coordination with our timestamp updates
3400 * that don't hold the inode lock. They will always
3401 * update the timestamps BEFORE setting i_update_core,
3402 * so if we clear i_update_core after they set it we
3403 * are guaranteed to see their updates to the timestamps.
3404 * I believe that this depends on strongly ordered memory
3405 * semantics, but we have that. We use the SYNCHRONIZE
3406 * macro to make sure that the compiler does not reorder
3407	 * the i_update_core store below the data copy that follows.
3408 */
3409 ip->i_update_core = 0;
3410 SYNCHRONIZE();
3411
3412 if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
3413 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3414 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3415 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3416 ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
3417 goto corrupt_out;
3418 }
3419 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3420 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3421 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3422 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3423 ip->i_ino, ip, ip->i_d.di_magic);
3424 goto corrupt_out;
3425 }
3426 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3427 if (XFS_TEST_ERROR(
3428 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3429 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3430 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3431 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3432 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3433 ip->i_ino, ip);
3434 goto corrupt_out;
3435 }
3436 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
3437 if (XFS_TEST_ERROR(
3438 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3439 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3440 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3441 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3442 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3443 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3444 ip->i_ino, ip);
3445 goto corrupt_out;
3446 }
3447 }
3448 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3449 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3450 XFS_RANDOM_IFLUSH_5)) {
3451 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3452 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3453 ip->i_ino,
3454 ip->i_d.di_nextents + ip->i_d.di_anextents,
3455 ip->i_d.di_nblocks,
3456 ip);
3457 goto corrupt_out;
3458 }
3459 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3460 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3461 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3462 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3463 ip->i_ino, ip->i_d.di_forkoff, ip);
3464 goto corrupt_out;
3465 }
3466 /*
3467 * bump the flush iteration count, used to detect flushes which
3468 * postdate a log record during recovery.
3469 */
3470
3471 ip->i_d.di_flushiter++;
3472
3473 /*
3474 * Copy the dirty parts of the inode into the on-disk
3475 * inode. We always copy out the core of the inode,
3476 * because if the inode is dirty at all the core must
3477 * be.
3478 */
3479 xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1);
3480
3481	/* Wrap it; we never let the log put out DI_MAX_FLUSH */
3482 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3483 ip->i_d.di_flushiter = 0;
3484
3485 /*
3486 * If this is really an old format inode and the superblock version
3487 * has not been updated to support only new format inodes, then
3488 * convert back to the old inode format. If the superblock version
3489 * has been updated, then make the conversion permanent.
3490 */
3491 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
3492 XFS_SB_VERSION_HASNLINK(&mp->m_sb));
3493 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3494 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
3495 /*
3496 * Convert it back.
3497 */
3498 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3499 INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
3500 } else {
3501 /*
3502 * The superblock version has already been bumped,
3503 * so just make the conversion to the new inode
3504 * format permanent.
3505 */
3506 ip->i_d.di_version = XFS_DINODE_VERSION_2;
3507 INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
3508 ip->i_d.di_onlink = 0;
3509 dip->di_core.di_onlink = 0;
3510 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3511 memset(&(dip->di_core.di_pad[0]), 0,
3512 sizeof(dip->di_core.di_pad));
3513 ASSERT(ip->i_d.di_projid == 0);
3514 }
3515 }
3516
3517 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
3518 goto corrupt_out;
3519 }
3520
3521 if (XFS_IFORK_Q(ip)) {
3522 /*
3523 * The only error from xfs_iflush_fork is on the data fork.
3524 */
3525 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3526 }
3527 xfs_inobp_check(mp, bp);
3528
3529 /*
3530 * We've recorded everything logged in the inode, so we'd
3531 * like to clear the ilf_fields bits so we don't log and
3532 * flush things unnecessarily. However, we can't stop
3533 * logging all this information until the data we've copied
3534 * into the disk buffer is written to disk. If we did we might
3535 * overwrite the copy of the inode in the log with all the
3536 * data after re-logging only part of it, and in the face of
3537 * a crash we wouldn't have all the data we need to recover.
3538 *
3539 * What we do is move the bits to the ili_last_fields field.
3540 * When logging the inode, these bits are moved back to the
3541 * ilf_fields field. In the xfs_iflush_done() routine we
3542 * clear ili_last_fields, since we know that the information
3543 * those bits represent is permanently on disk. As long as
3544 * the flush completes before the inode is logged again, then
3545 * both ilf_fields and ili_last_fields will be cleared.
3546 *
3547 * We can play with the ilf_fields bits here, because the inode
3548 * lock must be held exclusively in order to set bits there
3549 * and the flush lock protects the ili_last_fields bits.
3550 * Set ili_logged so the flush done
3551 * routine can tell whether or not to look in the AIL.
3552 * Also, store the current LSN of the inode so that we can tell
3553 * whether the item has moved in the AIL from xfs_iflush_done().
3554 * In order to read the lsn we need the AIL lock, because
3555 * it is a 64 bit value that cannot be read atomically.
3556 */
3557 if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3558 iip->ili_last_fields = iip->ili_format.ilf_fields;
3559 iip->ili_format.ilf_fields = 0;
3560 iip->ili_logged = 1;
3561
3562 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
3563 AIL_LOCK(mp,s);
3564 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3565 AIL_UNLOCK(mp, s);
3566
3567 /*
3568 * Attach the function xfs_iflush_done to the inode's
3569 * buffer. This will remove the inode from the AIL
3570 * and unlock the inode's flush lock when the inode is
3571 * completely written to disk.
3572 */
3573 xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3574 xfs_iflush_done, (xfs_log_item_t *)iip);
3575
3576 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3577 ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3578 } else {
3579 /*
3580 * We're flushing an inode which is not in the AIL and has
3581 * not been logged but has i_update_core set. For this
3582 * case we can use a B_DELWRI flush and immediately drop
3583 * the inode flush lock because we can avoid the whole
3584 * AIL state thing. It's OK to drop the flush lock now,
3585 * because we've already locked the buffer and to do anything
3586 * you really need both.
3587 */
3588 if (iip != NULL) {
3589 ASSERT(iip->ili_logged == 0);
3590 ASSERT(iip->ili_last_fields == 0);
3591 ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3592 }
3593 xfs_ifunlock(ip);
3594 }
3595
3596 return 0;
3597
3598corrupt_out:
3599 return XFS_ERROR(EFSCORRUPTED);
3600}
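
The long comment inside xfs_iflush_int describes a two-set handoff for the dirty-field bits. Stripped of the locking, the protocol looks roughly like this (illustrative names; the real code keeps these in ilf_fields and ili_last_fields under the inode and flush locks):

struct log_state {
	unsigned int	fields;		/* dirtied since the last flush started */
	unsigned int	last_fields;	/* bits covered by the in-flight buffer */
};

/* At flush time: hand the logged bits over to the in-flight I/O. */
static void flush_submit(struct log_state *s)
{
	s->last_fields = s->fields;
	s->fields = 0;
}

/* If the inode is logged again before the I/O completes, the in-flight
 * bits are logged again too, so a crash never loses fields. */
static void log_again(struct log_state *s, unsigned int newbits)
{
	s->fields |= s->last_fields | newbits;
}

/* At I/O completion: the handed-over bits are permanently on disk. */
static void flush_done(struct log_state *s)
{
	s->last_fields = 0;
}
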
3601
3602
3603/*
3604 * Flush all inactive inodes in mp. Return true if no user references
3605 * were found, false otherwise.
3606 */
3607int
3608xfs_iflush_all(
3609 xfs_mount_t *mp,
3610 int flag)
3611{
3612 int busy;
3613 int done;
3614 int purged;
3615 xfs_inode_t *ip;
3616 vmap_t vmap;
3617 vnode_t *vp;
3618
3619 busy = done = 0;
3620 while (!done) {
3621 purged = 0;
3622 XFS_MOUNT_ILOCK(mp);
3623 ip = mp->m_inodes;
3624 if (ip == NULL) {
3625 break;
3626 }
3627 do {
3628 /* Make sure we skip markers inserted by sync */
3629 if (ip->i_mount == NULL) {
3630 ip = ip->i_mnext;
3631 continue;
3632 }
3633
3634 /*
3635 * It's up to our caller to purge the root
3636 * and quota vnodes later.
3637 */
3638 vp = XFS_ITOV_NULL(ip);
3639
3640 if (!vp) {
3641 XFS_MOUNT_IUNLOCK(mp);
3642 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3643 purged = 1;
3644 break;
3645 }
3646
3647 if (vn_count(vp) != 0) {
3648 if (vn_count(vp) == 1 &&
3649 (ip == mp->m_rootip ||
3650 (mp->m_quotainfo &&
3651 (ip->i_ino == mp->m_sb.sb_uquotino ||
3652 ip->i_ino == mp->m_sb.sb_gquotino)))) {
3653
3654 ip = ip->i_mnext;
3655 continue;
3656 }
3657 if (!(flag & XFS_FLUSH_ALL)) {
3658 busy = 1;
3659 done = 1;
3660 break;
3661 }
3662 /*
3663 * Ignore busy inodes but continue flushing
3664 * others.
3665 */
3666 ip = ip->i_mnext;
3667 continue;
3668 }
3669 /*
3670 * Sample vp mapping while holding mp locked on MP
3671 * systems, so we don't purge a reclaimed or
3672			 * nonexistent vnode.  We break from the loop
3673			 * since we know that we modify the list by
3674			 * pulling ourselves from it in xfs_reclaim(),
3675			 * called via vn_purge() below.  The purged flag
3676			 * set below records that we left the loop via a
3677			 * purge rather than by reaching the end of the list.
3678 */
3679 VMAP(vp, vmap);
3680 XFS_MOUNT_IUNLOCK(mp);
3681
3682 vn_purge(vp, &vmap);
3683
3684 purged = 1;
3685 break;
3686 } while (ip != mp->m_inodes);
3687 /*
3688 * We need to distinguish between when we exit the loop
3689 * after a purge and when we simply hit the end of the
3690 * list. We can't use the (ip == mp->m_inodes) test,
3691 * because when we purge an inode at the start of the list
3692 * the next inode on the list becomes mp->m_inodes. That
3693 * would cause such a test to bail out early. The purged
3694 * variable tells us how we got out of the loop.
3695 */
3696 if (!purged) {
3697 done = 1;
3698 }
3699 }
3700 XFS_MOUNT_IUNLOCK(mp);
3701 return !busy;
3702}
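
The comment above explains the purged flag: purging can change the list head, so a restart after a purge must be distinguished from a completed pass. A standalone sketch of that restartable-scan pattern on a plain singly linked list (illustrative types; unlinked nodes are not freed here):

struct node { struct node *next; int busy; };

/* Unlink n from the list; may change *headp itself. */
static void unlink_node(struct node **headp, struct node *n)
{
	for (struct node **pp = headp; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			return;
		}
	}
}

/* Returns nonzero if no busy entries were seen. */
int purge_all_idle(struct node **headp)
{
	int done = 0, busy = 0;

	while (!done) {
		int purged = 0;

		for (struct node *n = *headp; n != NULL; n = n->next) {
			if (n->busy) {
				busy = 1;
				continue;	/* skip busy entries */
			}
			unlink_node(headp, n);	/* invalidates our cursor... */
			purged = 1;
			break;			/* ...so restart the scan */
		}
		if (!purged)
			done = 1;	/* a full pass purged nothing */
	}
	return !busy;
}
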
3703
3704
3705/*
3706 * xfs_iaccess: check accessibility of inode for mode.
3707 */
3708int
3709xfs_iaccess(
3710 xfs_inode_t *ip,
3711 mode_t mode,
3712 cred_t *cr)
3713{
3714 int error;
3715 mode_t orgmode = mode;
3716 struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip));
3717
3718 if (mode & S_IWUSR) {
3719 umode_t imode = inode->i_mode;
3720
3721 if (IS_RDONLY(inode) &&
3722 (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
3723 return XFS_ERROR(EROFS);
3724
3725 if (IS_IMMUTABLE(inode))
3726 return XFS_ERROR(EACCES);
3727 }
3728
3729 /*
3730 * If there's an Access Control List it's used instead of
3731 * the mode bits.
3732 */
3733 if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
3734 return error ? XFS_ERROR(error) : 0;
3735
3736 if (current_fsuid(cr) != ip->i_d.di_uid) {
3737 mode >>= 3;
3738 if (!in_group_p((gid_t)ip->i_d.di_gid))
3739 mode >>= 3;
3740 }
3741
3742 /*
3743 * If the DACs are ok we don't need any capability check.
3744 */
3745 if ((ip->i_d.di_mode & mode) == mode)
3746 return 0;
3747 /*
3748 * Read/write DACs are always overridable.
3749 * Executable DACs are overridable if at least one exec bit is set.
3750 */
3751 if (!(orgmode & S_IXUSR) ||
3752 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3753 if (capable_cred(cr, CAP_DAC_OVERRIDE))
3754 return 0;
3755
3756 if ((orgmode == S_IRUSR) ||
3757 (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) {
3758 if (capable_cred(cr, CAP_DAC_READ_SEARCH))
3759 return 0;
3760#ifdef NOISE
3761 cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
3762#endif /* NOISE */
3763 return XFS_ERROR(EACCES);
3764 }
3765 return XFS_ERROR(EACCES);
3766}
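
The two mode >>= 3 shifts in xfs_iaccess walk from the owner permission bits to the group bits and then to the "other" bits, since each class occupies three adjacent bits of the mode word. The classic check in self-contained form (simplified: no ACLs, capability overrides, or supplementary groups):

#include <sys/stat.h>

/* Returns nonzero if (uid, gid) may access a file whose mode bits are
 * file_mode, for the permissions requested in want (expressed in the
 * owner-class S_I?USR bits, as in xfs_iaccess). */
int unix_access_ok(mode_t file_mode, unsigned int file_uid,
		   unsigned int file_gid, unsigned int uid,
		   unsigned int gid, mode_t want)
{
	if (uid != file_uid) {
		want >>= 3;		/* not the owner: use the group bits */
		if (gid != file_gid)
			want >>= 3;	/* nor the group: use the other bits */
	}
	return (file_mode & want) == want;
}
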
3767
3768/*
3769 * xfs_iroundup: round up argument to next power of two
3770 */
3771uint
3772xfs_iroundup(
3773 uint v)
3774{
3775 int i;
3776 uint m;
3777
3778 if ((v & (v - 1)) == 0)
3779 return v;
3780 ASSERT((v & 0x80000000) == 0);
3781 if ((v & (v + 1)) == 0)
3782 return v + 1;
3783 for (i = 0, m = 1; i < 31; i++, m <<= 1) {
3784 if (v & m)
3785 continue;
3786 v |= m;
3787 if ((v & (v + 1)) == 0)
3788 return v + 1;
3789 }
3790 ASSERT(0);
3791	return 0;
3792}
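
xfs_iroundup probes bit positions one at a time; the usual branch-free alternative smears the highest set bit downward and adds one. Both agree for 0 < v < 2^31, returning v itself when v is already a power of two:

#include <stdint.h>

/* Round v up to the next power of two by smearing the top set bit
 * downward; exact powers of two map to themselves.  Valid for
 * 0 < v < 2^31 (v == 0 wraps around to 0). */
uint32_t roundup_pow2(uint32_t v)
{
	v--;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;	/* all bits below the highest set bit are now 1 */
	return v + 1;
}

/* e.g. roundup_pow2(5) == 8, roundup_pow2(8) == 8, roundup_pow2(33) == 64 */
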
3793
3794/*
3795 * Change the requested timestamp in the given inode.
3796 * We don't lock across timestamp updates, and we don't log them, but
3797 * we do record the fact that there is dirty information in core.
3798 *
3799 * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
3800 * with XFS_ICHGTIME_ACC to be sure that access time
3801 * update will take. Calling first with XFS_ICHGTIME_ACC
3802 * and then XFS_ICHGTIME_MOD may fail to modify the access
3803 *		timestamp if the filesystem is mounted "noatime".
3804 */
3805void
3806xfs_ichgtime(xfs_inode_t *ip,
3807 int flags)
3808{
3809 timespec_t tv;
3810 vnode_t *vp = XFS_ITOV(ip);
3811 struct inode *inode = LINVFS_GET_IP(vp);
3812
3813 /*
3814 * We're not supposed to change timestamps in readonly-mounted
3815 * filesystems. Throw it away if anyone asks us.
3816 */
3817 if (unlikely(vp->v_vfsp->vfs_flag & VFS_RDONLY))
3818 return;
3819
3820 /*
3821 * Don't update access timestamps on reads if mounted "noatime"
3822	 * Don't update access timestamps on reads if mounted "noatime".
3823 */
3824 if ((ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) &&
3825 ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG))
3826 == XFS_ICHGTIME_ACC))
3827 return;
3828
3829 nanotime(&tv);
3830 if (flags & XFS_ICHGTIME_MOD) {
3831 VN_MTIMESET(vp, &tv);
3832 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
3833 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
3834 }
3835 if (flags & XFS_ICHGTIME_ACC) {
3836 VN_ATIMESET(vp, &tv);
3837 ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
3838 ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
3839 }
3840 if (flags & XFS_ICHGTIME_CHG) {
3841 VN_CTIMESET(vp, &tv);
3842 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
3843 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
3844 }
3845
3846 /*
3847 * We update the i_update_core field _after_ changing
3848 * the timestamps in order to coordinate properly with
3849 * xfs_iflush() so that we don't lose timestamp updates.
3850 * This keeps us from having to hold the inode lock
3851 * while doing this. We use the SYNCHRONIZE macro to
3852 * ensure that the compiler does not reorder the update
3853	 * of i_update_core ahead of the timestamp updates above.
3854 */
3855 SYNCHRONIZE();
3856 ip->i_update_core = 1;
3857 if (!(inode->i_state & I_LOCK))
3858 mark_inode_dirty_sync(inode);
3859}
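
A sketch of the publish/consume ordering that SYNCHRONIZE enforces between xfs_ichgtime and the flush path, written with C11 atomics (the plain timespec copy is a simplification; a real implementation would need its own protection against torn reads):

#include <stdatomic.h>
#include <time.h>

struct tstate {
	struct timespec	ts;		/* the timestamp payload */
	atomic_int	update_core;	/* "payload is dirty" flag */
};

/* Updater (cf. xfs_ichgtime): write the data, then publish the flag. */
void change_time(struct tstate *s)
{
	timespec_get(&s->ts, TIME_UTC);
	atomic_thread_fence(memory_order_release);	/* cf. SYNCHRONIZE */
	atomic_store_explicit(&s->update_core, 1, memory_order_relaxed);
}

/* Flusher (cf. xfs_iflush_int): clear the flag, then read the data.
 * Any timestamp written after the clear re-sets the flag, so it is
 * picked up by the next flush rather than lost. */
void flush_timestamps(struct tstate *s, struct timespec *out)
{
	atomic_store_explicit(&s->update_core, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* cf. SYNCHRONIZE */
	*out = s->ts;
}
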
3860
3861#ifdef XFS_ILOCK_TRACE
3862ktrace_t *xfs_ilock_trace_buf;
3863
3864void
3865xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3866{
3867 ktrace_enter(ip->i_lock_trace,
3868 (void *)ip,
3869		     (void *)(unsigned long)lock, /* 1 = LOCK, 3 = UNLOCK, etc. */
3870 (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */
3871 (void *)ra, /* caller of ilock */
3872 (void *)(unsigned long)current_cpu(),
3873 (void *)(unsigned long)current_pid(),
3874 NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
3875}
3876#endif
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
new file mode 100644
index 000000000000..a53b1ccf6070
--- /dev/null
+++ b/fs/xfs/xfs_inode.h
@@ -0,0 +1,554 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INODE_H__
33#define __XFS_INODE_H__
34
35/*
36 * File incore extent information, present for each of data & attr forks.
37 */
38#define XFS_INLINE_EXTS 2
39#define XFS_INLINE_DATA 32
40typedef struct xfs_ifork {
41 int if_bytes; /* bytes in if_u1 */
42 int if_real_bytes; /* bytes allocated in if_u1 */
43 xfs_bmbt_block_t *if_broot; /* file's incore btree root */
44 short if_broot_bytes; /* bytes allocated for root */
45 unsigned char if_flags; /* per-fork flags */
46 unsigned char if_ext_max; /* max # of extent records */
47 xfs_extnum_t if_lastex; /* last if_extents used */
48 union {
49 xfs_bmbt_rec_t *if_extents; /* linear map file exts */
50 char *if_data; /* inline file data */
51 } if_u1;
52 union {
53 xfs_bmbt_rec_t if_inline_ext[XFS_INLINE_EXTS];
54 /* very small file extents */
55 char if_inline_data[XFS_INLINE_DATA];
56 /* very small file data */
57 xfs_dev_t if_rdev; /* dev number if special */
58 uuid_t if_uuid; /* mount point value */
59 } if_u2;
60} xfs_ifork_t;
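
The if_u1/if_u2 unions implement a common small-object scheme: a tiny payload lives inline in the structure, a larger one lives in separately allocated memory. A minimal sketch of the idea, with illustrative names (assumes a zero-initialized struct and that src does not alias the fork's current data):

#include <stdlib.h>
#include <string.h>

#define INLINE_BYTES 32			/* cf. XFS_INLINE_DATA */

struct fork {
	int	bytes;			/* bytes currently stored */
	char	*data;			/* points at heap or at inline_data */
	char	inline_data[INLINE_BYTES];
};

/* Store n bytes in the fork: inline when small, heap when large.
 * Returns 0 on success, -1 on allocation failure. */
int fork_set(struct fork *f, const char *src, int n)
{
	char *dst = (n <= INLINE_BYTES) ? f->inline_data : malloc(n);

	if (dst == NULL)
		return -1;
	memcpy(dst, src, n);
	if (f->data != NULL && f->data != f->inline_data)
		free(f->data);		/* drop any previous heap copy */
	f->data = dst;
	f->bytes = n;
	return 0;
}
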
61
62/*
63 * Flags for xfs_ichgtime().
64 */
65#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
66#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */
67#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */
68
69/*
70 * Per-fork incore inode flags.
71 */
72#define XFS_IFINLINE 0x0001 /* Inline data is read in */
73#define XFS_IFEXTENTS 0x0002 /* All extent pointers are read in */
74#define XFS_IFBROOT 0x0004 /* i_broot points to the bmap b-tree root */
75
76/*
77 * Flags for xfs_imap() and xfs_dilocate().
78 */
79#define XFS_IMAP_LOOKUP 0x1
80
81/*
82 * Maximum number of extent pointers in if_u1.if_extents.
83 */
84#define XFS_MAX_INCORE_EXTENTS 32768
85
86
87#ifdef __KERNEL__
88struct bhv_desc;
89struct cred;
90struct ktrace;
91struct vnode;
92struct xfs_buf;
93struct xfs_bmap_free;
94struct xfs_bmbt_irec;
95struct xfs_bmbt_block;
96struct xfs_inode;
97struct xfs_inode_log_item;
98struct xfs_mount;
99struct xfs_trans;
100struct xfs_dquot;
101
102#if defined(XFS_ILOCK_TRACE)
103#define XFS_ILOCK_KTRACE_SIZE 32
104extern ktrace_t *xfs_ilock_trace_buf;
105extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
106#else
107#define xfs_ilock_trace(i,n,f,ra)
108#endif
109
110/*
111 * This structure is used to communicate, from xfs_write_file() to
112 * xfs_strat_read(), which extents of a file were holes when a write
113 * started.  This is necessary so that we can know which blocks need
114 * to be zeroed when they are read in by xfs_strat_read() if they
115 * weren't allocated when the buffer given to xfs_strat_read() was
116 * mapped.
117 *
118 * We keep a list of these attached to the inode. The list is
119 * protected by the inode lock and the fact that the io lock is
120 * held exclusively by writers.
121 */
122typedef struct xfs_gap {
123 struct xfs_gap *xg_next;
124 xfs_fileoff_t xg_offset_fsb;
125 xfs_extlen_t xg_count_fsb;
126} xfs_gap_t;
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133
134typedef struct xfs_iocore {
135 void *io_obj; /* pointer to container
136 * inode or dcxvn structure */
137 struct xfs_mount *io_mount; /* fs mount struct ptr */
138#ifdef DEBUG
139	mrlock_t	*io_lock;	/* inode lock */
140 mrlock_t *io_iolock; /* inode IO lock */
141#endif
142
143 /* I/O state */
144 xfs_fsize_t io_new_size; /* sz when write completes */
145
146 /* Miscellaneous state. */
147 unsigned int io_flags; /* IO related flags */
148
149 /* DMAPI state */
150 dm_attrs_t io_dmattrs;
151
152} xfs_iocore_t;
153
154#define io_dmevmask io_dmattrs.da_dmevmask
155#define io_dmstate io_dmattrs.da_dmstate
156
157#define XFS_IO_INODE(io) ((xfs_inode_t *) ((io)->io_obj))
158#define XFS_IO_DCXVN(io) ((dcxvn_t *) ((io)->io_obj))
159
160/*
161 * Flags for the io_flags field above.
162 */
163
164#define XFS_IOCORE_RT 0x1
165
166/*
167 * xfs_iocore prototypes
168 */
169
170extern void xfs_iocore_inode_init(struct xfs_inode *);
171extern void xfs_iocore_inode_reinit(struct xfs_inode *);
172
173
174/*
175 * This is the type used in the xfs inode hash table.
176 * An array of these is allocated for each mounted
177 * file system to hash the inodes for that file system.
178 */
179typedef struct xfs_ihash {
180 struct xfs_inode *ih_next;
181 rwlock_t ih_lock;
182 uint ih_version;
183} xfs_ihash_t;
184
185#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))
186
187/*
188 * This is the xfs inode cluster hash. This hash is used by xfs_iflush to
189 * find inodes that share a cluster and can be flushed to disk at the same
190 * time.
191 */
192typedef struct xfs_chashlist {
193 struct xfs_chashlist *chl_next;
194 struct xfs_inode *chl_ip;
195 xfs_daddr_t chl_blkno; /* starting block number of
196 * the cluster */
197 struct xfs_buf *chl_buf; /* the inode buffer */
198} xfs_chashlist_t;
199
200typedef struct xfs_chash {
201 xfs_chashlist_t *ch_list;
202 lock_t ch_lock;
203} xfs_chash_t;
204
205#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
206
207
208/*
209 * This is the xfs in-core inode structure.
210 * Most of the on-disk inode is embedded in the i_d field.
211 *
212 * The extent pointers/inline file space, however, are managed
213 * separately. The memory for this information is pointed to by
214 * the if_u1 union, depending on the type of the data.
215 * This linearizes the array of extents for fast in-core
216 * access, and is used until the file's number of extents
217 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
218 * are accessed through the buffer cache.
219 *
220 * Other state kept in the in-core inode is used for identification,
221 * locking, transactional updating, etc of the inode.
222 *
223 * Generally, we do not want to hold the i_rlock while holding the
224 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
225 *
226 * xfs_iptr_t contains all the inode fields up to and including the
227 * i_mnext and i_mprev fields; it is used as a marker in the inode
228 * chain off the mount structure by xfs_sync calls.
229 */
230
231typedef struct {
232 struct xfs_ihash *ip_hash; /* pointer to hash header */
233 struct xfs_inode *ip_next; /* inode hash link forw */
234 struct xfs_inode *ip_mnext; /* next inode in mount list */
235 struct xfs_inode *ip_mprev; /* ptr to prev inode */
236 struct xfs_inode **ip_prevp; /* ptr to prev i_next */
237 struct xfs_mount *ip_mount; /* fs mount struct ptr */
238} xfs_iptr_t;
239
240typedef struct xfs_inode {
241 /* Inode linking and identification information. */
242 struct xfs_ihash *i_hash; /* pointer to hash header */
243 struct xfs_inode *i_next; /* inode hash link forw */
244 struct xfs_inode *i_mnext; /* next inode in mount list */
245 struct xfs_inode *i_mprev; /* ptr to prev inode */
246 struct xfs_inode **i_prevp; /* ptr to prev i_next */
247 struct xfs_mount *i_mount; /* fs mount struct ptr */
248 struct list_head i_reclaim; /* reclaim list */
249 struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/
250 struct xfs_dquot *i_udquot; /* user dquot */
251 struct xfs_dquot *i_gdquot; /* group dquot */
252
253 /* Inode location stuff */
254 xfs_ino_t i_ino; /* inode number (agno/agino)*/
255 xfs_daddr_t i_blkno; /* blkno of inode buffer */
256 ushort i_len; /* len of inode buffer */
257 ushort i_boffset; /* off of inode in buffer */
258
259 /* Extent information. */
260 xfs_ifork_t *i_afp; /* attribute fork pointer */
261 xfs_ifork_t i_df; /* data fork */
262
263 /* Transaction and locking information. */
264 struct xfs_trans *i_transp; /* ptr to owning transaction*/
265 struct xfs_inode_log_item *i_itemp; /* logging information */
266 mrlock_t i_lock; /* inode lock */
267 mrlock_t i_iolock; /* inode IO lock */
268 sema_t i_flock; /* inode flush lock */
269 atomic_t i_pincount; /* inode pin count */
270 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
271#ifdef HAVE_REFCACHE
272 struct xfs_inode **i_refcache; /* ptr to entry in ref cache */
273 struct xfs_inode *i_release; /* inode to unref */
274#endif
275 /* I/O state */
276 xfs_iocore_t i_iocore; /* I/O core */
277
278 /* Miscellaneous state. */
279 unsigned short i_flags; /* see defined flags below */
280 unsigned char i_update_core; /* timestamps/size is dirty */
281 unsigned char i_update_size; /* di_size field is dirty */
282 unsigned int i_gen; /* generation count */
283 unsigned int i_delayed_blks; /* count of delay alloc blks */
284
285 xfs_dinode_core_t i_d; /* most of ondisk inode */
286 xfs_chashlist_t *i_chash; /* cluster hash list header */
287 struct xfs_inode *i_cnext; /* cluster hash link forward */
288 struct xfs_inode *i_cprev; /* cluster hash link backward */
289
290 /* Trace buffers per inode. */
291#ifdef XFS_BMAP_TRACE
292 struct ktrace *i_xtrace; /* inode extent list trace */
293#endif
294#ifdef XFS_BMBT_TRACE
295 struct ktrace *i_btrace; /* inode bmap btree trace */
296#endif
297#ifdef XFS_RW_TRACE
298 struct ktrace *i_rwtrace; /* inode read/write trace */
299#endif
300#ifdef XFS_ILOCK_TRACE
301 struct ktrace *i_lock_trace; /* inode lock/unlock trace */
302#endif
303#ifdef XFS_DIR2_TRACE
304 struct ktrace *i_dir_trace; /* inode directory trace */
305#endif
306} xfs_inode_t;
307
308#endif /* __KERNEL__ */
309
310
311/*
312 * Fork handling.
313 */
314#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_PTR)
315xfs_ifork_t *xfs_ifork_ptr(xfs_inode_t *ip, int w);
316#define XFS_IFORK_PTR(ip,w) xfs_ifork_ptr(ip,w)
317#else
318#define XFS_IFORK_PTR(ip,w) ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp)
319#endif
320#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_Q)
321int xfs_ifork_q(xfs_inode_t *ip);
322#define XFS_IFORK_Q(ip) xfs_ifork_q(ip)
323#else
324#define XFS_IFORK_Q(ip) XFS_CFORK_Q(&(ip)->i_d)
325#endif
326#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_DSIZE)
327int xfs_ifork_dsize(xfs_inode_t *ip);
328#define XFS_IFORK_DSIZE(ip) xfs_ifork_dsize(ip)
329#else
330#define XFS_IFORK_DSIZE(ip) XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount)
331#endif
332#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_ASIZE)
333int xfs_ifork_asize(xfs_inode_t *ip);
334#define XFS_IFORK_ASIZE(ip) xfs_ifork_asize(ip)
335#else
336#define XFS_IFORK_ASIZE(ip) XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount)
337#endif
338#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_SIZE)
339int xfs_ifork_size(xfs_inode_t *ip, int w);
340#define XFS_IFORK_SIZE(ip,w) xfs_ifork_size(ip,w)
341#else
342#define XFS_IFORK_SIZE(ip,w) XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w)
343#endif
344#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FORMAT)
345int xfs_ifork_format(xfs_inode_t *ip, int w);
346#define XFS_IFORK_FORMAT(ip,w) xfs_ifork_format(ip,w)
347#else
348#define XFS_IFORK_FORMAT(ip,w) XFS_CFORK_FORMAT(&ip->i_d, w)
349#endif
350#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FMT_SET)
351void xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n);
352#define XFS_IFORK_FMT_SET(ip,w,n) xfs_ifork_fmt_set(ip,w,n)
353#else
354#define XFS_IFORK_FMT_SET(ip,w,n) XFS_CFORK_FMT_SET(&ip->i_d, w, n)
355#endif
356#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXTENTS)
357int xfs_ifork_nextents(xfs_inode_t *ip, int w);
358#define XFS_IFORK_NEXTENTS(ip,w) xfs_ifork_nextents(ip,w)
359#else
360#define XFS_IFORK_NEXTENTS(ip,w) XFS_CFORK_NEXTENTS(&ip->i_d, w)
361#endif
362#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXT_SET)
363void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n);
364#define XFS_IFORK_NEXT_SET(ip,w,n) xfs_ifork_next_set(ip,w,n)
365#else
366#define XFS_IFORK_NEXT_SET(ip,w,n) XFS_CFORK_NEXT_SET(&ip->i_d, w, n)
367#endif
368
369
370#ifdef __KERNEL__
371
372/*
373 * In-core inode flags.
374 */
375#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */
376#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */
377#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */
378#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */
379#define XFS_ISTALE 0x0010 /* inode has been staled */
380#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
381#define XFS_INEW 0x0040
382
383/*
384 * Flags for inode locking.
385 */
386#define XFS_IOLOCK_EXCL 0x001
387#define XFS_IOLOCK_SHARED 0x002
388#define XFS_ILOCK_EXCL 0x004
389#define XFS_ILOCK_SHARED 0x008
390#define XFS_IUNLOCK_NONOTIFY 0x010
391#define XFS_EXTENT_TOKEN_RD 0x040
392#define XFS_SIZE_TOKEN_RD 0x080
393#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD)
394#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */
395#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND)
396#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND)
397#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND)
398
399
400#define XFS_LOCK_MASK \
401 (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \
402 XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \
403 XFS_WILLLEND)
404
405/*
406 * Flags for xfs_iflush()
407 */
408#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1
409#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2
410#define XFS_IFLUSH_SYNC 3
411#define XFS_IFLUSH_ASYNC 4
412#define XFS_IFLUSH_DELWRI 5
413
414/*
415 * Flags for xfs_iflush_all.
416 */
417#define XFS_FLUSH_ALL 0x1
418
419/*
420 * Flags for xfs_itruncate_start().
421 */
422#define XFS_ITRUNC_DEFINITE 0x1
423#define XFS_ITRUNC_MAYBE 0x2
424
425#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOV)
426struct vnode *xfs_itov(xfs_inode_t *ip);
427#define XFS_ITOV(ip) xfs_itov(ip)
428#else
429#define XFS_ITOV(ip) BHV_TO_VNODE(XFS_ITOBHV(ip))
430#endif
431#define XFS_ITOV_NULL(ip) BHV_TO_VNODE_NULL(XFS_ITOBHV(ip))
432#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOBHV)
433struct bhv_desc *xfs_itobhv(xfs_inode_t *ip);
434#define XFS_ITOBHV(ip) xfs_itobhv(ip)
435#else
436#define XFS_ITOBHV(ip) ((struct bhv_desc *)(&((ip)->i_bhv_desc)))
437#endif
438#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOI)
439xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp);
440#define XFS_BHVTOI(bhvp) xfs_bhvtoi(bhvp)
441#else
442#define XFS_BHVTOI(bhvp) \
443 ((xfs_inode_t *)((char *)(bhvp) - \
444 (char *)&(((xfs_inode_t *)0)->i_bhv_desc)))
445#endif
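
The XFS_BHVTOI arithmetic above is the classic container-of trick: subtract the offset of the embedded member from the member's address to recover the enclosing structure. The same thing written with the standard offsetof macro, on illustrative types:

#include <stddef.h>

struct member { int m; };
struct outer  { int a; struct member inner; int b; };

/* Recover the enclosing struct outer from a pointer to its embedded
 * struct member, exactly as XFS_BHVTOI does with i_bhv_desc. */
#define outer_from_inner(p) \
	((struct outer *)((char *)(p) - offsetof(struct outer, inner)))

/* Usage: given struct outer o, outer_from_inner(&o.inner) == &o. */
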
446
447#define BHV_IS_XFS(bdp) (BHV_OPS(bdp) == &xfs_vnodeops)
448
449/*
450 * For multiple groups support: if S_ISGID bit is set in the parent
451 * directory, group of new file is set to that of the parent, and
452 * new subdirectory gets S_ISGID bit from parent.
453 */
454#define XFS_INHERIT_GID(pip, vfsp) \
455 (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID))
456
457/*
458 * xfs_iget.c prototypes.
459 */
460
461#define IGET_CREATE 1
462
463void xfs_ihash_init(struct xfs_mount *);
464void xfs_ihash_free(struct xfs_mount *);
465void xfs_chash_init(struct xfs_mount *);
466void xfs_chash_free(struct xfs_mount *);
467xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
468 struct xfs_trans *);
469void xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
471 uint, uint, xfs_inode_t **, xfs_daddr_t);
472void xfs_iput(xfs_inode_t *, uint);
473void xfs_iput_new(xfs_inode_t *, uint);
474void xfs_ilock(xfs_inode_t *, uint);
475int xfs_ilock_nowait(xfs_inode_t *, uint);
476void xfs_iunlock(xfs_inode_t *, uint);
477void xfs_ilock_demote(xfs_inode_t *, uint);
478void xfs_iflock(xfs_inode_t *);
479int xfs_iflock_nowait(xfs_inode_t *);
480uint xfs_ilock_map_shared(xfs_inode_t *);
481void xfs_iunlock_map_shared(xfs_inode_t *, uint);
482void xfs_ifunlock(xfs_inode_t *);
483void xfs_ireclaim(xfs_inode_t *);
484int xfs_finish_reclaim(xfs_inode_t *, int, int);
485int xfs_finish_reclaim_all(struct xfs_mount *, int);
486
487/*
488 * xfs_inode.c prototypes.
489 */
490int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
491 xfs_dinode_t **, struct xfs_buf **, int *);
492int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
493 xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
494 xfs_daddr_t);
495int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
496 xfs_inode_t **, xfs_daddr_t);
497int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
498int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, nlink_t,
499 xfs_dev_t, struct cred *, xfs_prid_t, int,
500 struct xfs_buf **, boolean_t *, xfs_inode_t **);
501void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *,
502 int);
503uint xfs_ip2xflags(struct xfs_inode *);
504uint xfs_dic2xflags(struct xfs_dinode_core *);
505int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
506 struct xfs_bmap_free *);
507void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
508int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
509 xfs_fsize_t, int, int);
510int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
511int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
512void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
513 xfs_fsize_t, int);
514
515void xfs_idestroy_fork(xfs_inode_t *, int);
516void xfs_idestroy(xfs_inode_t *);
517void xfs_idata_realloc(xfs_inode_t *, int, int);
518void xfs_iextract(xfs_inode_t *);
519void xfs_iext_realloc(xfs_inode_t *, int, int);
520void xfs_iroot_realloc(xfs_inode_t *, int, int);
521void xfs_ipin(xfs_inode_t *);
522void xfs_iunpin(xfs_inode_t *);
523int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
524int xfs_iflush(xfs_inode_t *, uint);
525int xfs_iflush_all(struct xfs_mount *, int);
526int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *);
527uint xfs_iroundup(uint);
528void xfs_ichgtime(xfs_inode_t *, int);
529xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
530void xfs_lock_inodes(xfs_inode_t **, int, int, uint);
531
532#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
533
534#ifdef DEBUG
535void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
536#else /* DEBUG */
537#define xfs_isize_check(mp, ip, isize)
538#endif /* DEBUG */
539
540#if defined(DEBUG)
541void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
542#else
543#define xfs_inobp_check(mp, bp)
544#endif /* DEBUG */
545
546extern struct kmem_zone *xfs_chashlist_zone;
547extern struct kmem_zone *xfs_ifork_zone;
548extern struct kmem_zone *xfs_inode_zone;
549extern struct kmem_zone *xfs_ili_zone;
550extern struct vnodeops xfs_vnodeops;
551
552#endif /* __KERNEL__ */
553
554#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
new file mode 100644
index 000000000000..768cb1816b8e
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,1092 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * This file contains the implementation of the xfs_inode_log_item.
35 * It contains the item operations used to manipulate the inode log
36 * items, as well as utility routines used by the inode-specific
37 * transaction routines.
38 */
39#include "xfs.h"
40#include "xfs_macros.h"
41#include "xfs_types.h"
42#include "xfs_inum.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_buf_item.h"
46#include "xfs_sb.h"
47#include "xfs_dir.h"
48#include "xfs_dir2.h"
49#include "xfs_dmapi.h"
50#include "xfs_mount.h"
51#include "xfs_trans_priv.h"
52#include "xfs_ag.h"
53#include "xfs_alloc_btree.h"
54#include "xfs_bmap_btree.h"
55#include "xfs_ialloc_btree.h"
56#include "xfs_btree.h"
57#include "xfs_ialloc.h"
58#include "xfs_attr_sf.h"
59#include "xfs_dir_sf.h"
60#include "xfs_dir2_sf.h"
61#include "xfs_dinode.h"
62#include "xfs_inode_item.h"
63#include "xfs_inode.h"
64#include "xfs_rw.h"
65
66
67kmem_zone_t *xfs_ili_zone; /* inode log item zone */
68
69/*
70 * This returns the number of iovecs needed to log the given inode item.
71 *
72 * We need one iovec for the inode log format structure, one for the
73 * inode core, and possibly one for the inode data/extents/b-tree root
74 * and one for the inode attribute data/extents/b-tree root.
75 */
76STATIC uint
77xfs_inode_item_size(
78 xfs_inode_log_item_t *iip)
79{
80 uint nvecs;
81 xfs_inode_t *ip;
82
83 ip = iip->ili_inode;
84 nvecs = 2;
85
86 /*
87 * Only log the data/extents/b-tree root if there is something
88 * left to log.
89 */
90 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
91
92 switch (ip->i_d.di_format) {
93 case XFS_DINODE_FMT_EXTENTS:
94 iip->ili_format.ilf_fields &=
95 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
96 XFS_ILOG_DEV | XFS_ILOG_UUID);
97 if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
98 (ip->i_d.di_nextents > 0) &&
99 (ip->i_df.if_bytes > 0)) {
100 ASSERT(ip->i_df.if_u1.if_extents != NULL);
101 nvecs++;
102 } else {
103 iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
104 }
105 break;
106
107 case XFS_DINODE_FMT_BTREE:
108 ASSERT(ip->i_df.if_ext_max ==
109 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
110 iip->ili_format.ilf_fields &=
111 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
112 XFS_ILOG_DEV | XFS_ILOG_UUID);
113 if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
114 (ip->i_df.if_broot_bytes > 0)) {
115 ASSERT(ip->i_df.if_broot != NULL);
116 nvecs++;
117 } else {
118 ASSERT(!(iip->ili_format.ilf_fields &
119 XFS_ILOG_DBROOT));
120#ifdef XFS_TRANS_DEBUG
121 if (iip->ili_root_size > 0) {
122 ASSERT(iip->ili_root_size ==
123 ip->i_df.if_broot_bytes);
124 ASSERT(memcmp(iip->ili_orig_root,
125 ip->i_df.if_broot,
126 iip->ili_root_size) == 0);
127 } else {
128 ASSERT(ip->i_df.if_broot_bytes == 0);
129 }
130#endif
131 iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
132 }
133 break;
134
135 case XFS_DINODE_FMT_LOCAL:
136 iip->ili_format.ilf_fields &=
137 ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
138 XFS_ILOG_DEV | XFS_ILOG_UUID);
139 if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
140 (ip->i_df.if_bytes > 0)) {
141 ASSERT(ip->i_df.if_u1.if_data != NULL);
142 ASSERT(ip->i_d.di_size > 0);
143 nvecs++;
144 } else {
145 iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
146 }
147 break;
148
149 case XFS_DINODE_FMT_DEV:
150 iip->ili_format.ilf_fields &=
151 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
152 XFS_ILOG_DEXT | XFS_ILOG_UUID);
153 break;
154
155 case XFS_DINODE_FMT_UUID:
156 iip->ili_format.ilf_fields &=
157 ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
158 XFS_ILOG_DEXT | XFS_ILOG_DEV);
159 break;
160
161 default:
162 ASSERT(0);
163 break;
164 }
165
166 /*
167 * If there are no attributes associated with this file,
168 * then there cannot be anything more to log.
169 * Clear all attribute-related log flags.
170 */
171 if (!XFS_IFORK_Q(ip)) {
172 iip->ili_format.ilf_fields &=
173 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
174 return nvecs;
175 }
176
177 /*
178 * Log any necessary attribute data.
179 */
180 switch (ip->i_d.di_aformat) {
181 case XFS_DINODE_FMT_EXTENTS:
182 iip->ili_format.ilf_fields &=
183 ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
184 if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
185 (ip->i_d.di_anextents > 0) &&
186 (ip->i_afp->if_bytes > 0)) {
187 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
188 nvecs++;
189 } else {
190 iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
191 }
192 break;
193
194 case XFS_DINODE_FMT_BTREE:
195 iip->ili_format.ilf_fields &=
196 ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
197 if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
198 (ip->i_afp->if_broot_bytes > 0)) {
199 ASSERT(ip->i_afp->if_broot != NULL);
200 nvecs++;
201 } else {
202 iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
203 }
204 break;
205
206 case XFS_DINODE_FMT_LOCAL:
207 iip->ili_format.ilf_fields &=
208 ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
209 if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
210 (ip->i_afp->if_bytes > 0)) {
211 ASSERT(ip->i_afp->if_u1.if_data != NULL);
212 nvecs++;
213 } else {
214 iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
215 }
216 break;
217
218 default:
219 ASSERT(0);
220 break;
221 }
222
223 return nvecs;
224}
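
A worked instance of the count above, for one assumed inode state:

/*
 * Assumed state: data fork in extents format with XFS_ILOG_DEXT set
 * and nonzero extent bytes; attribute fork present, in local format,
 * with XFS_ILOG_ADATA set and nonzero inline bytes.
 *
 *	nvecs = 2	log format structure + inode core
 *	      + 1	data fork extent records
 *	      + 1	attr fork inline data
 *	      = 4	iovecs for this log item
 */
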
225
226/*
227 * This is called to fill in the vector of log iovecs for the
228 * given inode log item. It fills the first item with an inode
229 * log format structure, the second with the on-disk inode structure,
230 * and a possible third and/or fourth with the inode data/extents/b-tree
231 * root and inode attributes data/extents/b-tree root.
232 */
233STATIC void
234xfs_inode_item_format(
235 xfs_inode_log_item_t *iip,
236 xfs_log_iovec_t *log_vector)
237{
238 uint nvecs;
239 xfs_log_iovec_t *vecp;
240 xfs_inode_t *ip;
241 size_t data_bytes;
242 xfs_bmbt_rec_t *ext_buffer;
243 int nrecs;
244 xfs_mount_t *mp;
245
246 ip = iip->ili_inode;
247 vecp = log_vector;
248
249 vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
250 vecp->i_len = sizeof(xfs_inode_log_format_t);
251 vecp++;
252 nvecs = 1;
253
254 /*
255 * Clear i_update_core if the timestamps (or any other
256 * non-transactional modification) need flushing/logging
257 * and we're about to log them with the rest of the core.
258 *
259 * This is the same logic as xfs_iflush() but this code can't
260 * run at the same time as xfs_iflush because we're in commit
261 * processing here and so we have the inode lock held in
262 * exclusive mode. Although it doesn't really matter
263 * for the timestamps if both routines were to grab the
264 * timestamps or not. That would be ok.
265 *
266 * We clear i_update_core before copying out the data.
267 * This is for coordination with our timestamp updates
268 * that don't hold the inode lock. They will always
269 * update the timestamps BEFORE setting i_update_core,
270 * so if we clear i_update_core after they set it we
271 * are guaranteed to see their updates to the timestamps
272 * either here. Likewise, if they set it after we clear it
273 * here, we'll see it either on the next commit of this
274 * inode or the next time the inode gets flushed via
275 * xfs_iflush(). This depends on strongly ordered memory
276 * semantics, but we have that. We use the SYNCHRONIZE
277 * macro to make sure that the compiler does not reorder
278 * the i_update_core access below the data copy below.
279 */
280 if (ip->i_update_core) {
281 ip->i_update_core = 0;
282 SYNCHRONIZE();
283 }
284
285 /*
286 * We don't have to worry about re-ordering here because
287 * the update_size field is protected by the inode lock
288 * and we have that held in exclusive mode.
289 */
290 if (ip->i_update_size)
291 ip->i_update_size = 0;
292
293 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
294 vecp->i_len = sizeof(xfs_dinode_core_t);
295 vecp++;
296 nvecs++;
297 iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
298
299 /*
300 * If this is really an old format inode, then we need to
301 * log it as such. This means that we have to copy the link
302 * count from the new field to the old. We don't have to worry
303 * about the new fields, because nothing trusts them as long as
304 * the old inode version number is there. If the superblock already
305 * has a new version number, then we don't bother converting back.
306 */
307 mp = ip->i_mount;
308 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
309 XFS_SB_VERSION_HASNLINK(&mp->m_sb));
310 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
311 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
312 /*
313 * Convert it back.
314 */
315 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
316 ip->i_d.di_onlink = ip->i_d.di_nlink;
317 } else {
318 /*
319 * The superblock version has already been bumped,
320 * so just make the conversion to the new inode
321 * format permanent.
322 */
323 ip->i_d.di_version = XFS_DINODE_VERSION_2;
324 ip->i_d.di_onlink = 0;
325 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
326 }
327 }
328
329 switch (ip->i_d.di_format) {
330 case XFS_DINODE_FMT_EXTENTS:
331 ASSERT(!(iip->ili_format.ilf_fields &
332 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
333 XFS_ILOG_DEV | XFS_ILOG_UUID)));
334 if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
335 ASSERT(ip->i_df.if_bytes > 0);
336 ASSERT(ip->i_df.if_u1.if_extents != NULL);
337 ASSERT(ip->i_d.di_nextents > 0);
338 ASSERT(iip->ili_extents_buf == NULL);
339 nrecs = ip->i_df.if_bytes /
340 (uint)sizeof(xfs_bmbt_rec_t);
341 ASSERT(nrecs > 0);
342#if __BYTE_ORDER == __BIG_ENDIAN
343 if (nrecs == ip->i_d.di_nextents) {
344 /*
345 * There are no delayed allocation
346 * extents, so just point to the
347 * real extents array.
348 */
349 vecp->i_addr =
350 (char *)(ip->i_df.if_u1.if_extents);
351 vecp->i_len = ip->i_df.if_bytes;
352 } else
353#endif
354 {
355 /*
356 * There are delayed allocation extents
357 * in the inode, or we need to convert
358 * the extents to on disk format.
359 * Use xfs_iextents_copy()
360 * to copy only the real extents into
361 * a separate buffer. We'll free the
362 * buffer in the unlock routine.
363 */
364 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
365 KM_SLEEP);
366 iip->ili_extents_buf = ext_buffer;
367 vecp->i_addr = (xfs_caddr_t)ext_buffer;
368 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
369 XFS_DATA_FORK);
370 }
371 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
372 iip->ili_format.ilf_dsize = vecp->i_len;
373 vecp++;
374 nvecs++;
375 }
376 break;
377
378 case XFS_DINODE_FMT_BTREE:
379 ASSERT(!(iip->ili_format.ilf_fields &
380 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
381 XFS_ILOG_DEV | XFS_ILOG_UUID)));
382 if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
383 ASSERT(ip->i_df.if_broot_bytes > 0);
384 ASSERT(ip->i_df.if_broot != NULL);
385 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
386 vecp->i_len = ip->i_df.if_broot_bytes;
387 vecp++;
388 nvecs++;
389 iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
390 }
391 break;
392
393 case XFS_DINODE_FMT_LOCAL:
394 ASSERT(!(iip->ili_format.ilf_fields &
395 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
396 XFS_ILOG_DEV | XFS_ILOG_UUID)));
397 if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
398 ASSERT(ip->i_df.if_bytes > 0);
399 ASSERT(ip->i_df.if_u1.if_data != NULL);
400 ASSERT(ip->i_d.di_size > 0);
401
402 vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
403 /*
404			 * Round if_bytes up to a word boundary.
405			 * The underlying memory is guaranteed
406			 * to be there by xfs_idata_realloc().
407 */
408 data_bytes = roundup(ip->i_df.if_bytes, 4);
409 ASSERT((ip->i_df.if_real_bytes == 0) ||
410 (ip->i_df.if_real_bytes == data_bytes));
411 vecp->i_len = (int)data_bytes;
412 vecp++;
413 nvecs++;
414 iip->ili_format.ilf_dsize = (unsigned)data_bytes;
415 }
416 break;
417
418 case XFS_DINODE_FMT_DEV:
419 ASSERT(!(iip->ili_format.ilf_fields &
420 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
421 XFS_ILOG_DDATA | XFS_ILOG_UUID)));
422 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
423 iip->ili_format.ilf_u.ilfu_rdev =
424 ip->i_df.if_u2.if_rdev;
425 }
426 break;
427
428 case XFS_DINODE_FMT_UUID:
429 ASSERT(!(iip->ili_format.ilf_fields &
430 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
431 XFS_ILOG_DDATA | XFS_ILOG_DEV)));
432 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
433 iip->ili_format.ilf_u.ilfu_uuid =
434 ip->i_df.if_u2.if_uuid;
435 }
436 break;
437
438 default:
439 ASSERT(0);
440 break;
441 }
442
443 /*
444 * If there are no attributes associated with the file,
445 * then we're done.
446 * Assert that no attribute-related log flags are set.
447 */
448 if (!XFS_IFORK_Q(ip)) {
449 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
450 iip->ili_format.ilf_size = nvecs;
451 ASSERT(!(iip->ili_format.ilf_fields &
452 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
453 return;
454 }
455
456 switch (ip->i_d.di_aformat) {
457 case XFS_DINODE_FMT_EXTENTS:
458 ASSERT(!(iip->ili_format.ilf_fields &
459 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
460 if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
461 ASSERT(ip->i_afp->if_bytes > 0);
462 ASSERT(ip->i_afp->if_u1.if_extents != NULL);
463 ASSERT(ip->i_d.di_anextents > 0);
464#ifdef DEBUG
465 nrecs = ip->i_afp->if_bytes /
466 (uint)sizeof(xfs_bmbt_rec_t);
467#endif
468 ASSERT(nrecs > 0);
469 ASSERT(nrecs == ip->i_d.di_anextents);
470#if __BYTE_ORDER == __BIG_ENDIAN
471 /*
 472			 * There are no delayed allocation extents
473 * for attributes, so just point at the array.
474 */
475 vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
476 vecp->i_len = ip->i_afp->if_bytes;
477#else
478 ASSERT(iip->ili_aextents_buf == NULL);
479 /*
480 * Need to endian flip before logging
481 */
482 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
483 KM_SLEEP);
484 iip->ili_aextents_buf = ext_buffer;
485 vecp->i_addr = (xfs_caddr_t)ext_buffer;
486 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
487 XFS_ATTR_FORK);
488#endif
489 iip->ili_format.ilf_asize = vecp->i_len;
490 vecp++;
491 nvecs++;
492 }
493 break;
494
495 case XFS_DINODE_FMT_BTREE:
496 ASSERT(!(iip->ili_format.ilf_fields &
497 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
498 if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
499 ASSERT(ip->i_afp->if_broot_bytes > 0);
500 ASSERT(ip->i_afp->if_broot != NULL);
501 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
502 vecp->i_len = ip->i_afp->if_broot_bytes;
503 vecp++;
504 nvecs++;
505 iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
506 }
507 break;
508
509 case XFS_DINODE_FMT_LOCAL:
510 ASSERT(!(iip->ili_format.ilf_fields &
511 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
512 if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
513 ASSERT(ip->i_afp->if_bytes > 0);
514 ASSERT(ip->i_afp->if_u1.if_data != NULL);
515
516 vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
517 /*
 518			 * Round if_bytes up to a word boundary.
 519			 * The underlying memory is guaranteed
 520			 * to be there by xfs_idata_realloc().
521 */
522 data_bytes = roundup(ip->i_afp->if_bytes, 4);
523 ASSERT((ip->i_afp->if_real_bytes == 0) ||
524 (ip->i_afp->if_real_bytes == data_bytes));
525 vecp->i_len = (int)data_bytes;
526 vecp++;
527 nvecs++;
528 iip->ili_format.ilf_asize = (unsigned)data_bytes;
529 }
530 break;
531
532 default:
533 ASSERT(0);
534 break;
535 }
536
537 ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
538 iip->ili_format.ilf_size = nvecs;
539}
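/*
 * A sketch of how the caller is expected to consume the result -- an
 * assumption drawn from the structure above, not a quote of the log
 * code: the commit path walks the ilf_size regions recorded in the
 * iovec array and copies each one into the log, roughly
 *
 *	for (i = 0; i < iip->ili_format.ilf_size; i++)
 *		copy log_vector[i].i_addr for log_vector[i].i_len bytes
 *
 * which is why every branch above must advance vecp and bump nvecs
 * exactly once per region it fills in.
 */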
540
541
542/*
543 * This is called to pin the inode associated with the inode log
544 * item in memory so it cannot be written out. Do this by calling
545 * xfs_ipin() to bump the pin count in the inode while holding the
546 * inode pin lock.
547 */
548STATIC void
549xfs_inode_item_pin(
550 xfs_inode_log_item_t *iip)
551{
552 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
553 xfs_ipin(iip->ili_inode);
554}
555
556
557/*
558 * This is called to unpin the inode associated with the inode log
559 * item which was previously pinned with a call to xfs_inode_item_pin().
560 * Just call xfs_iunpin() on the inode to do this.
561 */
562/* ARGSUSED */
563STATIC void
564xfs_inode_item_unpin(
565 xfs_inode_log_item_t *iip,
566 int stale)
567{
568 xfs_iunpin(iip->ili_inode);
569}
570
571/* ARGSUSED */
572STATIC void
573xfs_inode_item_unpin_remove(
574 xfs_inode_log_item_t *iip,
575 xfs_trans_t *tp)
576{
577 xfs_iunpin(iip->ili_inode);
578}
579
580/*
581 * This is called to attempt to lock the inode associated with this
582 * inode log item, in preparation for the push routine which does the actual
583 * iflush. Don't sleep on the inode lock or the flush lock.
584 *
585 * If the flush lock is already held, indicating that the inode has
586 * been or is in the process of being flushed, then (ideally) we'd like to
587 * see if the inode's buffer is still incore, and if so give it a nudge.
588 * We delay doing so until the pushbuf routine, though, to avoid holding
 589 * the AIL lock across a call into the black hole that is the buffer
 590 * cache. Also, we don't want to sleep in any device strategy routine,
 591 * which can happen if we do the subsequent bawrite in here.
592 */
593STATIC uint
594xfs_inode_item_trylock(
595 xfs_inode_log_item_t *iip)
596{
597 register xfs_inode_t *ip;
598
599 ip = iip->ili_inode;
600
601 if (xfs_ipincount(ip) > 0) {
602 return XFS_ITEM_PINNED;
603 }
604
605 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
606 return XFS_ITEM_LOCKED;
607 }
608
609 if (!xfs_iflock_nowait(ip)) {
610 /*
611 * If someone else isn't already trying to push the inode
612 * buffer, we get to do it.
613 */
614 if (iip->ili_pushbuf_flag == 0) {
615 iip->ili_pushbuf_flag = 1;
616#ifdef DEBUG
617 iip->ili_push_owner = get_thread_id();
618#endif
619 /*
620 * Inode is left locked in shared mode.
621 * Pushbuf routine gets to unlock it.
622 */
623 return XFS_ITEM_PUSHBUF;
624 } else {
625 /*
626 * We hold the AIL_LOCK, so we must specify the
627 * NONOTIFY flag so that we won't double trip.
628 */
629 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
630 return XFS_ITEM_FLUSHING;
631 }
632 /* NOTREACHED */
633 }
634
635 /* Stale items should force out the iclog */
636 if (ip->i_flags & XFS_ISTALE) {
637 xfs_ifunlock(ip);
638 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
639 return XFS_ITEM_PINNED;
640 }
641
642#ifdef DEBUG
643 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
644 ASSERT(iip->ili_format.ilf_fields != 0);
645 ASSERT(iip->ili_logged == 0);
646 ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
647 }
648#endif
649 return XFS_ITEM_SUCCESS;
650}
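/*
 * A rough sketch of how the AIL push loop in xfs_trans_push_ail()
 * presumably reacts to these return codes (inferred from the names,
 * not a quote of that routine):
 *
 *	switch (IOP_TRYLOCK(lip)) {
 *	case XFS_ITEM_SUCCESS:	flush the item via IOP_PUSH()
 *	case XFS_ITEM_PUSHBUF:	nudge the backing buffer via IOP_PUSHBUF()
 *	case XFS_ITEM_PINNED:	force the log, revisit the item later
 *	case XFS_ITEM_LOCKED:
 *	case XFS_ITEM_FLUSHING:	skip the item on this pass
 *	}
 */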
651
652/*
653 * Unlock the inode associated with the inode log item.
654 * Clear the fields of the inode and inode log item that
655 * are specific to the current transaction. If the
 656 * hold flag is set, do not unlock the inode.
657 */
658STATIC void
659xfs_inode_item_unlock(
660 xfs_inode_log_item_t *iip)
661{
662 uint hold;
663 uint iolocked;
664 uint lock_flags;
665 xfs_inode_t *ip;
666
667 ASSERT(iip != NULL);
668 ASSERT(iip->ili_inode->i_itemp != NULL);
669 ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
670 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
671 XFS_ILI_IOLOCKED_EXCL)) ||
672 ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE));
673 ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
674 XFS_ILI_IOLOCKED_SHARED)) ||
675 ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS));
676 /*
677 * Clear the transaction pointer in the inode.
678 */
679 ip = iip->ili_inode;
680 ip->i_transp = NULL;
681
682 /*
683 * If the inode needed a separate buffer with which to log
684 * its extents, then free it now.
685 */
686 if (iip->ili_extents_buf != NULL) {
687 ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
688 ASSERT(ip->i_d.di_nextents > 0);
689 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
690 ASSERT(ip->i_df.if_bytes > 0);
691 kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
692 iip->ili_extents_buf = NULL;
693 }
694 if (iip->ili_aextents_buf != NULL) {
695 ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
696 ASSERT(ip->i_d.di_anextents > 0);
697 ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
698 ASSERT(ip->i_afp->if_bytes > 0);
699 kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes);
700 iip->ili_aextents_buf = NULL;
701 }
702
703 /*
704 * Figure out if we should unlock the inode or not.
705 */
706 hold = iip->ili_flags & XFS_ILI_HOLD;
707
708 /*
709 * Before clearing out the flags, remember whether we
710 * are holding the inode's IO lock.
711 */
712 iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
713
714 /*
715 * Clear out the fields of the inode log item particular
716 * to the current transaction.
717 */
718 iip->ili_ilock_recur = 0;
719 iip->ili_iolock_recur = 0;
720 iip->ili_flags = 0;
721
722 /*
723 * Unlock the inode if XFS_ILI_HOLD was not set.
724 */
725 if (!hold) {
726 lock_flags = XFS_ILOCK_EXCL;
727 if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
728 lock_flags |= XFS_IOLOCK_EXCL;
729 } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
730 lock_flags |= XFS_IOLOCK_SHARED;
731 }
732 xfs_iput(iip->ili_inode, lock_flags);
733 }
734}
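/*
 * XFS_ILI_HOLD is what xfs_trans_ihold() arranges; a sketch of the
 * usual pattern (compare xfs_iomap_write_direct() in xfs_iomap.c):
 *
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ihold(tp, ip);
 *	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *
 * i.e. the inode stays locked across the commit and the caller drops
 * the lock itself afterwards.
 */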
735
736/*
737 * This is called to find out where the oldest active copy of the
738 * inode log item in the on disk log resides now that the last log
739 * write of it completed at the given lsn. Since we always re-log
740 * all dirty data in an inode, the latest copy in the on disk log
741 * is the only one that matters. Therefore, simply return the
742 * given lsn.
743 */
744/*ARGSUSED*/
745STATIC xfs_lsn_t
746xfs_inode_item_committed(
747 xfs_inode_log_item_t *iip,
748 xfs_lsn_t lsn)
749{
750 return (lsn);
751}
752
753/*
754 * The transaction with the inode locked has aborted. The inode
755 * must not be dirty within the transaction (unless we're forcibly
756 * shutting down). We simply unlock just as if the transaction
757 * had been cancelled.
758 */
759STATIC void
760xfs_inode_item_abort(
761 xfs_inode_log_item_t *iip)
762{
763 xfs_inode_item_unlock(iip);
764 return;
765}
766
767
768/*
769 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
770 * failed to get the inode flush lock but did get the inode locked SHARED.
 771 * Here we're trying to see if the inode buffer is incore, and if so
 772 * whether it's marked delayed write. If that's the case, we'll initiate
 773 * a bawrite on that buffer to expedite the process.
774 *
775 * We aren't holding the AIL_LOCK (or the flush lock) when this gets called,
 776 * so it is inherently racy.
777 */
778STATIC void
779xfs_inode_item_pushbuf(
780 xfs_inode_log_item_t *iip)
781{
782 xfs_inode_t *ip;
783 xfs_mount_t *mp;
784 xfs_buf_t *bp;
785 uint dopush;
786
787 ip = iip->ili_inode;
788
789 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
790
791 /*
792 * The ili_pushbuf_flag keeps others from
793 * trying to duplicate our effort.
794 */
795 ASSERT(iip->ili_pushbuf_flag != 0);
796 ASSERT(iip->ili_push_owner == get_thread_id());
797
798 /*
799 * If flushlock isn't locked anymore, chances are that the
800 * inode flush completed and the inode was taken off the AIL.
801 * So, just get out.
802 */
803 if ((valusema(&(ip->i_flock)) > 0) ||
804 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
805 iip->ili_pushbuf_flag = 0;
806 xfs_iunlock(ip, XFS_ILOCK_SHARED);
807 return;
808 }
809
810 mp = ip->i_mount;
811 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
812 iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
813
814 if (bp != NULL) {
815 if (XFS_BUF_ISDELAYWRITE(bp)) {
816 /*
817 * We were racing with iflush because we don't hold
818 * the AIL_LOCK or the flush lock. However, at this point,
819 * we have the buffer, and we know that it's dirty.
820 * So, it's possible that iflush raced with us, and
821 * this item is already taken off the AIL.
822 * If not, we can flush it async.
823 */
824 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
825 (valusema(&(ip->i_flock)) <= 0));
826 iip->ili_pushbuf_flag = 0;
827 xfs_iunlock(ip, XFS_ILOCK_SHARED);
828 xfs_buftrace("INODE ITEM PUSH", bp);
829 if (XFS_BUF_ISPINNED(bp)) {
830 xfs_log_force(mp, (xfs_lsn_t)0,
831 XFS_LOG_FORCE);
832 }
833 if (dopush) {
834 xfs_bawrite(mp, bp);
835 } else {
836 xfs_buf_relse(bp);
837 }
838 } else {
839 iip->ili_pushbuf_flag = 0;
840 xfs_iunlock(ip, XFS_ILOCK_SHARED);
841 xfs_buf_relse(bp);
842 }
843 return;
844 }
845 /*
 846	 * We have to be careful about resetting the pushbuf flag too early
 847	 * (above). Even though in theory we could do it as soon as we have
 848	 * the buflock, we don't want others to be doing work needlessly.
 849	 * They'd come to this function thinking that pushing the buffer is
 850	 * their responsibility, only to find that the buffer is still locked
 851	 * by another thread doing the same thing.
852 */
853 iip->ili_pushbuf_flag = 0;
854 xfs_iunlock(ip, XFS_ILOCK_SHARED);
855 return;
856}
857
858
859/*
860 * This is called to asynchronously write the inode associated with this
861 * inode log item out to disk. The inode will already have been locked by
862 * a successful call to xfs_inode_item_trylock().
863 */
864STATIC void
865xfs_inode_item_push(
866 xfs_inode_log_item_t *iip)
867{
868 xfs_inode_t *ip;
869
870 ip = iip->ili_inode;
871
872 ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
873 ASSERT(valusema(&(ip->i_flock)) <= 0);
874 /*
875 * Since we were able to lock the inode's flush lock and
876 * we found it on the AIL, the inode must be dirty. This
877 * is because the inode is removed from the AIL while still
878 * holding the flush lock in xfs_iflush_done(). Thus, if
879 * we found it in the AIL and were able to obtain the flush
880 * lock without sleeping, then there must not have been
881 * anyone in the process of flushing the inode.
882 */
883 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
884 iip->ili_format.ilf_fields != 0);
885
886 /*
887 * Write out the inode. The completion routine ('iflush_done') will
 888	 * pull it from the AIL, mark it clean, and unlock the flush lock.
889 */
890 (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC);
891 xfs_iunlock(ip, XFS_ILOCK_SHARED);
892
893 return;
894}
895
896/*
897 * XXX rcc - this one really has to do something. Probably needs
898 * to stamp in a new field in the incore inode.
899 */
900/* ARGSUSED */
901STATIC void
902xfs_inode_item_committing(
903 xfs_inode_log_item_t *iip,
904 xfs_lsn_t lsn)
905{
906 iip->ili_last_lsn = lsn;
907 return;
908}
909
910/*
 911 * This is the ops vector shared by all inode log items.
912 */
913struct xfs_item_ops xfs_inode_item_ops = {
914 .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
915 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
916 xfs_inode_item_format,
917 .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
918 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin,
919 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
920 xfs_inode_item_unpin_remove,
921 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
922 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
923 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
924 xfs_inode_item_committed,
925 .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push,
926 .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
927 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
928 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
929 xfs_inode_item_committing
930};
931
932
933/*
934 * Initialize the inode log item for a newly allocated (in-core) inode.
935 */
936void
937xfs_inode_item_init(
938 xfs_inode_t *ip,
939 xfs_mount_t *mp)
940{
941 xfs_inode_log_item_t *iip;
942
943 ASSERT(ip->i_itemp == NULL);
944 iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
945
946 iip->ili_item.li_type = XFS_LI_INODE;
947 iip->ili_item.li_ops = &xfs_inode_item_ops;
948 iip->ili_item.li_mountp = mp;
949 iip->ili_inode = ip;
950
 951	/*
 952	 * We have zeroed memory, so there is no need to explicitly
 953	 * clear fields such as ili_extents_buf or ili_pushbuf_flag
 954	 * here.
 955	 */
956
957 iip->ili_format.ilf_type = XFS_LI_INODE;
958 iip->ili_format.ilf_ino = ip->i_ino;
959 iip->ili_format.ilf_blkno = ip->i_blkno;
960 iip->ili_format.ilf_len = ip->i_len;
961 iip->ili_format.ilf_boffset = ip->i_boffset;
962}
963
964/*
965 * Free the inode log item and any memory hanging off of it.
966 */
967void
968xfs_inode_item_destroy(
969 xfs_inode_t *ip)
970{
971#ifdef XFS_TRANS_DEBUG
972 if (ip->i_itemp->ili_root_size != 0) {
973 kmem_free(ip->i_itemp->ili_orig_root,
974 ip->i_itemp->ili_root_size);
975 }
976#endif
977 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
978}
979
980
981/*
982 * This is the inode flushing I/O completion routine. It is called
983 * from interrupt level when the buffer containing the inode is
984 * flushed to disk. It is responsible for removing the inode item
985 * from the AIL if it has not been re-logged, and unlocking the inode's
986 * flush lock.
987 */
988/*ARGSUSED*/
989void
990xfs_iflush_done(
991 xfs_buf_t *bp,
992 xfs_inode_log_item_t *iip)
993{
994 xfs_inode_t *ip;
995 SPLDECL(s);
996
997 ip = iip->ili_inode;
998
999 /*
1000 * We only want to pull the item from the AIL if it is
1001 * actually there and its location in the log has not
1002 * changed since we started the flush. Thus, we only bother
1003 * if the ili_logged flag is set and the inode's lsn has not
1004 * changed. First we check the lsn outside
1005 * the lock since it's cheaper, and then we recheck while
1006 * holding the lock before removing the inode from the AIL.
1007 */
1008 if (iip->ili_logged &&
1009 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
1010 AIL_LOCK(ip->i_mount, s);
1011 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
1012 /*
1013 * xfs_trans_delete_ail() drops the AIL lock.
1014 */
1015 xfs_trans_delete_ail(ip->i_mount,
1016 (xfs_log_item_t*)iip, s);
1017 } else {
1018 AIL_UNLOCK(ip->i_mount, s);
1019 }
1020 }
1021
1022 iip->ili_logged = 0;
1023
1024 /*
1025 * Clear the ili_last_fields bits now that we know that the
1026 * data corresponding to them is safely on disk.
1027 */
1028 iip->ili_last_fields = 0;
1029
1030 /*
1031 * Release the inode's flush lock since we're done with it.
1032 */
1033 xfs_ifunlock(ip);
1034
1035 return;
1036}
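/*
 * A sketch of how this completion routine is presumably attached by
 * xfs_iflush() -- an assumption; the attach routine lives in the
 * transaction buffer code:
 *
 *	xfs_buf_attach_iodone(bp,
 *		(void(*)(xfs_buf_t *, xfs_log_item_t *))xfs_iflush_done,
 *		(xfs_log_item_t *)iip);
 *
 * so it runs once for each logged inode attached to the buffer when
 * the buffer I/O completes.
 */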
1037
1038/*
1039 * This is the inode flushing abort routine. It is called
1040 * from xfs_iflush when the filesystem is shutting down to clean
1041 * up the inode state.
1042 * It is responsible for removing the inode item
1043 * from the AIL if it has not been re-logged, and unlocking the inode's
1044 * flush lock.
1045 */
1046void
1047xfs_iflush_abort(
1048 xfs_inode_t *ip)
1049{
1050 xfs_inode_log_item_t *iip;
1051 xfs_mount_t *mp;
1052 SPLDECL(s);
1053
1054 iip = ip->i_itemp;
1055 mp = ip->i_mount;
1056 if (iip) {
1057 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1058 AIL_LOCK(mp, s);
1059 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1060 /*
1061 * xfs_trans_delete_ail() drops the AIL lock.
1062 */
1063 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip,
1064 s);
1065 } else
1066 AIL_UNLOCK(mp, s);
1067 }
1068 iip->ili_logged = 0;
1069 /*
1070 * Clear the ili_last_fields bits now that we know that the
1071 * data corresponding to them is safely on disk.
1072 */
1073 iip->ili_last_fields = 0;
1074 /*
1075 * Clear the inode logging fields so no more flushes are
1076 * attempted.
1077 */
1078 iip->ili_format.ilf_fields = 0;
1079 }
1080 /*
1081 * Release the inode's flush lock since we're done with it.
1082 */
1083 xfs_ifunlock(ip);
1084}
1085
1086void
1087xfs_istale_done(
1088 xfs_buf_t *bp,
1089 xfs_inode_log_item_t *iip)
1090{
1091 xfs_iflush_abort(iip->ili_inode);
1092}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
new file mode 100644
index 000000000000..d8775e0d6291
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,197 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INODE_ITEM_H__
33#define __XFS_INODE_ITEM_H__
34
35/*
36 * This is the structure used to lay out an inode log item in the
37 * log. The size of the inline data/extents/b-tree root to be logged
38 * (if any) is indicated in the ilf_dsize field. Changes to this structure
39 * must be added on to the end.
40 *
41 * Convention for naming inode log item versions: the current version
42 * is always named XFS_LI_INODE. When an inode log item gets superseded,
43 * add the latest version of IRIX that will generate logs with that item
44 * to the version name.
45 *
46 * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
47 * union (ilf_u) field. This was released with IRIX 5.3-XFS.
48 * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
49 * structure. This was released with IRIX 6.0.1-XFS and IRIX 6.1.
50 * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
51 * so a new structure definition wasn't necessary. However, we had
52 * to add a new type because the inode cluster size changed from 4K
53 * to 8K and the version number had to be rev'ved to keep older kernels
54 * from trying to recover logs with the 8K buffers in them. The logging
55 * code can handle recovery on different-sized clusters now so hopefully
56 * this'll be the last time we need to change the inode log item just
57 * for a change in the inode cluster size. This new version was
58 * released with IRIX 6.2.
59 */
60typedef struct xfs_inode_log_format {
61 unsigned short ilf_type; /* inode log item type */
62 unsigned short ilf_size; /* size of this item */
63 uint ilf_fields; /* flags for fields logged */
64 ushort ilf_asize; /* size of attr d/ext/root */
65 ushort ilf_dsize; /* size of data/ext/root */
66 xfs_ino_t ilf_ino; /* inode number */
67 union {
68 xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/
69 uuid_t ilfu_uuid; /* mount point value */
70 } ilf_u;
71 __int64_t ilf_blkno; /* blkno of inode buffer */
72 int ilf_len; /* len of inode buffer */
73 int ilf_boffset; /* off of inode in buffer */
74} xfs_inode_log_format_t;
75
76/* Initial version shipped with IRIX 5.3-XFS */
77typedef struct xfs_inode_log_format_v1 {
78 unsigned short ilf_type; /* inode log item type */
79 unsigned short ilf_size; /* size of this item */
80 uint ilf_fields; /* flags for fields logged */
81 uint ilf_dsize; /* size of data/ext/root */
82 xfs_ino_t ilf_ino; /* inode number */
83 union {
84 xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/
85 uuid_t ilfu_uuid; /* mount point value */
86 } ilf_u;
87} xfs_inode_log_format_t_v1;
88
89/*
90 * Flags for xfs_trans_log_inode flags field.
91 */
92#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
93#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
94#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
95#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
96#define XFS_ILOG_DEV 0x010 /* log the dev field */
97#define XFS_ILOG_UUID 0x020 /* log the uuid field */
98#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
99#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
100#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
101
102#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
103 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
104 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
105 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
106
107#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
108 XFS_ILOG_DBROOT)
109
110#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
111 XFS_ILOG_ABROOT)
112
113#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
114 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
115 XFS_ILOG_DEV | XFS_ILOG_UUID | \
116 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
117 XFS_ILOG_ABROOT)
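/*
 * Typical use of these flags, e.g. when both the inode core and the
 * incore data fork extent list changed within a transaction (a
 * sketch; see xfs_trans_log_inode()):
 *
 *	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE | XFS_ILOG_DEXT);
 */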
118
119#define XFS_ILI_HOLD 0x1
120#define XFS_ILI_IOLOCKED_EXCL 0x2
121#define XFS_ILI_IOLOCKED_SHARED 0x4
122
123#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
124
125
126#ifdef __KERNEL__
127
128struct xfs_buf;
129struct xfs_bmbt_rec_64;
130struct xfs_inode;
131struct xfs_mount;
132
133
134typedef struct xfs_inode_log_item {
135 xfs_log_item_t ili_item; /* common portion */
136 struct xfs_inode *ili_inode; /* inode ptr */
137 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
138 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
139 unsigned short ili_ilock_recur; /* lock recursion count */
140 unsigned short ili_iolock_recur; /* lock recursion count */
141 unsigned short ili_flags; /* misc flags */
142 unsigned short ili_logged; /* flushed logged data */
143 unsigned int ili_last_fields; /* fields when flushed */
144 struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged
145 data exts */
146 struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged
147 attr exts */
148 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
149
150#ifdef DEBUG
151 uint64_t ili_push_owner; /* one who sets pushbuf_flag
152 above gets to push the buf */
153#endif
154#ifdef XFS_TRANS_DEBUG
155 int ili_root_size;
156 char *ili_orig_root;
157#endif
158 xfs_inode_log_format_t ili_format; /* logged structure */
159} xfs_inode_log_item_t;
160
161
162#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FDATA)
163int xfs_ilog_fdata(int w);
164#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
165#else
166#define XFS_ILOG_FDATA(w) \
167 ((w) == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA)
168#endif
169
170#endif /* __KERNEL__ */
171
172#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FBROOT)
173int xfs_ilog_fbroot(int w);
174#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
175#else
176#define XFS_ILOG_FBROOT(w) \
177 ((w) == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT)
178#endif
179#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FEXT)
180int xfs_ilog_fext(int w);
181#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
182#else
183#define XFS_ILOG_FEXT(w) \
184 ((w) == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT)
185#endif
186
187#ifdef __KERNEL__
188
189void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
190void xfs_inode_item_destroy(struct xfs_inode *);
191void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
192void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *);
193void xfs_iflush_abort(struct xfs_inode *);
194
195#endif /* __KERNEL__ */
196
197#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
new file mode 100644
index 000000000000..a3af2d5a6eb7
--- /dev/null
+++ b/fs/xfs/xfs_inum.h
@@ -0,0 +1,173 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_INUM_H__
33#define __XFS_INUM_H__
34
35/*
36 * Inode number format:
37 * low inopblog bits - offset in block
38 * next agblklog bits - block number in ag
39 * next agno_log bits - ag number
40 * remaining high bits - zero
41 */
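/*
 * A worked example with hypothetical geometry: sb_inopblog = 5
 * (32 inodes per block) and sb_agblklog = 16 give
 *
 *	ino 0x4001e3 = (agno 2 << 21) | (agbno 0xf << 5) | offset 3
 *
 * which is exactly what the XFS_INO_TO_AGNO/AGBNO/OFFSET macros
 * below pick back apart.
 */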
42
43typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */
44
45/*
46 * Useful inode bits for this kernel.
47 * Used in some places where having 64-bits in the 32-bit kernels
48 * costs too much.
49 */
50#if XFS_BIG_INUMS
51typedef xfs_ino_t xfs_intino_t;
52#else
53typedef __uint32_t xfs_intino_t;
54#endif
55
56#define NULLFSINO ((xfs_ino_t)-1)
57#define NULLAGINO ((xfs_agino_t)-1)
58
59struct xfs_mount;
60
61#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_MASK)
62__uint32_t xfs_ino_mask(int k);
63#define XFS_INO_MASK(k) xfs_ino_mask(k)
64#else
65#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1))
66#endif
67#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_OFFSET_BITS)
68int xfs_ino_offset_bits(struct xfs_mount *mp);
69#define XFS_INO_OFFSET_BITS(mp) xfs_ino_offset_bits(mp)
70#else
71#define XFS_INO_OFFSET_BITS(mp) ((mp)->m_sb.sb_inopblog)
72#endif
73#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGBNO_BITS)
74int xfs_ino_agbno_bits(struct xfs_mount *mp);
75#define XFS_INO_AGBNO_BITS(mp) xfs_ino_agbno_bits(mp)
76#else
77#define XFS_INO_AGBNO_BITS(mp) ((mp)->m_sb.sb_agblklog)
78#endif
79#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGINO_BITS)
80int xfs_ino_agino_bits(struct xfs_mount *mp);
81#define XFS_INO_AGINO_BITS(mp) xfs_ino_agino_bits(mp)
82#else
83#define XFS_INO_AGINO_BITS(mp) ((mp)->m_agino_log)
84#endif
85#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGNO_BITS)
86int xfs_ino_agno_bits(struct xfs_mount *mp);
87#define XFS_INO_AGNO_BITS(mp) xfs_ino_agno_bits(mp)
88#else
89#define XFS_INO_AGNO_BITS(mp) ((mp)->m_agno_log)
90#endif
91#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_BITS)
92int xfs_ino_bits(struct xfs_mount *mp);
93#define XFS_INO_BITS(mp) xfs_ino_bits(mp)
94#else
95#define XFS_INO_BITS(mp) (XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))
96#endif
97
98#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGNO)
99xfs_agnumber_t xfs_ino_to_agno(struct xfs_mount *mp, xfs_ino_t i);
100#define XFS_INO_TO_AGNO(mp,i) xfs_ino_to_agno(mp,i)
101#else
102#define XFS_INO_TO_AGNO(mp,i) \
103 ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
104#endif
105#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGINO)
106xfs_agino_t xfs_ino_to_agino(struct xfs_mount *mp, xfs_ino_t i);
107#define XFS_INO_TO_AGINO(mp,i) xfs_ino_to_agino(mp,i)
108#else
109#define XFS_INO_TO_AGINO(mp,i) \
110 ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
111#endif
112#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGBNO)
113xfs_agblock_t xfs_ino_to_agbno(struct xfs_mount *mp, xfs_ino_t i);
114#define XFS_INO_TO_AGBNO(mp,i) xfs_ino_to_agbno(mp,i)
115#else
116#define XFS_INO_TO_AGBNO(mp,i) \
117 (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
118 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
119#endif
120#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_OFFSET)
121int xfs_ino_to_offset(struct xfs_mount *mp, xfs_ino_t i);
122#define XFS_INO_TO_OFFSET(mp,i) xfs_ino_to_offset(mp,i)
123#else
124#define XFS_INO_TO_OFFSET(mp,i) \
125 ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
126#endif
127#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_FSB)
128xfs_fsblock_t xfs_ino_to_fsb(struct xfs_mount *mp, xfs_ino_t i);
129#define XFS_INO_TO_FSB(mp,i) xfs_ino_to_fsb(mp,i)
130#else
131#define XFS_INO_TO_FSB(mp,i) \
132 XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
133#endif
134
135#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_INO)
136xfs_ino_t
137xfs_agino_to_ino(struct xfs_mount *mp, xfs_agnumber_t a, xfs_agino_t i);
138#define XFS_AGINO_TO_INO(mp,a,i) xfs_agino_to_ino(mp,a,i)
139#else
140#define XFS_AGINO_TO_INO(mp,a,i) \
141 (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
142#endif
143#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_AGBNO)
144xfs_agblock_t xfs_agino_to_agbno(struct xfs_mount *mp, xfs_agino_t i);
145#define XFS_AGINO_TO_AGBNO(mp,i) xfs_agino_to_agbno(mp,i)
146#else
147#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp))
148#endif
149#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_OFFSET)
150int xfs_agino_to_offset(struct xfs_mount *mp, xfs_agino_t i);
151#define XFS_AGINO_TO_OFFSET(mp,i) xfs_agino_to_offset(mp,i)
152#else
153#define XFS_AGINO_TO_OFFSET(mp,i) \
154 ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
155#endif
156
157#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_OFFBNO_TO_AGINO)
158xfs_agino_t xfs_offbno_to_agino(struct xfs_mount *mp, xfs_agblock_t b, int o);
159#define XFS_OFFBNO_TO_AGINO(mp,b,o) xfs_offbno_to_agino(mp,b,o)
160#else
161#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
162 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
163#endif
164
165#if XFS_BIG_INUMS
166#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
167#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
168#else
169#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
170#endif
171#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
172
173#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
new file mode 100644
index 000000000000..414ec496845f
--- /dev/null
+++ b/fs/xfs/xfs_iocore.c
@@ -0,0 +1,133 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_error.h"
60#include "xfs_bit.h"
61#include "xfs_rw.h"
62#include "xfs_quota.h"
63#include "xfs_trans_space.h"
64#include "xfs_iomap.h"
65
66
67STATIC xfs_fsize_t
68xfs_size_fn(
69 xfs_inode_t *ip)
70{
71 return (ip->i_d.di_size);
72}
73
74STATIC int
75xfs_ioinit(
76 struct vfs *vfsp,
77 struct xfs_mount_args *mntargs,
78 int flags)
79{
80 return xfs_mountfs(vfsp, XFS_VFSTOM(vfsp), flags);
81}
82
83xfs_ioops_t xfs_iocore_xfs = {
84 .xfs_ioinit = (xfs_ioinit_t) xfs_ioinit,
85 .xfs_bmapi_func = (xfs_bmapi_t) xfs_bmapi,
86 .xfs_bmap_eof_func = (xfs_bmap_eof_t) xfs_bmap_eof,
87 .xfs_iomap_write_direct =
88 (xfs_iomap_write_direct_t) xfs_iomap_write_direct,
89 .xfs_iomap_write_delay =
90 (xfs_iomap_write_delay_t) xfs_iomap_write_delay,
91 .xfs_iomap_write_allocate =
92 (xfs_iomap_write_allocate_t) xfs_iomap_write_allocate,
93 .xfs_iomap_write_unwritten =
94 (xfs_iomap_write_unwritten_t) xfs_iomap_write_unwritten,
95 .xfs_ilock = (xfs_lock_t) xfs_ilock,
96 .xfs_lck_map_shared = (xfs_lck_map_shared_t) xfs_ilock_map_shared,
97 .xfs_ilock_demote = (xfs_lock_demote_t) xfs_ilock_demote,
98 .xfs_ilock_nowait = (xfs_lock_nowait_t) xfs_ilock_nowait,
99 .xfs_unlock = (xfs_unlk_t) xfs_iunlock,
100 .xfs_size_func = (xfs_size_t) xfs_size_fn,
101 .xfs_iodone = (xfs_iodone_t) fs_noerr,
102};
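/*
 * The XFS_BMAPI()/XFS_ILOCK()-style macros used by the iomap code
 * dispatch through this table; a sketch of the presumed expansion
 * (the real definitions live in xfs_mount.h):
 *
 *	#define XFS_BMAPI(mp, trans, io, ...) \
 *		((*(mp)->m_io_ops.xfs_bmapi_func)((trans), (io)->io_obj, ...))
 */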
103
104void
105xfs_iocore_inode_reinit(
106 xfs_inode_t *ip)
107{
108 xfs_iocore_t *io = &ip->i_iocore;
109
110 io->io_flags = 0;
111 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
112 io->io_flags |= XFS_IOCORE_RT;
113 io->io_dmevmask = ip->i_d.di_dmevmask;
114 io->io_dmstate = ip->i_d.di_dmstate;
115}
116
117void
118xfs_iocore_inode_init(
119 xfs_inode_t *ip)
120{
121 xfs_iocore_t *io = &ip->i_iocore;
122 xfs_mount_t *mp = ip->i_mount;
123
124 io->io_mount = mp;
125#ifdef DEBUG
126 io->io_lock = &ip->i_lock;
127 io->io_iolock = &ip->i_iolock;
128#endif
129
130 io->io_obj = (void *)ip;
131
132 xfs_iocore_inode_reinit(ip);
133}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
new file mode 100644
index 000000000000..3826e8f0e28a
--- /dev/null
+++ b/fs/xfs/xfs_iomap.c
@@ -0,0 +1,1000 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34
35#include "xfs_fs.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_alloc.h"
44#include "xfs_dmapi.h"
45#include "xfs_quota.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode.h"
57#include "xfs_bmap.h"
58#include "xfs_bit.h"
59#include "xfs_rtalloc.h"
60#include "xfs_error.h"
61#include "xfs_itable.h"
62#include "xfs_rw.h"
63#include "xfs_acl.h"
64#include "xfs_cap.h"
65#include "xfs_mac.h"
66#include "xfs_attr.h"
67#include "xfs_buf_item.h"
68#include "xfs_trans_space.h"
69#include "xfs_utils.h"
70#include "xfs_iomap.h"
71
72#if defined(XFS_RW_TRACE)
73void
74xfs_iomap_enter_trace(
75 int tag,
76 xfs_iocore_t *io,
77 xfs_off_t offset,
78 ssize_t count)
79{
80 xfs_inode_t *ip = XFS_IO_INODE(io);
81
82 if (!ip->i_rwtrace)
83 return;
84
85 ktrace_enter(ip->i_rwtrace,
86 (void *)((unsigned long)tag),
87 (void *)ip,
88 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
89 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
90 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
91 (void *)((unsigned long)(offset & 0xffffffff)),
92 (void *)((unsigned long)count),
93 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
94 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
95 (void *)NULL,
96 (void *)NULL,
97 (void *)NULL,
98 (void *)NULL,
99 (void *)NULL,
100 (void *)NULL,
101 (void *)NULL);
102}
103
104void
105xfs_iomap_map_trace(
106 int tag,
107 xfs_iocore_t *io,
108 xfs_off_t offset,
109 ssize_t count,
110 xfs_iomap_t *iomapp,
111 xfs_bmbt_irec_t *imapp,
112 int flags)
113{
114 xfs_inode_t *ip = XFS_IO_INODE(io);
115
116 if (!ip->i_rwtrace)
117 return;
118
119 ktrace_enter(ip->i_rwtrace,
120 (void *)((unsigned long)tag),
121 (void *)ip,
122 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
123 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
124 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
125 (void *)((unsigned long)(offset & 0xffffffff)),
126 (void *)((unsigned long)count),
127 (void *)((unsigned long)flags),
128 (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)),
129 (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)),
130 (void *)((unsigned long)(iomapp->iomap_delta)),
131 (void *)((unsigned long)(iomapp->iomap_bsize)),
132 (void *)((unsigned long)(iomapp->iomap_bn)),
133 (void *)(__psint_t)(imapp->br_startoff),
134 (void *)((unsigned long)(imapp->br_blockcount)),
135 (void *)(__psint_t)(imapp->br_startblock));
136}
137#else
138#define xfs_iomap_enter_trace(tag, io, offset, count)
139#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags)
140#endif
141
142#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
143 << mp->m_writeio_log)
144#define XFS_STRAT_WRITE_IMAPS 2
145#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
146
147STATIC int
148xfs_imap_to_bmap(
149 xfs_iocore_t *io,
150 xfs_off_t offset,
151 xfs_bmbt_irec_t *imap,
152 xfs_iomap_t *iomapp,
153 int imaps, /* Number of imap entries */
154 int iomaps, /* Number of iomap entries */
155 int flags)
156{
157 xfs_mount_t *mp;
158 xfs_fsize_t nisize;
159 int pbm;
160 xfs_fsblock_t start_block;
161
162 mp = io->io_mount;
163 nisize = XFS_SIZE(mp, io);
164 if (io->io_new_size > nisize)
165 nisize = io->io_new_size;
166
167 for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) {
168 iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
169 iomapp->iomap_delta = offset - iomapp->iomap_offset;
170 iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
171 iomapp->iomap_flags = flags;
172
173 if (io->io_flags & XFS_IOCORE_RT) {
174 iomapp->iomap_flags |= IOMAP_REALTIME;
175 iomapp->iomap_target = mp->m_rtdev_targp;
176 } else {
177 iomapp->iomap_target = mp->m_ddev_targp;
178 }
179 start_block = imap->br_startblock;
180 if (start_block == HOLESTARTBLOCK) {
181 iomapp->iomap_bn = IOMAP_DADDR_NULL;
182 iomapp->iomap_flags |= IOMAP_HOLE;
183 } else if (start_block == DELAYSTARTBLOCK) {
184 iomapp->iomap_bn = IOMAP_DADDR_NULL;
185 iomapp->iomap_flags |= IOMAP_DELAY;
186 } else {
187 iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block);
188 if (ISUNWRITTEN(imap))
189 iomapp->iomap_flags |= IOMAP_UNWRITTEN;
190 }
191
192 if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) {
193 iomapp->iomap_flags |= IOMAP_EOF;
194 }
195
196 offset += iomapp->iomap_bsize - iomapp->iomap_delta;
197 }
198 return pbm; /* Return the number filled */
199}
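/*
 * For instance (hypothetical numbers): on a 4k-block filesystem, a
 * delayed allocation extent of 8 fsbs at file offset 0 would come
 * back as iomap_offset 0, iomap_bsize 32768, iomap_bn
 * IOMAP_DADDR_NULL, and IOMAP_DELAY set in iomap_flags.
 */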
200
201int
202xfs_iomap(
203 xfs_iocore_t *io,
204 xfs_off_t offset,
205 ssize_t count,
206 int flags,
207 xfs_iomap_t *iomapp,
208 int *niomaps)
209{
210 xfs_mount_t *mp = io->io_mount;
211 xfs_fileoff_t offset_fsb, end_fsb;
212 int error = 0;
213 int lockmode = 0;
214 xfs_bmbt_irec_t imap;
215 int nimaps = 1;
216 int bmapi_flags = 0;
217 int iomap_flags = 0;
218
219 if (XFS_FORCED_SHUTDOWN(mp))
220 return XFS_ERROR(EIO);
221
222 switch (flags &
223 (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE |
224 BMAPI_UNWRITTEN | BMAPI_DEVICE)) {
225 case BMAPI_READ:
226 xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
227 lockmode = XFS_LCK_MAP_SHARED(mp, io);
228 bmapi_flags = XFS_BMAPI_ENTIRE;
229 if (flags & BMAPI_IGNSTATE)
230 bmapi_flags |= XFS_BMAPI_IGSTATE;
231 break;
232 case BMAPI_WRITE:
233 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
234 lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
235 bmapi_flags = 0;
236 XFS_ILOCK(mp, io, lockmode);
237 break;
238 case BMAPI_ALLOCATE:
239 xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count);
240 lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
241 bmapi_flags = XFS_BMAPI_ENTIRE;
242 /* Attempt non-blocking lock */
243 if (flags & BMAPI_TRYLOCK) {
244 if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
245 return XFS_ERROR(EAGAIN);
246 } else {
247 XFS_ILOCK(mp, io, lockmode);
248 }
249 break;
250 case BMAPI_UNWRITTEN:
251 goto phase2;
252 case BMAPI_DEVICE:
253 lockmode = XFS_LCK_MAP_SHARED(mp, io);
254 iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ?
255 mp->m_rtdev_targp : mp->m_ddev_targp;
256 error = 0;
257 *niomaps = 1;
258 goto out;
259 default:
260 BUG();
261 }
262
263 ASSERT(offset <= mp->m_maxioffset);
264 if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
265 count = mp->m_maxioffset - offset;
266 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
267 offset_fsb = XFS_B_TO_FSBT(mp, offset);
268
269 error = XFS_BMAPI(mp, NULL, io, offset_fsb,
270 (xfs_filblks_t)(end_fsb - offset_fsb),
271 bmapi_flags, NULL, 0, &imap,
272 &nimaps, NULL);
273
274 if (error)
275 goto out;
276
277phase2:
278 switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) {
279 case BMAPI_WRITE:
280 /* If we found an extent, return it */
281 if (nimaps && (imap.br_startblock != HOLESTARTBLOCK)) {
282 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
283 offset, count, iomapp, &imap, flags);
284 break;
285 }
286
287 if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) {
288 error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset,
289 count, flags, &imap, &nimaps, nimaps);
290 } else {
291 error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count,
292 flags, &imap, &nimaps);
293 }
294 if (!error) {
295 xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io,
296 offset, count, iomapp, &imap, flags);
297 }
298 iomap_flags = IOMAP_NEW;
299 break;
300 case BMAPI_ALLOCATE:
301 /* If we found an extent, return it */
302 XFS_IUNLOCK(mp, io, lockmode);
303 lockmode = 0;
304
305 if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
306 xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io,
307 offset, count, iomapp, &imap, flags);
308 break;
309 }
310
311 error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, &imap, &nimaps);
312 break;
313 case BMAPI_UNWRITTEN:
314 lockmode = 0;
315 error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count);
316 nimaps = 0;
317 break;
318 }
319
320 if (nimaps) {
321 *niomaps = xfs_imap_to_bmap(io, offset, &imap,
322 iomapp, nimaps, *niomaps, iomap_flags);
323 } else if (niomaps) {
324 *niomaps = 0;
325 }
326
327out:
328 if (lockmode)
329 XFS_IUNLOCK(mp, io, lockmode);
330 return XFS_ERROR(error);
331}
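/*
 * A sketch of a typical read-side call, with hypothetical values:
 *
 *	xfs_iomap_t	iomap;
 *	int		niomap = 1;
 *	int		error;
 *
 *	error = xfs_iomap(&ip->i_iocore, offset, count,
 *			  BMAPI_READ, &iomap, &niomap);
 *
 * On success, niomap says how many mappings were filled in and iomap
 * describes the blocks (or hole/delalloc state) backing the range.
 */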
332
333STATIC int
334xfs_flush_space(
335 xfs_inode_t *ip,
336 int *fsynced,
337 int *ioflags)
338{
339 switch (*fsynced) {
340 case 0:
341 if (ip->i_delayed_blks) {
342 xfs_iunlock(ip, XFS_ILOCK_EXCL);
343 xfs_flush_inode(ip);
344 xfs_ilock(ip, XFS_ILOCK_EXCL);
345 *fsynced = 1;
346 } else {
347 *ioflags |= BMAPI_SYNC;
348 *fsynced = 2;
349 }
350 return 0;
351 case 1:
352 *fsynced = 2;
353 *ioflags |= BMAPI_SYNC;
354 return 0;
355 case 2:
356 xfs_iunlock(ip, XFS_ILOCK_EXCL);
357 xfs_flush_device(ip);
358 xfs_ilock(ip, XFS_ILOCK_EXCL);
359 *fsynced = 3;
360 return 0;
361 }
362 return 1;
363}
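/*
 * Callers loop over this escalation ladder, retrying the allocation
 * after each stage and treating a nonzero return as final; compare
 * the retry loop in xfs_iomap_write_delay() below:
 *
 *	if (xfs_flush_space(ip, &fsynced, &ioflag))
 *		return XFS_ERROR(ENOSPC);
 *	goto retry;
 */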
364
365int
366xfs_iomap_write_direct(
367 xfs_inode_t *ip,
368 loff_t offset,
369 size_t count,
370 int flags,
371 xfs_bmbt_irec_t *ret_imap,
372 int *nmaps,
373 int found)
374{
375 xfs_mount_t *mp = ip->i_mount;
376 xfs_iocore_t *io = &ip->i_iocore;
377 xfs_fileoff_t offset_fsb;
378 xfs_fileoff_t last_fsb;
379 xfs_filblks_t count_fsb;
380 xfs_fsize_t isize;
381 xfs_fsblock_t firstfsb;
382 int nimaps, maps;
383 int error;
384 int bmapi_flag;
385 int rt;
386 xfs_trans_t *tp;
387 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
388 xfs_bmap_free_t free_list;
389 int aeof;
390 xfs_filblks_t datablocks;
391 int committed;
392 int numrtextents;
393 uint resblks;
394
395 /*
396 * Make sure that the dquots are there. This doesn't hold
397 * the ilock across a disk read.
398 */
399 error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
400 if (error)
401 return XFS_ERROR(error);
402
403 maps = min(XFS_WRITE_IMAPS, *nmaps);
404 nimaps = maps;
405
406 isize = ip->i_d.di_size;
407 aeof = (offset + count) > isize;
408
409 if (io->io_new_size > isize)
410 isize = io->io_new_size;
411
412 offset_fsb = XFS_B_TO_FSBT(mp, offset);
413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
414 count_fsb = last_fsb - offset_fsb;
415 if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) {
416 xfs_fileoff_t map_last_fsb;
417
418 map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff;
419
420 if (map_last_fsb < last_fsb) {
421 last_fsb = map_last_fsb;
422 count_fsb = last_fsb - offset_fsb;
423 }
424 ASSERT(count_fsb > 0);
425 }
426
427 /*
 428	 * Determine whether we are reserving space on
 429	 * the data or the realtime partition.
430 */
431 if ((rt = XFS_IS_REALTIME_INODE(ip))) {
432 int sbrtextsize, iprtextsize;
433
434 sbrtextsize = mp->m_sb.sb_rextsize;
435 iprtextsize =
436 ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
437 numrtextents = (count_fsb + iprtextsize - 1);
438 do_div(numrtextents, sbrtextsize);
439 datablocks = 0;
440 } else {
441 datablocks = count_fsb;
442 numrtextents = 0;
443 }
444
445 /*
446 * allocate and setup the transaction
447 */
448 xfs_iunlock(ip, XFS_ILOCK_EXCL);
449 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
450
451 resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
452
453 error = xfs_trans_reserve(tp, resblks,
454 XFS_WRITE_LOG_RES(mp), numrtextents,
455 XFS_TRANS_PERM_LOG_RES,
456 XFS_WRITE_LOG_COUNT);
457
 458	/*
 459	 * Check for running out of space. If the reservation failed,
 460	 * free the transaction structure. Note that we cannot return
 461	 * yet: the inode lock has to be reacquired before we return
 462	 * (see the error handling below).
 463	 */
 464	if (error)
 465		xfs_trans_cancel(tp, 0);
466
467 xfs_ilock(ip, XFS_ILOCK_EXCL);
468
469 if (error)
 470		goto error_out;	/* we couldn't return right after the failed
 471				   reservation; the ilock had to be retaken */
472
473 if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) {
474 error = (EDQUOT);
475 goto error1;
476 }
477 nimaps = 1;
478
479 bmapi_flag = XFS_BMAPI_WRITE;
480 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
481 xfs_trans_ihold(tp, ip);
482
483 if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt))
484 bmapi_flag |= XFS_BMAPI_PREALLOC;
485
486 /*
487 * issue the bmapi() call to allocate the blocks
488 */
489 XFS_BMAP_INIT(&free_list, &firstfsb);
490 imapp = &imap[0];
491 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
492 bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list);
493 if (error) {
494 goto error0;
495 }
496
497 /*
498 * complete the transaction
499 */
500
501 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
502 if (error) {
503 goto error0;
504 }
505
506 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
507 if (error) {
508 goto error_out;
509 }
510
511 /* copy any maps to caller's array and return any error. */
512 if (nimaps == 0) {
513 error = (ENOSPC);
514 goto error_out;
515 }
516
517 *ret_imap = imap[0];
518 *nmaps = 1;
519 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
520 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
521 "start_block : %llx start_off : %llx blkcnt : %llx "
522 "extent-state : %x \n",
523 (ip->i_mount)->m_fsname,
524 (long long)ip->i_ino,
525 ret_imap->br_startblock, ret_imap->br_startoff,
526 ret_imap->br_blockcount,ret_imap->br_state);
527 }
528 return 0;
529
530 error0: /* Cancel bmap, unlock inode, and cancel trans */
531 xfs_bmap_cancel(&free_list);
532
533 error1: /* Just cancel transaction */
534 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
535 *nmaps = 0; /* nothing set-up here */
536
537error_out:
538 return XFS_ERROR(error);
539}
540
541int
542xfs_iomap_write_delay(
543 xfs_inode_t *ip,
544 loff_t offset,
545 size_t count,
546 int ioflag,
547 xfs_bmbt_irec_t *ret_imap,
548 int *nmaps)
549{
550 xfs_mount_t *mp = ip->i_mount;
551 xfs_iocore_t *io = &ip->i_iocore;
552 xfs_fileoff_t offset_fsb;
553 xfs_fileoff_t last_fsb;
554 xfs_fsize_t isize;
555 xfs_fsblock_t firstblock;
556 int nimaps;
557 int error;
558 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
559 int aeof;
560 int fsynced = 0;
561
562 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
563
564 /*
565 * Make sure that the dquots are there. This doesn't hold
566 * the ilock across a disk read.
567 */
568
569 error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
570 if (error)
571 return XFS_ERROR(error);
572
573retry:
574 isize = ip->i_d.di_size;
575 if (io->io_new_size > isize) {
576 isize = io->io_new_size;
577 }
578
579 aeof = 0;
580 offset_fsb = XFS_B_TO_FSBT(mp, offset);
581 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
582 /*
583 * If the caller is doing a write at the end of the file,
584 * then extend the allocation (and the buffer used for the write)
585 * out to the file system's write iosize. We clean up any extra
586 * space left over when the file is closed in xfs_inactive().
587 *
588 * For sync writes, we are flushing delayed allocate space to
589 * try to make additional space available for allocation near
590 * the filesystem full boundary - preallocation hurts in that
591 * situation, of course.
592 */
593 if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) {
594 xfs_off_t aligned_offset;
595 xfs_filblks_t count_fsb;
596 unsigned int iosize;
597 xfs_fileoff_t ioalign;
598 int n;
599 xfs_fileoff_t start_fsb;
600
601 /*
602 * If there are any real blocks past eof, then don't
603 * do any speculative allocation.
604 */
605 start_fsb = XFS_B_TO_FSBT(mp,
606 ((xfs_ufsize_t)(offset + count - 1)));
607 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
608 while (count_fsb > 0) {
609 nimaps = XFS_WRITE_IMAPS;
610 error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
611 0, &firstblock, 0, imap, &nimaps, NULL);
612 if (error) {
613 return error;
614 }
615 for (n = 0; n < nimaps; n++) {
616 if ( !(io->io_flags & XFS_IOCORE_RT) &&
617 !imap[n].br_startblock) {
618 cmn_err(CE_PANIC,"Access to block "
619 "zero: fs <%s> inode: %lld "
620 "start_block : %llx start_off "
621 ": %llx blkcnt : %llx "
622 "extent-state : %x \n",
623 (ip->i_mount)->m_fsname,
624 (long long)ip->i_ino,
625 imap[n].br_startblock,
626 imap[n].br_startoff,
627 imap[n].br_blockcount,
628 imap[n].br_state);
629 }
630 if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
631 (imap[n].br_startblock != DELAYSTARTBLOCK)) {
632 goto write_map;
633 }
634 start_fsb += imap[n].br_blockcount;
635 count_fsb -= imap[n].br_blockcount;
636 }
637 }
638 iosize = mp->m_writeio_blocks;
639 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
640 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
641 last_fsb = ioalign + iosize;
642 aeof = 1;
643 }
644write_map:
645 nimaps = XFS_WRITE_IMAPS;
646 firstblock = NULLFSBLOCK;
647
648 /*
 649	 * If mounted with the "-o swalloc" option, round up the allocation
650 * request to a stripe width boundary if the file size is >=
651 * stripe width and we are allocating past the allocation eof.
652 */
653 if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth
654 && (mp->m_flags & XFS_MOUNT_SWALLOC)
655 && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) {
656 int eof;
657 xfs_fileoff_t new_last_fsb;
658
659 new_last_fsb = roundup_64(last_fsb, mp->m_swidth);
660 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
661 if (error) {
662 return error;
663 }
664 if (eof) {
665 last_fsb = new_last_fsb;
666 }
667 /*
668 * Roundup the allocation request to a stripe unit (m_dalign) boundary
669 * if the file size is >= stripe unit size, and we are allocating past
670 * the allocation eof.
671 */
672 } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign &&
673 (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) {
674 int eof;
675 xfs_fileoff_t new_last_fsb;
676 new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
677 error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
678 if (error) {
679 return error;
680 }
681 if (eof) {
682 last_fsb = new_last_fsb;
683 }
684 /*
685 * Round up the allocation request to a real-time extent boundary
686 * if the file is on the real-time subvolume.
687 */
688 } else if (io->io_flags & XFS_IOCORE_RT && aeof) {
689 int eof;
690 xfs_fileoff_t new_last_fsb;
691
692 new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize);
693 error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
694 if (error) {
695 return error;
696 }
697 if (eof)
698 last_fsb = new_last_fsb;
699 }
700 error = xfs_bmapi(NULL, ip, offset_fsb,
701 (xfs_filblks_t)(last_fsb - offset_fsb),
702 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
703 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
704 &nimaps, NULL);
705 /*
706 * This can be EDQUOT, if nimaps == 0
707 */
708 if (error && (error != ENOSPC)) {
709 return XFS_ERROR(error);
710 }
711 /*
712	 * If bmapi returned no mappings and we didn't get back EDQUOT,
713 * then we must have run out of space.
714 */
715 if (nimaps == 0) {
716 xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
717 io, offset, count);
718 if (xfs_flush_space(ip, &fsynced, &ioflag))
719 return XFS_ERROR(ENOSPC);
720
721 error = 0;
722 goto retry;
723 }
724
725 *ret_imap = imap[0];
726 *nmaps = 1;
727 if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) {
728 cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld "
729 "start_block : %llx start_off : %llx blkcnt : %llx "
730 "extent-state : %x \n",
731 (ip->i_mount)->m_fsname,
732 (long long)ip->i_ino,
733 ret_imap->br_startblock, ret_imap->br_startoff,
734 ret_imap->br_blockcount,ret_imap->br_state);
735 }
736 return 0;
737}
738
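/*
 * Editorial sketch (not part of the original source): the speculative
 * preallocation sizing above reduces to rounding the last byte of the
 * write down to a write-iosize boundary and then extending the
 * allocation one full write iosize beyond it. The helper below merely
 * restates that arithmetic; the function name is hypothetical.
 */
static inline xfs_fileoff_t
example_speculative_last_fsb(
	xfs_mount_t	*mp,
	xfs_off_t	offset,
	size_t		count)
{
	/* round the last byte written down to a write-iosize boundary */
	xfs_off_t	aligned_offset =
			XFS_WRITEIO_ALIGN(mp, (offset + count - 1));

	/* ... then allocate out to one full write iosize past it */
	return XFS_B_TO_FSBT(mp, aligned_offset) + mp->m_writeio_blocks;
}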
739/*
740 * Pass in a delayed allocate extent and convert it to real extents;
741 * return to the caller the extent we create, which maps on top of
742 * the originating caller's request.
743 *
744 * Called without a lock on the inode.
745 */
746int
747xfs_iomap_write_allocate(
748 xfs_inode_t *ip,
749 xfs_bmbt_irec_t *map,
750 int *retmap)
751{
752 xfs_mount_t *mp = ip->i_mount;
753 xfs_iocore_t *io = &ip->i_iocore;
754 xfs_fileoff_t offset_fsb, last_block;
755 xfs_fileoff_t end_fsb, map_start_fsb;
756 xfs_fsblock_t first_block;
757 xfs_bmap_free_t free_list;
758 xfs_filblks_t count_fsb;
759 xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS];
760 xfs_trans_t *tp;
761 int i, nimaps, committed;
762 int error = 0;
763 int nres;
764
765 *retmap = 0;
766
767 /*
768 * Make sure that the dquots are there.
769 */
770 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
771 return XFS_ERROR(error);
772
773 offset_fsb = map->br_startoff;
774 count_fsb = map->br_blockcount;
775 map_start_fsb = offset_fsb;
776
777 XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
778
779 while (count_fsb != 0) {
780 /*
781 * Set up a transaction with which to allocate the
782 * backing store for the file. Do allocations in a
783 * loop until we get some space in the range we are
784 * interested in. The other space that might be allocated
785 * is in the delayed allocation extent on which we sit
786 * but before our buffer starts.
787 */
788
789 nimaps = 0;
790 while (nimaps == 0) {
791 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
792 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
793 error = xfs_trans_reserve(tp, nres,
794 XFS_WRITE_LOG_RES(mp),
795 0, XFS_TRANS_PERM_LOG_RES,
796 XFS_WRITE_LOG_COUNT);
797 if (error == ENOSPC) {
798 error = xfs_trans_reserve(tp, 0,
799 XFS_WRITE_LOG_RES(mp),
800 0,
801 XFS_TRANS_PERM_LOG_RES,
802 XFS_WRITE_LOG_COUNT);
803 }
804 if (error) {
805 xfs_trans_cancel(tp, 0);
806 return XFS_ERROR(error);
807 }
808 xfs_ilock(ip, XFS_ILOCK_EXCL);
809 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
810 xfs_trans_ihold(tp, ip);
811
812 XFS_BMAP_INIT(&free_list, &first_block);
813
814 nimaps = XFS_STRAT_WRITE_IMAPS;
815 /*
816 * Ensure we don't go beyond eof - it is possible
817 * the extents changed since we did the read call,
818			 * as we dropped the ilock in the interim.
819 */
820
821 end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size);
822 xfs_bmap_last_offset(NULL, ip, &last_block,
823 XFS_DATA_FORK);
824 last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
825 if ((map_start_fsb + count_fsb) > last_block) {
826 count_fsb = last_block - map_start_fsb;
827 if (count_fsb == 0) {
828 error = EAGAIN;
829 goto trans_cancel;
830 }
831 }
832
833 /* Go get the actual blocks */
834 error = xfs_bmapi(tp, ip, map_start_fsb, count_fsb,
835 XFS_BMAPI_WRITE, &first_block, 1,
836 imap, &nimaps, &free_list);
837 if (error)
838 goto trans_cancel;
839
840 error = xfs_bmap_finish(&tp, &free_list,
841 first_block, &committed);
842 if (error)
843 goto trans_cancel;
844
845 error = xfs_trans_commit(tp,
846 XFS_TRANS_RELEASE_LOG_RES, NULL);
847 if (error)
848 goto error0;
849
850 xfs_iunlock(ip, XFS_ILOCK_EXCL);
851 }
852
853 /*
854 * See if we were able to allocate an extent that
855		 * covers at least part of the caller's request.
856 */
857
858 for (i = 0; i < nimaps; i++) {
859 if ( !(io->io_flags & XFS_IOCORE_RT) &&
860 !imap[i].br_startblock) {
861 cmn_err(CE_PANIC,"Access to block zero: "
862 "fs <%s> inode: %lld "
863 "start_block : %llx start_off : %llx "
864 "blkcnt : %llx extent-state : %x \n",
865 (ip->i_mount)->m_fsname,
866 (long long)ip->i_ino,
867 imap[i].br_startblock,
868 imap[i].br_startoff,
869 imap[i].br_blockcount,imap[i].br_state);
870 }
871 if ((map->br_startoff >= imap[i].br_startoff) &&
872 (map->br_startoff < (imap[i].br_startoff +
873 imap[i].br_blockcount))) {
874 *map = imap[i];
875 *retmap = 1;
876 XFS_STATS_INC(xs_xstrat_quick);
877 return 0;
878 }
879 count_fsb -= imap[i].br_blockcount;
880 }
881
882 /* So far we have not mapped the requested part of the
883		 * file, just surrounding data; try again.
884 */
885 nimaps--;
886 offset_fsb = imap[nimaps].br_startoff +
887 imap[nimaps].br_blockcount;
888 map_start_fsb = offset_fsb;
889 }
890
891trans_cancel:
892 xfs_bmap_cancel(&free_list);
893 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
894error0:
895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
896 return XFS_ERROR(error);
897}
898
899int
900xfs_iomap_write_unwritten(
901 xfs_inode_t *ip,
902 loff_t offset,
903 size_t count)
904{
905 xfs_mount_t *mp = ip->i_mount;
906 xfs_iocore_t *io = &ip->i_iocore;
907 xfs_trans_t *tp;
908 xfs_fileoff_t offset_fsb;
909 xfs_filblks_t count_fsb;
910 xfs_filblks_t numblks_fsb;
911 xfs_bmbt_irec_t imap;
912 int committed;
913 int error;
914 int nres;
915 int nimaps;
916 xfs_fsblock_t firstfsb;
917 xfs_bmap_free_t free_list;
918
919 xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN,
920 &ip->i_iocore, offset, count);
921
922 offset_fsb = XFS_B_TO_FSBT(mp, offset);
923 count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
924 count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
925
926 do {
927 nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
928
929 /*
930		 * Set up a transaction to convert the range of extents
931 * from unwritten to real. Do allocations in a loop until
932 * we have covered the range passed in.
933 */
934
935 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
936 error = xfs_trans_reserve(tp, nres,
937 XFS_WRITE_LOG_RES(mp), 0,
938 XFS_TRANS_PERM_LOG_RES,
939 XFS_WRITE_LOG_COUNT);
940 if (error) {
941 xfs_trans_cancel(tp, 0);
942 goto error0;
943 }
944
945 xfs_ilock(ip, XFS_ILOCK_EXCL);
946 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
947 xfs_trans_ihold(tp, ip);
948
949 /*
950 * Modify the unwritten extent state of the buffer.
951 */
952 XFS_BMAP_INIT(&free_list, &firstfsb);
953 nimaps = 1;
954 error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
955 XFS_BMAPI_WRITE, &firstfsb,
956 1, &imap, &nimaps, &free_list);
957 if (error)
958 goto error_on_bmapi_transaction;
959
960		error = xfs_bmap_finish(&tp, &free_list,
961 firstfsb, &committed);
962 if (error)
963 goto error_on_bmapi_transaction;
964
965 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
966 xfs_iunlock(ip, XFS_ILOCK_EXCL);
967 if (error)
968 goto error0;
969
970 if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) {
971 cmn_err(CE_PANIC,"Access to block zero: fs <%s> "
972 "inode: %lld start_block : %llx start_off : "
973 "%llx blkcnt : %llx extent-state : %x \n",
974 (ip->i_mount)->m_fsname,
975 (long long)ip->i_ino,
976 imap.br_startblock,imap.br_startoff,
977 imap.br_blockcount,imap.br_state);
978 }
979
980 if ((numblks_fsb = imap.br_blockcount) == 0) {
981 /*
982 * The numblks_fsb value should always get
983 * smaller, otherwise the loop is stuck.
984 */
985 ASSERT(imap.br_blockcount);
986 break;
987 }
988 offset_fsb += numblks_fsb;
989 count_fsb -= numblks_fsb;
990 } while (count_fsb > 0);
991
992 return 0;
993
994error_on_bmapi_transaction:
995 xfs_bmap_cancel(&free_list);
996 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
997 xfs_iunlock(ip, XFS_ILOCK_EXCL);
998error0:
999 return XFS_ERROR(error);
1000}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
new file mode 100644
index 000000000000..31c91087cb33
--- /dev/null
+++ b/fs/xfs/xfs_iomap.h
@@ -0,0 +1,107 @@
1/*
2 * Copyright (c) 2003,2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33
34
35#ifndef __XFS_IOMAP_H__
36#define __XFS_IOMAP_H__
37
38#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL))
39
40
41typedef enum { /* iomap_flags values */
42 IOMAP_EOF = 0x01, /* mapping contains EOF */
43 IOMAP_HOLE = 0x02, /* mapping covers a hole */
44 IOMAP_DELAY = 0x04, /* mapping covers delalloc region */
45 IOMAP_REALTIME = 0x10, /* mapping on the realtime device */
46 IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */
47 /* but uninitialized file data */
48 IOMAP_NEW = 0x40 /* just allocate */
49} iomap_flags_t;
50
51typedef enum {
52 /* base extent manipulation calls */
53 BMAPI_READ = (1 << 0), /* read extents */
54 BMAPI_WRITE = (1 << 1), /* create extents */
55 BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */
56 BMAPI_UNWRITTEN = (1 << 3), /* unwritten extents to real extents */
57 /* modifiers */
58 BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */
59 BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */
60 BMAPI_MMAP = (1 << 6), /* allocate for mmap write */
61 BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */
62 BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */
63 BMAPI_DEVICE = (1 << 9), /* we only want to know the device */
64} bmapi_flags_t;
65
66
67/*
68 * xfs_iomap_t: File system I/O map
69 *
70 * The iomap_bn field is expressed in 512-byte blocks, and is where the
71 * mapping starts on disk.
72 *
73 * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes.
74 * iomap_offset is the offset of the mapping in the file itself.
75 * iomap_bsize is the size of the mapping, iomap_delta is the
76 * desired data's offset into the mapping, given the offset supplied
77 * to the file I/O map routine.
78 *
79 * When a request is made to read beyond the logical end of the object,
80 * the size of the mapping may be set to 0, but the offset and length fields
81 * should still reflect the actual amount of underlying storage allocated, if any.
82 */
83
84typedef struct xfs_iomap {
85 xfs_daddr_t iomap_bn; /* first 512b blk of mapping */
86 xfs_buftarg_t *iomap_target;
87 loff_t iomap_offset; /* offset of mapping, bytes */
88 loff_t iomap_bsize; /* size of mapping, bytes */
89 size_t iomap_delta; /* offset into mapping, bytes */
90 iomap_flags_t iomap_flags;
91} xfs_iomap_t;
92
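/*
 * Editorial sketch (not part of the original source): how a consumer
 * of xfs_iomap_t typically locates the requested data on disk. Since
 * iomap_bn counts 512-byte basic blocks and iomap_delta is a byte
 * offset into the mapping, the data's disk address is iomap_bn plus
 * the whole basic blocks of iomap_delta; any sub-block remainder is
 * handled by the caller. The helper name is hypothetical, and BBSHIFT
 * (the log2 of the 512-byte basic block size, 9) is assumed to be
 * visible from the XFS headers.
 */
static inline xfs_daddr_t
example_iomap_data_daddr(xfs_iomap_t *iomap)
{
	/* drop the sub-block remainder, keep whole basic blocks */
	return iomap->iomap_bn + (iomap->iomap_delta >> BBSHIFT);
}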
93struct xfs_iocore;
94struct xfs_inode;
95struct xfs_bmbt_irec;
96
97extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int,
98 struct xfs_iomap *, int *);
99extern int xfs_iomap_write_direct(struct xfs_inode *, loff_t, size_t,
100 int, struct xfs_bmbt_irec *, int *, int);
101extern int xfs_iomap_write_delay(struct xfs_inode *, loff_t, size_t, int,
102 struct xfs_bmbt_irec *, int *);
103extern int xfs_iomap_write_allocate(struct xfs_inode *,
104 struct xfs_bmbt_irec *, int *);
105extern int xfs_iomap_write_unwritten(struct xfs_inode *, loff_t, size_t);
106
107#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
new file mode 100644
index 000000000000..8fbc8d378188
--- /dev/null
+++ b/fs/xfs/xfs_itable.c
@@ -0,0 +1,858 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_ag.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_attr_sf.h"
50#include "xfs_dir_sf.h"
51#include "xfs_dir2_sf.h"
52#include "xfs_dinode.h"
53#include "xfs_inode.h"
54#include "xfs_ialloc.h"
55#include "xfs_itable.h"
56#include "xfs_error.h"
57
58#ifndef HAVE_USERACC
59#define useracc(ubuffer, size, flags, foo) (0)
60#define unuseracc(ubuffer, size, flags)
61#endif
62
63STATIC int
64xfs_bulkstat_one_iget(
65 xfs_mount_t *mp, /* mount point for filesystem */
66 xfs_ino_t ino, /* inode number to get data for */
67 xfs_daddr_t bno, /* starting bno of inode cluster */
68 xfs_bstat_t *buf, /* return buffer */
69 int *stat) /* BULKSTAT_RV_... */
70{
71 xfs_dinode_core_t *dic; /* dinode core info pointer */
72 xfs_inode_t *ip; /* incore inode pointer */
73 int error;
74
75 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
76 if (error) {
77 *stat = BULKSTAT_RV_NOTHING;
78 return error;
79 }
80
81 ASSERT(ip != NULL);
82 ASSERT(ip->i_blkno != (xfs_daddr_t)0);
83 if (ip->i_d.di_mode == 0) {
84 *stat = BULKSTAT_RV_NOTHING;
85 error = XFS_ERROR(ENOENT);
86 goto out_iput;
87 }
88
89 dic = &ip->i_d;
90
91 /* xfs_iget returns the following without needing
92 * further change.
93 */
94 buf->bs_nlink = dic->di_nlink;
95 buf->bs_projid = dic->di_projid;
96 buf->bs_ino = ino;
97 buf->bs_mode = dic->di_mode;
98 buf->bs_uid = dic->di_uid;
99 buf->bs_gid = dic->di_gid;
100 buf->bs_size = dic->di_size;
101 buf->bs_atime.tv_sec = dic->di_atime.t_sec;
102 buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
103 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
104 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
105 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
106 buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
107 buf->bs_xflags = xfs_ip2xflags(ip);
108 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
109 buf->bs_extents = dic->di_nextents;
110 buf->bs_gen = dic->di_gen;
111 memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
112 buf->bs_dmevmask = dic->di_dmevmask;
113 buf->bs_dmstate = dic->di_dmstate;
114 buf->bs_aextents = dic->di_anextents;
115
116 switch (dic->di_format) {
117 case XFS_DINODE_FMT_DEV:
118 buf->bs_rdev = ip->i_df.if_u2.if_rdev;
119 buf->bs_blksize = BLKDEV_IOSIZE;
120 buf->bs_blocks = 0;
121 break;
122 case XFS_DINODE_FMT_LOCAL:
123 case XFS_DINODE_FMT_UUID:
124 buf->bs_rdev = 0;
125 buf->bs_blksize = mp->m_sb.sb_blocksize;
126 buf->bs_blocks = 0;
127 break;
128 case XFS_DINODE_FMT_EXTENTS:
129 case XFS_DINODE_FMT_BTREE:
130 buf->bs_rdev = 0;
131 buf->bs_blksize = mp->m_sb.sb_blocksize;
132 buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks;
133 break;
134 }
135
136 out_iput:
137 xfs_iput(ip, XFS_ILOCK_SHARED);
138 return error;
139}
140
141STATIC int
142xfs_bulkstat_one_dinode(
143 xfs_mount_t *mp, /* mount point for filesystem */
144 xfs_ino_t ino, /* inode number to get data for */
145 xfs_dinode_t *dip, /* dinode inode pointer */
146 xfs_bstat_t *buf) /* return buffer */
147{
148 xfs_dinode_core_t *dic; /* dinode core info pointer */
149
150 dic = &dip->di_core;
151
152 /*
153 * The inode format changed when we moved the link count and
154 * made it 32 bits long. If this is an old format inode,
155 * convert it in memory to look like a new one. If it gets
156 * flushed to disk we will convert back before flushing or
157 * logging it. We zero out the new projid field and the old link
158 * count field. We'll handle clearing the pad field (the remains
159 * of the old uuid field) when we actually convert the inode to
160 * the new format. We don't change the version number so that we
161 * can distinguish this from a real new format inode.
162 */
163 if (INT_GET(dic->di_version, ARCH_CONVERT) == XFS_DINODE_VERSION_1) {
164 buf->bs_nlink = INT_GET(dic->di_onlink, ARCH_CONVERT);
165 buf->bs_projid = 0;
166 } else {
167 buf->bs_nlink = INT_GET(dic->di_nlink, ARCH_CONVERT);
168 buf->bs_projid = INT_GET(dic->di_projid, ARCH_CONVERT);
169 }
170
171 buf->bs_ino = ino;
172 buf->bs_mode = INT_GET(dic->di_mode, ARCH_CONVERT);
173 buf->bs_uid = INT_GET(dic->di_uid, ARCH_CONVERT);
174 buf->bs_gid = INT_GET(dic->di_gid, ARCH_CONVERT);
175 buf->bs_size = INT_GET(dic->di_size, ARCH_CONVERT);
176 buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, ARCH_CONVERT);
177 buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, ARCH_CONVERT);
178 buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, ARCH_CONVERT);
179 buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, ARCH_CONVERT);
180 buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, ARCH_CONVERT);
181 buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, ARCH_CONVERT);
182 buf->bs_xflags = xfs_dic2xflags(dic);
183 buf->bs_extsize = INT_GET(dic->di_extsize, ARCH_CONVERT) << mp->m_sb.sb_blocklog;
184 buf->bs_extents = INT_GET(dic->di_nextents, ARCH_CONVERT);
185 buf->bs_gen = INT_GET(dic->di_gen, ARCH_CONVERT);
186 memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
187 buf->bs_dmevmask = INT_GET(dic->di_dmevmask, ARCH_CONVERT);
188 buf->bs_dmstate = INT_GET(dic->di_dmstate, ARCH_CONVERT);
189 buf->bs_aextents = INT_GET(dic->di_anextents, ARCH_CONVERT);
190
191 switch (INT_GET(dic->di_format, ARCH_CONVERT)) {
192 case XFS_DINODE_FMT_DEV:
193 buf->bs_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
194 buf->bs_blksize = BLKDEV_IOSIZE;
195 buf->bs_blocks = 0;
196 break;
197 case XFS_DINODE_FMT_LOCAL:
198 case XFS_DINODE_FMT_UUID:
199 buf->bs_rdev = 0;
200 buf->bs_blksize = mp->m_sb.sb_blocksize;
201 buf->bs_blocks = 0;
202 break;
203 case XFS_DINODE_FMT_EXTENTS:
204 case XFS_DINODE_FMT_BTREE:
205 buf->bs_rdev = 0;
206 buf->bs_blksize = mp->m_sb.sb_blocksize;
207 buf->bs_blocks = INT_GET(dic->di_nblocks, ARCH_CONVERT);
208 break;
209 }
210
211 return 0;
212}
213
214/*
215 * Return stat information for one inode.
216 * Return 0 if ok, else errno.
217 */
218int /* error status */
219xfs_bulkstat_one(
220 xfs_mount_t *mp, /* mount point for filesystem */
221 xfs_ino_t ino, /* inode number to get data for */
222 void __user *buffer, /* buffer to place output in */
223 int ubsize, /* size of buffer */
224 void *private_data, /* my private data */
225 xfs_daddr_t bno, /* starting bno of inode cluster */
226 int *ubused, /* bytes used by me */
227 void *dibuff, /* on-disk inode buffer */
228 int *stat) /* BULKSTAT_RV_... */
229{
230 xfs_bstat_t *buf; /* return buffer */
231 int error = 0; /* error value */
232 xfs_dinode_t *dip; /* dinode inode pointer */
233
234 dip = (xfs_dinode_t *)dibuff;
235
236 if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
237 (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
238 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
239 *stat = BULKSTAT_RV_NOTHING;
240 return XFS_ERROR(EINVAL);
241 }
242 if (ubsize < sizeof(*buf)) {
243 *stat = BULKSTAT_RV_NOTHING;
244 return XFS_ERROR(ENOMEM);
245 }
246
247 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
248
249 if (dip == NULL) {
250 /* We're not being passed a pointer to a dinode. This happens
251 * if BULKSTAT_FG_IGET is selected. Do the iget.
252 */
253 error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat);
254 if (error)
255 goto out_free;
256 } else {
257 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
258 }
259
260 if (copy_to_user(buffer, buf, sizeof(*buf))) {
261 *stat = BULKSTAT_RV_NOTHING;
262 error = EFAULT;
263 goto out_free;
264 }
265
266 *stat = BULKSTAT_RV_DIDONE;
267 if (ubused)
268 *ubused = sizeof(*buf);
269
270 out_free:
271 kmem_free(buf, sizeof(*buf));
272 return error;
273}
274
275/*
276 * Return stat information in bulk (by-inode) for the filesystem.
277 */
278int /* error status */
279xfs_bulkstat(
280 xfs_mount_t *mp, /* mount point for filesystem */
281 xfs_ino_t *lastinop, /* last inode returned */
282 int *ubcountp, /* size of buffer/count returned */
283 bulkstat_one_pf formatter, /* func that'd fill a single buf */
284 void *private_data,/* private data for formatter */
285 size_t statstruct_size, /* sizeof struct filling */
286 char __user *ubuffer, /* buffer with inode stats */
287 int flags, /* defined in xfs_itable.h */
288	int		*done)		/* set to 1 when there are no more stats */
289{
290 xfs_agblock_t agbno=0;/* allocation group block number */
291 xfs_buf_t *agbp; /* agi header buffer */
292 xfs_agi_t *agi; /* agi header data */
293 xfs_agino_t agino; /* inode # in allocation group */
294 xfs_agnumber_t agno; /* allocation group number */
295 xfs_daddr_t bno; /* inode cluster start daddr */
296 int chunkidx; /* current index into inode chunk */
297 int clustidx; /* current index into inode cluster */
298 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
299 int end_of_ag; /* set if we've seen the ag end */
300 int error; /* error code */
301 int fmterror;/* bulkstat formatter result */
302 __int32_t gcnt; /* current btree rec's count */
303 xfs_inofree_t gfree; /* current btree rec's free mask */
304 xfs_agino_t gino; /* current btree rec's start inode */
305 int i; /* loop index */
306 int icount; /* count of inodes good in irbuf */
307 xfs_ino_t ino; /* inode number (filesystem) */
308 xfs_inobt_rec_t *irbp; /* current irec buffer pointer */
309 xfs_inobt_rec_t *irbuf; /* start of irec buffer */
310 xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */
311 xfs_ino_t lastino=0; /* last inode number returned */
312 int nbcluster; /* # of blocks in a cluster */
313 int nicluster; /* # of inodes in a cluster */
314 int nimask; /* mask for inode clusters */
315 int nirbuf; /* size of irbuf */
316 int rval; /* return value error code */
317 int tmp; /* result value from btree calls */
318 int ubcount; /* size of user's buffer */
319 int ubleft; /* bytes left in user's buffer */
320 char __user *ubufp; /* pointer into user's buffer */
321 int ubelem; /* spaces used in user's buffer */
322 int ubused; /* bytes used by formatter */
323 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
324 xfs_dinode_t *dip; /* ptr into bp for specific inode */
325 xfs_inode_t *ip; /* ptr to in-core inode struct */
326
327 /*
328	 * Get the last inode value and check whether there is anything to do.
329 */
330 ino = (xfs_ino_t)*lastinop;
331 dip = NULL;
332 agno = XFS_INO_TO_AGNO(mp, ino);
333 agino = XFS_INO_TO_AGINO(mp, ino);
334 if (agno >= mp->m_sb.sb_agcount ||
335 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
336 *done = 1;
337 *ubcountp = 0;
338 return 0;
339 }
340 ubcount = *ubcountp; /* statstruct's */
341 ubleft = ubcount * statstruct_size; /* bytes */
342 *ubcountp = ubelem = 0;
343 *done = 0;
344 fmterror = 0;
345 ubufp = ubuffer;
346 nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
347 mp->m_sb.sb_inopblock :
348 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
349 nimask = ~(nicluster - 1);
350 nbcluster = nicluster >> mp->m_sb.sb_inopblog;
351 /*
352	 * Lock down the user's buffer. If a buffer was not sent, as when the
353	 * disk quota code calls in here, we skip this.
354 */
355 if (ubuffer &&
356 (error = useracc(ubuffer, ubcount * statstruct_size,
357 (B_READ|B_PHYS), NULL))) {
358 return error;
359 }
360 /*
361 * Allocate a page-sized buffer for inode btree records.
362 * We could try allocating something smaller, but for normal
363 * calls we'll always (potentially) need the whole page.
364 */
365 irbuf = kmem_alloc(NBPC, KM_SLEEP);
366 nirbuf = NBPC / sizeof(*irbuf);
367 /*
368 * Loop over the allocation groups, starting from the last
369 * inode returned; 0 means start of the allocation group.
370 */
371 rval = 0;
372 while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) {
373 bp = NULL;
374 down_read(&mp->m_peraglock);
375 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
376 up_read(&mp->m_peraglock);
377 if (error) {
378 /*
379 * Skip this allocation group and go to the next one.
380 */
381 agno++;
382 agino = 0;
383 continue;
384 }
385 agi = XFS_BUF_TO_AGI(agbp);
386 /*
387 * Allocate and initialize a btree cursor for ialloc btree.
388 */
389 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
390 (xfs_inode_t *)0, 0);
391 irbp = irbuf;
392 irbufend = irbuf + nirbuf;
393 end_of_ag = 0;
394 /*
395		 * If we're resuming in the middle of an allocation group,
396 * we need to get the remainder of the chunk we're in.
397 */
398 if (agino > 0) {
399 /*
400 * Lookup the inode chunk that this inode lives in.
401 */
402 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
403 if (!error && /* no I/O error */
404 tmp && /* lookup succeeded */
405 /* got the record, should always work */
406 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
407 &gfree, &i)) &&
408 i == 1 &&
409 /* this is the right chunk */
410 agino < gino + XFS_INODES_PER_CHUNK &&
411 /* lastino was not last in chunk */
412 (chunkidx = agino - gino + 1) <
413 XFS_INODES_PER_CHUNK &&
414 /* there are some left allocated */
415 XFS_INOBT_MASKN(chunkidx,
416 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
417 /*
418 * Grab the chunk record. Mark all the
419 * uninteresting inodes (because they're
420 * before our start point) free.
421 */
422 for (i = 0; i < chunkidx; i++) {
423 if (XFS_INOBT_MASK(i) & ~gfree)
424 gcnt++;
425 }
426 gfree |= XFS_INOBT_MASKN(0, chunkidx);
427 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
428 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
429 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
430 irbp++;
431 agino = gino + XFS_INODES_PER_CHUNK;
432 icount = XFS_INODES_PER_CHUNK - gcnt;
433 } else {
434 /*
435 * If any of those tests failed, bump the
436 * inode number (just in case).
437 */
438 agino++;
439 icount = 0;
440 }
441 /*
442 * In any case, increment to the next record.
443 */
444 if (!error)
445 error = xfs_inobt_increment(cur, 0, &tmp);
446 } else {
447 /*
448 * Start of ag. Lookup the first inode chunk.
449 */
450 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
451 icount = 0;
452 }
453 /*
454 * Loop through inode btree records in this ag,
455 * until we run out of inodes or space in the buffer.
456 */
457 while (irbp < irbufend && icount < ubcount) {
458 /*
459 * Loop as long as we're unable to read the
460 * inode btree.
461 */
462 while (error) {
463 agino += XFS_INODES_PER_CHUNK;
464 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
465 INT_GET(agi->agi_length, ARCH_CONVERT))
466 break;
467 error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
468 &tmp);
469 }
470 /*
471			 * If we ran off the end of the ag, either with an error
472			 * or the normal way, set end-of-ag and stop collecting.
473 */
474 if (error ||
475 (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
476 &gfree, &i)) ||
477 i == 0) {
478 end_of_ag = 1;
479 break;
480 }
481 /*
482 * If this chunk has any allocated inodes, save it.
483 */
484 if (gcnt < XFS_INODES_PER_CHUNK) {
485 INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
486 INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
487 INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
488 irbp++;
489 icount += XFS_INODES_PER_CHUNK - gcnt;
490 }
491 /*
492 * Set agino to after this chunk and bump the cursor.
493 */
494 agino = gino + XFS_INODES_PER_CHUNK;
495 error = xfs_inobt_increment(cur, 0, &tmp);
496 }
497 /*
498 * Drop the btree buffers and the agi buffer.
499 * We can't hold any of the locks these represent
500 * when calling iget.
501 */
502 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
503 xfs_buf_relse(agbp);
504 /*
505 * Now format all the good inodes into the user's buffer.
506 */
507 irbufend = irbp;
508 for (irbp = irbuf;
509 irbp < irbufend && ubleft >= statstruct_size; irbp++) {
510 /*
511 * Read-ahead the next chunk's worth of inodes.
512 */
513 if (&irbp[1] < irbufend) {
514 /*
515 * Loop over all clusters in the next chunk.
516 * Do a readahead if there are any allocated
517 * inodes in that cluster.
518 */
519 for (agbno = XFS_AGINO_TO_AGBNO(mp,
520 INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
521 chunkidx = 0;
522 chunkidx < XFS_INODES_PER_CHUNK;
523 chunkidx += nicluster,
524 agbno += nbcluster) {
525 if (XFS_INOBT_MASKN(chunkidx,
526 nicluster) &
527 ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
528 xfs_btree_reada_bufs(mp, agno,
529 agbno, nbcluster);
530 }
531 }
532 /*
533 * Now process this chunk of inodes.
534 */
535 for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0;
536 ubleft > 0 &&
537 INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK;
538 chunkidx++, clustidx++, agino++) {
539 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
540 /*
541 * Recompute agbno if this is the
542 * first inode of the cluster.
543 *
544 * Careful with clustidx. There can be
545				 * multiple clusters per chunk, a single
546 * cluster per chunk or a cluster that has
547 * inodes represented from several different
548 * chunks (if blocksize is large).
549 *
550 * Because of this, the starting clustidx is
551 * initialized to zero in this loop but must
552 * later be reset after reading in the cluster
553 * buffer.
554 */
555 if ((chunkidx & (nicluster - 1)) == 0) {
556 agbno = XFS_AGINO_TO_AGBNO(mp,
557 INT_GET(irbp->ir_startino, ARCH_CONVERT)) +
558 ((chunkidx & nimask) >>
559 mp->m_sb.sb_inopblog);
560
561 if (flags & BULKSTAT_FG_QUICK) {
562 ino = XFS_AGINO_TO_INO(mp, agno,
563 agino);
564 bno = XFS_AGB_TO_DADDR(mp, agno,
565 agbno);
566
567 /*
568 * Get the inode cluster buffer
569 */
570 ASSERT(xfs_inode_zone != NULL);
571 ip = kmem_zone_zalloc(xfs_inode_zone,
572 KM_SLEEP);
573 ip->i_ino = ino;
574 ip->i_mount = mp;
575 if (bp)
576 xfs_buf_relse(bp);
577 error = xfs_itobp(mp, NULL, ip,
578 &dip, &bp, bno);
579 if (!error)
580 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
581 kmem_zone_free(xfs_inode_zone, ip);
582 if (XFS_TEST_ERROR(error != 0,
583 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
584 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
585 bp = NULL;
586 break;
587 }
588 }
589 }
590 /*
591 * Skip if this inode is free.
592 */
593 if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT))
594 continue;
595 /*
596 * Count used inodes as free so we can tell
597 * when the chunk is used up.
598 */
599 INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1);
600 ino = XFS_AGINO_TO_INO(mp, agno, agino);
601 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
602 if (flags & BULKSTAT_FG_QUICK) {
603 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
604 (clustidx << mp->m_sb.sb_inodelog));
605
606 if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT)
607 != XFS_DINODE_MAGIC
608 || !XFS_DINODE_GOOD_VERSION(
609 INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
610 continue;
611 }
612
613 /*
614 * Get the inode and fill in a single buffer.
615 * BULKSTAT_FG_QUICK uses dip to fill it in.
616 * BULKSTAT_FG_IGET uses igets.
617 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
618 * This is also used to count inodes/blks, etc
619 * in xfs_qm_quotacheck.
620 */
621 ubused = statstruct_size;
622 error = formatter(mp, ino, ubufp,
623 ubleft, private_data,
624 bno, &ubused, dip, &fmterror);
625 if (fmterror == BULKSTAT_RV_NOTHING) {
626 if (error == ENOMEM)
627 ubleft = 0;
628 continue;
629 }
630 if (fmterror == BULKSTAT_RV_GIVEUP) {
631 ubleft = 0;
632 ASSERT(error);
633 rval = error;
634 break;
635 }
636 if (ubufp)
637 ubufp += ubused;
638 ubleft -= ubused;
639 ubelem++;
640 lastino = ino;
641 }
642 }
643
644 if (bp)
645 xfs_buf_relse(bp);
646
647 /*
648 * Set up for the next loop iteration.
649 */
650 if (ubleft > 0) {
651 if (end_of_ag) {
652 agno++;
653 agino = 0;
654 } else
655 agino = XFS_INO_TO_AGINO(mp, lastino);
656 } else
657 break;
658 }
659 /*
660	 * Done; we have either run out of filesystem or out of space for the data.
661 */
662 kmem_free(irbuf, NBPC);
663 if (ubuffer)
664 unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
665 *ubcountp = ubelem;
666 if (agno >= mp->m_sb.sb_agcount) {
667 /*
668 * If we ran out of filesystem, mark lastino as off
669 * the end of the filesystem, so the next call
670 * will return immediately.
671 */
672 *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
673 *done = 1;
674 } else
675 *lastinop = (xfs_ino_t)lastino;
676
677 return rval;
678}
679
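/*
 * Editorial sketch (not part of the original source): the resume
 * protocol for xfs_bulkstat() as seen from a caller - lastino and
 * done carry state between calls, and count is in/out. Modeled on
 * the call made in xfs_bulkstat_single() below; the function name is
 * hypothetical and error handling is minimal.
 */
static int
example_bulkstat_all(
	xfs_mount_t	*mp,
	char __user	*ubuffer,
	int		nentries)
{
	xfs_ino_t	lastino = 0;
	int		done = 0;
	int		count;
	int		error;

	while (!done) {
		count = nentries;
		error = xfs_bulkstat(mp, &lastino, &count,
				xfs_bulkstat_one, NULL,
				sizeof(xfs_bstat_t), ubuffer,
				BULKSTAT_FG_IGET, &done);
		if (error)
			return error;
		/* ... consume 'count' xfs_bstat_t records from ubuffer ... */
	}
	return 0;
}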
680/*
681 * Return stat information in bulk (by-inode) for the filesystem.
682 * Special case for a non-sequential, single-inode bulkstat.
683 */
684int /* error status */
685xfs_bulkstat_single(
686 xfs_mount_t *mp, /* mount point for filesystem */
687 xfs_ino_t *lastinop, /* inode to return */
688 char __user *buffer, /* buffer with inode stats */
689	int		*done)		/* set to 1 when there are no more stats */
690{
691 int count; /* count value for bulkstat call */
692 int error; /* return value */
693 xfs_ino_t ino; /* filesystem inode number */
694 int res; /* result from bs1 */
695
696 /*
697	 * Note that requesting valid inode numbers which are not allocated
698	 * to any inode will most likely cause xfs_itobp to generate warning
699	 * messages about bad magic numbers. This is OK. The fact that
700	 * the number doesn't refer to an actual inode is handled by the
701	 * error check below. It is done this way to make the usual case
702	 * faster at the expense of the error case.
703 */
704
705 ino = (xfs_ino_t)*lastinop;
706 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
707 NULL, 0, NULL, NULL, &res);
708 if (error) {
709 /*
710		 * The special-case way failed; do it the "long" way
711 * to see if that works.
712 */
713 (*lastinop)--;
714 count = 1;
715 if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
716 NULL, sizeof(xfs_bstat_t), buffer,
717 BULKSTAT_FG_IGET, done))
718 return error;
719 if (count == 0 || (xfs_ino_t)*lastinop != ino)
720 return error == EFSCORRUPTED ?
721 XFS_ERROR(EINVAL) : error;
722 else
723 return 0;
724 }
725 *done = 0;
726 return 0;
727}
728
729/*
730 * Return inode number table for the filesystem.
731 */
732int /* error status */
733xfs_inumbers(
734 xfs_mount_t *mp, /* mount point for filesystem */
735 xfs_ino_t *lastino, /* last inode returned */
736 int *count, /* size of buffer/count returned */
737 xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */
738{
739 xfs_buf_t *agbp;
740 xfs_agino_t agino;
741 xfs_agnumber_t agno;
742 int bcount;
743 xfs_inogrp_t *buffer;
744 int bufidx;
745 xfs_btree_cur_t *cur;
746 int error;
747 __int32_t gcnt;
748 xfs_inofree_t gfree;
749 xfs_agino_t gino;
750 int i;
751 xfs_ino_t ino;
752 int left;
753 int tmp;
754
755 ino = (xfs_ino_t)*lastino;
756 agno = XFS_INO_TO_AGNO(mp, ino);
757 agino = XFS_INO_TO_AGINO(mp, ino);
758 left = *count;
759 *count = 0;
760 bcount = MIN(left, (int)(NBPP / sizeof(*buffer)));
761 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
762 error = bufidx = 0;
763 cur = NULL;
764 agbp = NULL;
765 while (left > 0 && agno < mp->m_sb.sb_agcount) {
766 if (agbp == NULL) {
767 down_read(&mp->m_peraglock);
768 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
769 up_read(&mp->m_peraglock);
770 if (error) {
771 /*
772 * If we can't read the AGI of this ag,
773 * then just skip to the next one.
774 */
775 ASSERT(cur == NULL);
776 agbp = NULL;
777 agno++;
778 agino = 0;
779 continue;
780 }
781 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno,
782 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
783 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
784 if (error) {
785 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
786 cur = NULL;
787 xfs_buf_relse(agbp);
788 agbp = NULL;
789 /*
790				 * Move up to the last inode in the current
791 * chunk. The lookup_ge will always get
792 * us the first inode in the next chunk.
793 */
794 agino += XFS_INODES_PER_CHUNK - 1;
795 continue;
796 }
797 }
798 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
799 &i)) ||
800 i == 0) {
801 xfs_buf_relse(agbp);
802 agbp = NULL;
803 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
804 cur = NULL;
805 agno++;
806 agino = 0;
807 continue;
808 }
809 agino = gino + XFS_INODES_PER_CHUNK - 1;
810 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
811 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
812 buffer[bufidx].xi_allocmask = ~gfree;
813 bufidx++;
814 left--;
815 if (bufidx == bcount) {
816 if (copy_to_user(ubuffer, buffer,
817 bufidx * sizeof(*buffer))) {
818 error = XFS_ERROR(EFAULT);
819 break;
820 }
821 ubuffer += bufidx;
822 *count += bufidx;
823 bufidx = 0;
824 }
825 if (left) {
826 error = xfs_inobt_increment(cur, 0, &tmp);
827 if (error) {
828 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
829 cur = NULL;
830 xfs_buf_relse(agbp);
831 agbp = NULL;
832 /*
833 * The agino value has already been bumped.
834 * Just try to skip up to it.
835 */
836 agino += XFS_INODES_PER_CHUNK;
837 continue;
838 }
839 }
840 }
841 if (!error) {
842 if (bufidx) {
843 if (copy_to_user(ubuffer, buffer,
844 bufidx * sizeof(*buffer)))
845 error = XFS_ERROR(EFAULT);
846 else
847 *count += bufidx;
848 }
849 *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
850 }
851 kmem_free(buffer, bcount * sizeof(*buffer));
852 if (cur)
853 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
854 XFS_BTREE_NOERROR));
855 if (agbp)
856 xfs_buf_relse(agbp);
857 return error;
858}
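/*
 * Editorial sketch (not part of the original source): how a caller
 * walks every inode chunk in the filesystem with xfs_inumbers().
 * 'count' is the buffer capacity on entry and the number of
 * xfs_inogrp_t records returned on exit; lastino carries the resume
 * point between calls. The function name is hypothetical.
 */
static int
example_walk_inode_chunks(
	xfs_mount_t		*mp,
	xfs_inogrp_t __user	*ubuffer,
	int			ubcount)
{
	xfs_ino_t	lastino = 0;
	int		count;
	int		error;

	do {
		count = ubcount;
		error = xfs_inumbers(mp, &lastino, &count, ubuffer);
		if (error)
			return error;
		/* ... consume 'count' records from ubuffer here ... */
	} while (count > 0);
	return 0;
}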
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
new file mode 100644
index 000000000000..2be9d1805ab2
--- /dev/null
+++ b/fs/xfs/xfs_itable.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_ITABLE_H__
33#define __XFS_ITABLE_H__
34
35/*
36 * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
37 * structures (by the dmi library). This is a pointer to a formatter function
38 * that will iget the inode and fill in the appropriate structure.
39 * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
40 * See xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c.
41typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
42 xfs_ino_t ino,
43 void __user *buffer,
44 int ubsize,
45 void *private_data,
46 xfs_daddr_t bno,
47 int *ubused,
48 void *dip,
49 int *stat);
50
51/*
52 * Values for stat return value.
53 */
54#define BULKSTAT_RV_NOTHING 0
55#define BULKSTAT_RV_DIDONE 1
56#define BULKSTAT_RV_GIVEUP 2
57
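/*
 * Editorial sketch (not part of the original source): the minimal
 * shape of a bulkstat_one_pf formatter, showing only the calling
 * contract - check the caller's buffer space, report the bytes
 * consumed through *ubused, and report the outcome through *stat
 * (see xfs_bulkstat_one() for the real thing). The function itself
 * is entirely hypothetical.
 */
static int
example_bulkstat_formatter(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	void __user		*buffer,
	int			ubsize,
	void			*private_data,
	xfs_daddr_t		bno,
	int			*ubused,
	void			*dip,
	int			*stat)
{
	/* refuse if the caller's buffer can't hold one record */
	if (ubsize < (int)sizeof(xfs_bstat_t)) {
		*stat = BULKSTAT_RV_NOTHING;
		return ENOMEM;
	}
	/* ... fill in and copy_to_user() an xfs_bstat_t here ... */
	if (ubused)
		*ubused = sizeof(xfs_bstat_t);
	*stat = BULKSTAT_RV_DIDONE;
	return 0;
}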
58/*
59 * Values for bulkstat flag argument.
60 */
61#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */
62#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */
63#define BULKSTAT_FG_VFSLOCKED 0x4 /* Already have vfs lock */
64
65/*
66 * Return stat information in bulk (by-inode) for the filesystem.
67 */
68int /* error status */
69xfs_bulkstat(
70 xfs_mount_t *mp, /* mount point for filesystem */
71 xfs_ino_t *lastino, /* last inode returned */
72 int *count, /* size of buffer/count returned */
73 bulkstat_one_pf formatter, /* func that'd fill a single buf */
74 void *private_data, /* private data for formatter */
75 size_t statstruct_size,/* sizeof struct that we're filling */
76 char __user *ubuffer,/* buffer with inode stats */
77 int flags, /* flag to control access method */
78	int		*done);		/* set to 1 when there are no more stats */
79
80int
81xfs_bulkstat_single(
82 xfs_mount_t *mp,
83 xfs_ino_t *lastinop,
84 char __user *buffer,
85 int *done);
86
87int
88xfs_bulkstat_one(
89 xfs_mount_t *mp,
90 xfs_ino_t ino,
91 void __user *buffer,
92 int ubsize,
93 void *private_data,
94 xfs_daddr_t bno,
95 int *ubused,
96 void *dibuff,
97 int *stat);
98
99int /* error status */
100xfs_inumbers(
101 xfs_mount_t *mp, /* mount point for filesystem */
102 xfs_ino_t *last, /* last inode returned */
103 int *count, /* size of buffer/count returned */
104 xfs_inogrp_t __user *buffer);/* buffer with inode info */
105
106#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
new file mode 100644
index 000000000000..092d5fb096b1
--- /dev/null
+++ b/fs/xfs/xfs_log.c
@@ -0,0 +1,3560 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * High level interface routines for log manager
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_ag.h"
42#include "xfs_sb.h"
43#include "xfs_log.h"
44#include "xfs_trans.h"
45#include "xfs_dir.h"
46#include "xfs_dmapi.h"
47#include "xfs_mount.h"
48#include "xfs_error.h"
49#include "xfs_log_priv.h"
50#include "xfs_buf_item.h"
51#include "xfs_alloc_btree.h"
52#include "xfs_log_recover.h"
53#include "xfs_bit.h"
54#include "xfs_rw.h"
55#include "xfs_trans_priv.h"
56
57
58#define xlog_write_adv_cnt(ptr, len, off, bytes) \
59 { (ptr) += (bytes); \
60 (len) -= (bytes); \
61 (off) += (bytes);}
62
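/*
 * Editorial note (not part of the original source): a writer that has
 * just copied copy_len bytes into an iclog advances all three of its
 * cursors in one step, e.g.
 *
 *	xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
 *
 * which expands to ptr += copy_len; len -= copy_len;
 * log_offset += copy_len.
 */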
63/* Local miscellaneous function prototypes */
64STATIC int xlog_bdstrat_cb(struct xfs_buf *);
65STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
66 xlog_in_core_t **, xfs_lsn_t *);
67STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
68 xfs_buftarg_t *log_target,
69 xfs_daddr_t blk_offset,
70 int num_bblks);
71STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
72STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
73STATIC void xlog_unalloc_log(xlog_t *log);
74STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
75 int nentries, xfs_log_ticket_t tic,
76 xfs_lsn_t *start_lsn,
77 xlog_in_core_t **commit_iclog,
78 uint flags);
79
80/* local state machine functions */
81STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
82STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
83STATIC int xlog_state_get_iclog_space(xlog_t *log,
84 int len,
85 xlog_in_core_t **iclog,
86 xlog_ticket_t *ticket,
87 int *continued_write,
88 int *logoffsetp);
89STATIC void xlog_state_put_ticket(xlog_t *log,
90 xlog_ticket_t *tic);
91STATIC int xlog_state_release_iclog(xlog_t *log,
92 xlog_in_core_t *iclog);
93STATIC void xlog_state_switch_iclogs(xlog_t *log,
94 xlog_in_core_t *iclog,
95 int eventual_size);
96STATIC int xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
97STATIC int xlog_state_sync_all(xlog_t *log, uint flags);
98STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
99
100/* local functions to manipulate grant head */
101STATIC int xlog_grant_log_space(xlog_t *log,
102 xlog_ticket_t *xtic);
103STATIC void xlog_grant_push_ail(xfs_mount_t *mp,
104 int need_bytes);
105STATIC void xlog_regrant_reserve_log_space(xlog_t *log,
106 xlog_ticket_t *ticket);
107STATIC int xlog_regrant_write_log_space(xlog_t *log,
108 xlog_ticket_t *ticket);
109STATIC void xlog_ungrant_log_space(xlog_t *log,
110 xlog_ticket_t *ticket);
111
112
113/* local ticket functions */
114STATIC void xlog_state_ticket_alloc(xlog_t *log);
115STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log,
116 int unit_bytes,
117 int count,
118 char clientid,
119 uint flags);
120STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
121
122/* local debug functions */
123#if defined(DEBUG) && !defined(XLOG_NOLOG)
124STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
125STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
126STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
127 int count, boolean_t syncing);
128STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
129 xfs_lsn_t tail_lsn);
130#else
131#define xlog_verify_dest_ptr(a,b)
132#define xlog_verify_grant_head(a,b)
133#define xlog_verify_iclog(a,b,c,d)
134#define xlog_verify_tail_lsn(a,b,c)
135#endif
136
137int xlog_iclogs_empty(xlog_t *log);
138
139#ifdef DEBUG
140int xlog_do_error = 0;
141int xlog_req_num = 0;
142int xlog_error_mod = 33;
143#endif
144
145#define XLOG_FORCED_SHUTDOWN(log) (log->l_flags & XLOG_IO_ERROR)
146
147/*
148 * 0 => disable log manager
149 * 1 => enable log manager
150 * 2 => enable log manager and log debugging
151 */
152#if defined(XLOG_NOLOG) || defined(DEBUG)
153int xlog_debug = 1;
154xfs_buftarg_t *xlog_target;
155#endif
156
157#if defined(XFS_LOG_TRACE)
158
159void
160xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
161{
162 if (! log->l_grant_trace) {
163 log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP);
164 if (! log->l_grant_trace)
165 return;
166 }
167
168 ktrace_enter(log->l_grant_trace,
169 (void *)tic,
170 (void *)log->l_reserve_headq,
171 (void *)log->l_write_headq,
172 (void *)((unsigned long)log->l_grant_reserve_cycle),
173 (void *)((unsigned long)log->l_grant_reserve_bytes),
174 (void *)((unsigned long)log->l_grant_write_cycle),
175 (void *)((unsigned long)log->l_grant_write_bytes),
176 (void *)((unsigned long)log->l_curr_cycle),
177 (void *)((unsigned long)log->l_curr_block),
178 (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
179 (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
180 (void *)string,
181 (void *)((unsigned long)13),
182 (void *)((unsigned long)14),
183 (void *)((unsigned long)15),
184 (void *)((unsigned long)16));
185}
186
187void
188xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
189{
190 pid_t pid;
191
192 pid = current_pid();
193
194 if (!iclog->ic_trace)
195 iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
196 ktrace_enter(iclog->ic_trace,
197 (void *)((unsigned long)state),
198 (void *)((unsigned long)pid),
199 (void *)0,
200 (void *)0,
201 (void *)0,
202 (void *)0,
203 (void *)0,
204 (void *)0,
205 (void *)0,
206 (void *)0,
207 (void *)0,
208 (void *)0,
209 (void *)0,
210 (void *)0,
211 (void *)0,
212 (void *)0);
213}
214
215#else
216#define xlog_trace_loggrant(log,tic,string)
217#define xlog_trace_iclog(iclog,state)
218#endif /* XFS_LOG_TRACE */
219
220/*
221 * NOTES:
222 *
223 * 1. currblock field gets updated at startup and after in-core logs
224 *	are marked with WANT_SYNC.
225 */
226
227/*
228 * This routine is called when a user of a log manager ticket is done with
229 * the reservation. If the ticket was ever used, then a commit record for
230 * the associated transaction is written out as a log operation header with
231 * no data. The flag XLOG_TIC_INITED is set when the first write occurs with
232 * a given ticket. If the ticket was one with a permanent reservation, then
233 * a few operations are done differently. Permanent reservation tickets by
234 * default don't release the reservation. They just commit the current
235 * transaction with the belief that the reservation is still needed. A flag
236 * must be passed in before permanent reservations are actually released.
237 * When these types of tickets are not released, they need to be set into
238 * the inited state again. By doing this, a start record will be written
239 * out when the next write occurs.
240 */
241xfs_lsn_t
242xfs_log_done(xfs_mount_t *mp,
243 xfs_log_ticket_t xtic,
244 void **iclog,
245 uint flags)
246{
247 xlog_t *log = mp->m_log;
248	xlog_ticket_t	*ticket = (xlog_ticket_t *)xtic;
249 xfs_lsn_t lsn = 0;
250
251#if defined(DEBUG) || defined(XLOG_NOLOG)
252 if (!xlog_debug && xlog_target == log->l_targ)
253 return 0;
254#endif
255
256 if (XLOG_FORCED_SHUTDOWN(log) ||
257 /*
258	     * If nothing was ever written, don't write out a commit record.
259 * If we get an error, just continue and give back the log ticket.
260 */
261 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
262 (xlog_commit_record(mp, ticket,
263 (xlog_in_core_t **)iclog, &lsn)))) {
264 lsn = (xfs_lsn_t) -1;
265 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
266 flags |= XFS_LOG_REL_PERM_RESERV;
267 }
268 }
269
270
271 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
272 (flags & XFS_LOG_REL_PERM_RESERV)) {
273 /*
274		 * Release the ticket if not a permanent reservation or a specific
275 * request has been made to release a permanent reservation.
276 */
277 xlog_ungrant_log_space(log, ticket);
278 xlog_state_put_ticket(log, ticket);
279 } else {
280 xlog_regrant_reserve_log_space(log, ticket);
281 }
282
283 /* If this ticket was a permanent reservation and we aren't
284	 * trying to release it, reset the inited flags so that next time
285 * we write, a start record will be written out.
286 */
287 if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
288 (flags & XFS_LOG_REL_PERM_RESERV) == 0)
289 ticket->t_flags |= XLOG_TIC_INITED;
290
291 return lsn;
292} /* xfs_log_done */
293
294
295/*
296 * Force the in-core log to disk. If flags == XFS_LOG_SYNC,
297 * the force is done synchronously.
298 *
299 * Asynchronous forces are implemented by setting the WANT_SYNC
300 * bit in the appropriate in-core log and then returning.
301 *
302 * Synchronous forces are implemented with a semaphore. All callers
303 * to force a given lsn to disk will wait on a semaphore attached to the
304 * specific in-core log. When the given in-core log finally completes its
305 * write to disk, that thread will wake up all threads waiting on the
306 * semaphore.
307 */
308int
309xfs_log_force(xfs_mount_t *mp,
310 xfs_lsn_t lsn,
311 uint flags)
312{
313 int rval;
314 xlog_t *log = mp->m_log;
315
316#if defined(DEBUG) || defined(XLOG_NOLOG)
317 if (!xlog_debug && xlog_target == log->l_targ)
318 return 0;
319#endif
320
321 ASSERT(flags & XFS_LOG_FORCE);
322
323 XFS_STATS_INC(xs_log_force);
324
325 if ((log->l_flags & XLOG_IO_ERROR) == 0) {
326 if (lsn == 0)
327 rval = xlog_state_sync_all(log, flags);
328 else
329 rval = xlog_state_sync(log, lsn, flags);
330 } else {
331 rval = XFS_ERROR(EIO);
332 }
333
334 return rval;
335
336} /* xfs_log_force */
337
338/*
339 * Attaches a new iclog I/O completion callback routine during
340 * transaction commit. If the log is in error state, a non-zero
341 * return code is handed back and the caller is responsible for
342 * executing the callback at an appropriate time.
343 */
344int
345xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
346 void *iclog_hndl, /* iclog to hang callback off */
347 xfs_log_callback_t *cb)
348{
349 xlog_t *log = mp->m_log;
350 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
351 int abortflg, spl;
352
353#if defined(DEBUG) || defined(XLOG_NOLOG)
354 if (!xlog_debug && xlog_target == log->l_targ)
355 return 0;
356#endif
357 cb->cb_next = NULL;
358 spl = LOG_LOCK(log);
359 abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
360 if (!abortflg) {
361 ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
362 (iclog->ic_state == XLOG_STATE_WANT_SYNC));
363 cb->cb_next = NULL;
364 *(iclog->ic_callback_tail) = cb;
365 iclog->ic_callback_tail = &(cb->cb_next);
366 }
367 LOG_UNLOCK(log, spl);
368 return abortflg;
369} /* xfs_log_notify */
370
371int
372xfs_log_release_iclog(xfs_mount_t *mp,
373 void *iclog_hndl)
374{
375 xlog_t *log = mp->m_log;
376 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
377
378 if (xlog_state_release_iclog(log, iclog)) {
379 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
380 return(EIO);
381 }
382
383 return 0;
384}
385
386/*
387 * 1. Reserve an amount of on-disk log space and return a ticket corresponding
388 * to the reservation.
389 * 2. Potentially, push buffers at tail of log to disk.
390 *
391 * Each reservation is going to reserve extra space for a log record header.
392 * When writes happen to the on-disk log, we don't subtract the length of the
393 * log record header from any reservation. By wasting space in each
394 * reservation, we prevent overallocation problems.
395 */
396int
397xfs_log_reserve(xfs_mount_t *mp,
398 int unit_bytes,
399 int cnt,
400 xfs_log_ticket_t *ticket,
401 __uint8_t client,
402 uint flags)
403{
404 xlog_t *log = mp->m_log;
405 xlog_ticket_t *internal_ticket;
406 int retval;
407
408#if defined(DEBUG) || defined(XLOG_NOLOG)
409 if (!xlog_debug && xlog_target == log->l_targ)
410 return 0;
411#endif
412 retval = 0;
413 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
414 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
415
416 if (XLOG_FORCED_SHUTDOWN(log))
417 return XFS_ERROR(EIO);
418
419 XFS_STATS_INC(xs_try_logspace);
420
421 if (*ticket != NULL) {
422 ASSERT(flags & XFS_LOG_PERM_RESERV);
423 internal_ticket = (xlog_ticket_t *)*ticket;
424 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
425 retval = xlog_regrant_write_log_space(log, internal_ticket);
426 } else {
427 /* may sleep if need to allocate more tickets */
428 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
429 client, flags);
430 *ticket = internal_ticket;
431 xlog_grant_push_ail(mp,
432 (internal_ticket->t_unit_res *
433 internal_ticket->t_cnt));
434 retval = xlog_grant_log_space(log, internal_ticket);
435 }
436
437 return retval;
438} /* xfs_log_reserve */
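
/*
 * Illustrative call sequence for a permanent reservation (a sketch with
 * placeholder values such as unit_bytes, not code from this file): the
 * first call allocates the ticket, and later calls passing the same
 * ticket take the regrant path above.
 *
 *	xfs_log_ticket_t tic = NULL;
 *
 *	error = xfs_log_reserve(mp, unit_bytes, 2, &tic,
 *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV);
 *	...
 *	error = xfs_log_reserve(mp, unit_bytes, 2, &tic,
 *				XFS_TRANSACTION, XFS_LOG_PERM_RESERV);
 */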
439
440
441/*
442 * Mount the log portion of a filesystem.
443 *
444 * mp - ubiquitous xfs mount point structure
445 * log_target - buftarg of on-disk log device
446 * blk_offset - Start block # of the log, in 512-byte basic blocks (BBSIZE)
447 * num_bblks - Number of BBSIZE blocks in on-disk log
448 *
449 * Return error or zero.
450 */
451int
452xfs_log_mount(xfs_mount_t *mp,
453 xfs_buftarg_t *log_target,
454 xfs_daddr_t blk_offset,
455 int num_bblks)
456{
457 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
458 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
459 else {
460 cmn_err(CE_NOTE,
461 "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.",
462 mp->m_fsname);
463 ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
464 }
465
466 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
467
468#if defined(DEBUG) || defined(XLOG_NOLOG)
469 if (!xlog_debug) {
470 cmn_err(CE_NOTE, "log dev: %s", XFS_BUFTARG_NAME(log_target));
471 return 0;
472 }
473#endif
474 /*
475 * Skip log recovery on a norecovery mount. Pretend it all
476 * just worked.
477 */
478 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
479 int error;
480 vfs_t *vfsp = XFS_MTOVFS(mp);
481 int readonly = (vfsp->vfs_flag & VFS_RDONLY);
482
483 if (readonly)
484 vfsp->vfs_flag &= ~VFS_RDONLY;
485
486 error = xlog_recover(mp->m_log, readonly);
487
488 if (readonly)
489 vfsp->vfs_flag |= VFS_RDONLY;
490 if (error) {
491 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error);
492 xlog_unalloc_log(mp->m_log);
493 return error;
494 }
495 }
496
497 /* Normal transactions can now occur */
498 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
499
500 /* End mounting message in xfs_log_mount_finish */
501 return 0;
502} /* xfs_log_mount */
503
504/*
505 * Finish the recovery of the file system. This is separate from
506 * the xfs_log_mount() call, because it depends on the code in
507 * xfs_mountfs() to read in the root and real-time bitmap inodes
508 * between calling xfs_log_mount() and here.
509 *
510 * mp - ubiquitous xfs mount point structure
511 */
512int
513xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
514{
515 int error;
516
517 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
518 error = xlog_recover_finish(mp->m_log, mfsi_flags);
519 else {
520 error = 0;
521 ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
522 }
523
524 return error;
525}
526
527/*
528 * Unmount processing for the log.
529 */
530int
531xfs_log_unmount(xfs_mount_t *mp)
532{
533 int error;
534
535 error = xfs_log_unmount_write(mp);
536 xfs_log_unmount_dealloc(mp);
537 return (error);
538}
539
540/*
541 * Final log writes as part of unmount.
542 *
543 * Mark the filesystem clean as unmount happens. Note that during relocation
544 * this routine needs to be executed as part of source-bag while the
545 * deallocation must not be done until source-end.
546 */
547
548/*
549 * The unmount record used to have a string "Unmount filesystem--" in the
550 * data section, where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
551 * We just write the magic number now, since that field isn't currently
552 * architecture converted and would read back as "nUmount" on opposite-endian
553 * hosts. As far as I know, there weren't any dependencies on the old behaviour.
554 */
555
556int
557xfs_log_unmount_write(xfs_mount_t *mp)
558{
559 xlog_t *log = mp->m_log;
560 xlog_in_core_t *iclog;
561#ifdef DEBUG
562 xlog_in_core_t *first_iclog;
563#endif
564 xfs_log_iovec_t reg[1];
565 xfs_log_ticket_t tic = NULL;
566 xfs_lsn_t lsn;
567 int error;
568 SPLDECL(s);
569
570 /* the data section must be 32 bit size aligned */
571 struct {
572 __uint16_t magic;
573 __uint16_t pad1;
574 __uint32_t pad2; /* may as well make it 64 bits */
575 } magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
576
577#if defined(DEBUG) || defined(XLOG_NOLOG)
578 if (!xlog_debug && xlog_target == log->l_targ)
579 return 0;
580#endif
581
582 /*
583 * Don't write out an unmount record on read-only mounts, or if we
584 * are doing a forced unmount (typically because of I/O errors).
585 */
586 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
587 return 0;
588
589 xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
590
591#ifdef DEBUG
592 first_iclog = iclog = log->l_iclog;
593 do {
594 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
595 ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
596 ASSERT(iclog->ic_offset == 0);
597 }
598 iclog = iclog->ic_next;
599 } while (iclog != first_iclog);
600#endif
601 if (! (XLOG_FORCED_SHUTDOWN(log))) {
602 reg[0].i_addr = (void*)&magic;
603 reg[0].i_len = sizeof(magic);
604
605 error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
606 if (!error) {
607 /* remove inited flag */
608 ((xlog_ticket_t *)tic)->t_flags = 0;
609 error = xlog_write(mp, reg, 1, tic, &lsn,
610 NULL, XLOG_UNMOUNT_TRANS);
611 /*
612 * At this point, we're umounting anyway,
613 * so there's no point in transitioning log state
614 * to IOERROR. Just continue...
615 */
616 }
617
618 if (error) {
619 xfs_fs_cmn_err(CE_ALERT, mp,
620 "xfs_log_unmount: unmount record failed");
621 }
622
623
624 s = LOG_LOCK(log);
625 iclog = log->l_iclog;
626 iclog->ic_refcnt++;
627 LOG_UNLOCK(log, s);
628 xlog_state_want_sync(log, iclog);
629 (void) xlog_state_release_iclog(log, iclog);
630
631 s = LOG_LOCK(log);
632 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
633 iclog->ic_state == XLOG_STATE_DIRTY)) {
634 if (!XLOG_FORCED_SHUTDOWN(log)) {
635 sv_wait(&iclog->ic_forcesema, PMEM,
636 &log->l_icloglock, s);
637 } else {
638 LOG_UNLOCK(log, s);
639 }
640 } else {
641 LOG_UNLOCK(log, s);
642 }
643 if (tic)
644 xlog_state_put_ticket(log, tic);
645 } else {
646 /*
647 * We're already in forced_shutdown mode, so we couldn't
648 * even attempt to write out the unmount transaction.
649 *
650 * Go through the motions of sync'ing and releasing
651 * the iclog, even though no I/O will actually happen:
652 * we need to wait for other log I/Os that may already
653 * be in progress. Do this as a separate section of
654 * code so that if we ever get stuck here, we'll know
655 * we're in this odd situation of trying to unmount
656 * a filesystem that went into forced_shutdown as
657 * the result of an unmount.
658 */
659 s = LOG_LOCK(log);
660 iclog = log->l_iclog;
661 iclog->ic_refcnt++;
662 LOG_UNLOCK(log, s);
663
664 xlog_state_want_sync(log, iclog);
665 (void) xlog_state_release_iclog(log, iclog);
666
667 s = LOG_LOCK(log);
668
669 if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE
670 || iclog->ic_state == XLOG_STATE_DIRTY
671 || iclog->ic_state == XLOG_STATE_IOERROR) ) {
672
673 sv_wait(&iclog->ic_forcesema, PMEM,
674 &log->l_icloglock, s);
675 } else {
676 LOG_UNLOCK(log, s);
677 }
678 }
679
680 return 0;
681} /* xfs_log_unmount_write */
682
683/*
684 * Deallocate log structures for unmount/relocation.
685 */
686void
687xfs_log_unmount_dealloc(xfs_mount_t *mp)
688{
689 xlog_unalloc_log(mp->m_log);
690}
691
692/*
693 * Write region vectors to log. The write happens using the space reservation
694 * of the ticket (tic). It is not a requirement that all writes for a given
695 * transaction occur with one call to xfs_log_write().
696 */
697int
698xfs_log_write(xfs_mount_t * mp,
699 xfs_log_iovec_t reg[],
700 int nentries,
701 xfs_log_ticket_t tic,
702 xfs_lsn_t *start_lsn)
703{
704 int error;
705 xlog_t *log = mp->m_log;
706
707#if defined(DEBUG) || defined(XLOG_NOLOG)
708 if (!xlog_debug && xlog_target == log->l_targ) {
709 *start_lsn = 0;
710 return 0;
711 }
712#endif
713 if (XLOG_FORCED_SHUTDOWN(log))
714 return XFS_ERROR(EIO);
715
716 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) {
717 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
718 }
719 return (error);
720} /* xfs_log_write */
721
722
723void
724xfs_log_move_tail(xfs_mount_t *mp,
725 xfs_lsn_t tail_lsn)
726{
727 xlog_ticket_t *tic;
728 xlog_t *log = mp->m_log;
729 int need_bytes, free_bytes, cycle, bytes;
730 SPLDECL(s);
731
732#if defined(DEBUG) || defined(XLOG_NOLOG)
733 if (!xlog_debug && xlog_target == log->l_targ)
734 return;
735#endif
736 /* XXXsup tmp */
737 if (XLOG_FORCED_SHUTDOWN(log))
738 return;
739 ASSERT(!XFS_FORCED_SHUTDOWN(mp));
740
741 if (tail_lsn == 0) {
742 /* needed since sync_lsn is 64 bits */
743 s = LOG_LOCK(log);
744 tail_lsn = log->l_last_sync_lsn;
745 LOG_UNLOCK(log, s);
746 }
747
748 s = GRANT_LOCK(log);
749
750 /* A tail_lsn of 1 is also an invalid lsn; it implies that the caller
751 * isn't passing in a valid tail_lsn.
752 */
753 if (tail_lsn != 1) {
754 log->l_tail_lsn = tail_lsn;
755 }
756
757 if ((tic = log->l_write_headq)) {
758#ifdef DEBUG
759 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
760 panic("Recovery problem");
761#endif
762 cycle = log->l_grant_write_cycle;
763 bytes = log->l_grant_write_bytes;
764 free_bytes = xlog_space_left(log, cycle, bytes);
765 do {
766 ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
767
768 if (free_bytes < tic->t_unit_res && tail_lsn != 1)
769 break;
770 tail_lsn = 0;
771 free_bytes -= tic->t_unit_res;
772 sv_signal(&tic->t_sema);
773 tic = tic->t_next;
774 } while (tic != log->l_write_headq);
775 }
776 if ((tic = log->l_reserve_headq)) {
777#ifdef DEBUG
778 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
779 panic("Recovery problem");
780#endif
781 cycle = log->l_grant_reserve_cycle;
782 bytes = log->l_grant_reserve_bytes;
783 free_bytes = xlog_space_left(log, cycle, bytes);
784 do {
785 if (tic->t_flags & XLOG_TIC_PERM_RESERV)
786 need_bytes = tic->t_unit_res*tic->t_cnt;
787 else
788 need_bytes = tic->t_unit_res;
789 if (free_bytes < need_bytes && tail_lsn != 1)
790 break;
791 tail_lsn = 0;
792 free_bytes -= need_bytes;
793 sv_signal(&tic->t_sema);
794 tic = tic->t_next;
795 } while (tic != log->l_reserve_headq);
796 }
797 GRANT_UNLOCK(log, s);
798} /* xfs_log_move_tail */
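
/*
 * Worked example of the wakeup loops above (hypothetical numbers): with
 * 160KB of free log space and three queued write tickets of t_unit_res =
 * 64KB each, the first two tickets are signalled (leaving 96KB and then
 * 32KB of headroom) and the walk stops at the third, since 32KB < 64KB.
 */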
799
800/*
801 * Determine if we have a transaction that has gone to disk
802 * and needs to be covered. Log activity needs to be idle (no AIL and
803 * nothing in the iclogs), and we need to be in the right state,
804 * indicating that something has gone out.
805 */
806int
807xfs_log_need_covered(xfs_mount_t *mp)
808{
809 SPLDECL(s);
810 int needed = 0, gen;
811 xlog_t *log = mp->m_log;
812 vfs_t *vfsp = XFS_MTOVFS(mp);
813
814 if (fs_frozen(vfsp) || XFS_FORCED_SHUTDOWN(mp) ||
815 (vfsp->vfs_flag & VFS_RDONLY))
816 return 0;
817
818 s = LOG_LOCK(log);
819 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
820 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
821 && !xfs_trans_first_ail(mp, &gen)
822 && xlog_iclogs_empty(log)) {
823 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
824 log->l_covered_state = XLOG_STATE_COVER_DONE;
825 else {
826 ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
827 log->l_covered_state = XLOG_STATE_COVER_DONE2;
828 }
829 needed = 1;
830 }
831 LOG_UNLOCK(log, s);
832 return(needed);
833}
834
835/******************************************************************************
836 *
837 * local routines
838 *
839 ******************************************************************************
840 */
841
842/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
843 * The log manager must keep track of the last LR which was committed
844 * to disk. The lsn of this LR will become the new tail_lsn whenever
845 * xfs_trans_tail_ail returns 0. If we don't do this, we run into
846 * the situation where stuff could be written into the log but nothing
847 * was ever in the AIL when asked. Eventually, we panic since the
848 * tail hits the head.
849 *
850 * We may be holding the log iclog lock upon entering this routine.
851 */
852xfs_lsn_t
853xlog_assign_tail_lsn(xfs_mount_t *mp)
854{
855 xfs_lsn_t tail_lsn;
856 SPLDECL(s);
857 xlog_t *log = mp->m_log;
858
859 tail_lsn = xfs_trans_tail_ail(mp);
860 s = GRANT_LOCK(log);
861 if (tail_lsn != 0) {
862 log->l_tail_lsn = tail_lsn;
863 } else {
864 tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
865 }
866 GRANT_UNLOCK(log, s);
867
868 return tail_lsn;
869} /* xlog_assign_tail_lsn */
870
871
872/*
873 * Return the space in the log between the tail and the head. The head
874 * is passed in the cycle/bytes formal parms. In the special case where
875 * the reserve head has wrapped past the tail, this calculation is no
876 * longer valid. In this case, just return 0 which means there is no space
877 * in the log. This works for all places where this function is called
878 * with the reserve head. Of course, if the write head were to ever
879 * wrap the tail, we should blow up. Rather than catch this case here,
880 * we depend on other ASSERTions in other parts of the code. XXXmiken
881 *
882 * This code also handles the case where the reservation head is behind
883 * the tail. The details of this case are described below, but the end
884 * result is that we return the size of the log as the amount of space left.
885 */
886int
887xlog_space_left(xlog_t *log, int cycle, int bytes)
888{
889 int free_bytes;
890 int tail_bytes;
891 int tail_cycle;
892
893 tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn));
894 tail_cycle = CYCLE_LSN(log->l_tail_lsn);
895 if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
896 free_bytes = log->l_logsize - (bytes - tail_bytes);
897 } else if ((tail_cycle + 1) < cycle) {
898 return 0;
899 } else if (tail_cycle < cycle) {
900 ASSERT(tail_cycle == (cycle - 1));
901 free_bytes = tail_bytes - bytes;
902 } else {
903 /*
904 * The reservation head is behind the tail.
905 * In this case we just want to return the size of the
906 * log as the amount of space left.
907 */
908 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
909 "xlog_space_left: head behind tail\n"
910 " tail_cycle = %d, tail_bytes = %d\n"
911 " GH cycle = %d, GH bytes = %d",
912 tail_cycle, tail_bytes, cycle, bytes);
913 ASSERT(0);
914 free_bytes = log->l_logsize;
915 }
916 return free_bytes;
917} /* xlog_space_left */
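
/*
 * Worked example (hypothetical numbers): for a 16MB log with the tail at
 * cycle 5, byte offset 1MB and the head at cycle 5, byte offset 9MB, the
 * first case above applies and free_bytes = 16MB - (9MB - 1MB) = 8MB.
 * If the head has instead wrapped to cycle 6, byte offset 512KB, the
 * tail_cycle == cycle - 1 case applies and free_bytes = 1MB - 512KB = 512KB.
 */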
918
919
920/*
921 * Log function which is called when an I/O completes.
922 *
923 * The log manager needs its own routine, in order to control what
924 * happens with the buffer after the write completes.
925 */
926void
927xlog_iodone(xfs_buf_t *bp)
928{
929 xlog_in_core_t *iclog;
930 xlog_t *l;
931 int aborted;
932
933 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
934 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
935 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
936 aborted = 0;
937
938 /*
939 * Some versions of cpp barf on the recursive definition of
940 * ic_log -> hic_fields.ic_log and expand ic_log twice when
941 * it is passed through two macros. Work around broken cpp.
942 */
943 l = iclog->ic_log;
944
945 /*
946 * Race to shut down the filesystem if we see an error.
947 */
948 if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
949 XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
950 xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp));
951 XFS_BUF_STALE(bp);
952 xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
953 /*
954 * This flag will be propagated to the trans-committed
955 * callback routines to let them know that the log-commit
956 * didn't succeed.
957 */
958 aborted = XFS_LI_ABORTED;
959 } else if (iclog->ic_state & XLOG_STATE_IOERROR) {
960 aborted = XFS_LI_ABORTED;
961 }
962 xlog_state_done_syncing(iclog, aborted);
963 if (!(XFS_BUF_ISASYNC(bp))) {
964 /*
965 * Corresponding psema() will be done in bwrite(). If we don't
966 * vsema() here, panic.
967 */
968 XFS_BUF_V_IODONESEMA(bp);
969 }
970} /* xlog_iodone */
971
972/*
973 * The bdstrat callback function for log bufs. This gives us a central
974 * place to trap bufs in case we get hit by a log I/O error and need to
975 * shutdown. Actually, in practice, even when we didn't get a log error,
976 * we transition the iclogs to IOERROR state *after* flushing all existing
977 * iclogs to disk. This is because we don't want any more transactions to be
978 * started or completed afterwards.
979 */
980STATIC int
981xlog_bdstrat_cb(struct xfs_buf *bp)
982{
983 xlog_in_core_t *iclog;
984
985 iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
986
987 if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
988 /* Note: for IRIX, bstrat will need a struct bdevsw passed in.
989 * Fix the following macro if the code is ever merged.
990 */
991 XFS_bdstrat(bp);
992 return 0;
993 }
994
995 xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
996 XFS_BUF_ERROR(bp, EIO);
997 XFS_BUF_STALE(bp);
998 xfs_biodone(bp);
999 return (XFS_ERROR(EIO));
1000
1001
1002}
1003
1004/*
1005 * Return size of each in-core log record buffer.
1006 *
1007 * Low-memory machines only get 2 16KB buffers; we don't want to waste
1008 * memory there. All other machines get at least 2 32KB buffers.
1009 * The threshold is hard coded because we only care about distinguishing
1010 * 32MB systems, not about the exact minimum memory size.
1011 *
1012 * If the filesystem blocksize is too large, we may need to choose a
1013 * larger size since the directory code currently logs entire blocks.
1014 */
1015
1016STATIC void
1017xlog_get_iclog_buffer_size(xfs_mount_t *mp,
1018 xlog_t *log)
1019{
1020 int size;
1021 int xhdrs;
1022
1023#if defined(DEBUG) || defined(XLOG_NOLOG)
1024 /*
1025 * When logbufs == 0, someone has disabled the log from the FSTAB
1026 * file. This is not a documented feature. We need to set xlog_debug
1027 * to zero (this deactivates the log) and set xlog_target to the
1028 * appropriate device. Only one filesystem may be affected as such
1029 * since this is just a performance hack to test what we might be able
1030 * to get if the log were not present.
1031 */
1032 if (mp->m_logbufs == 0) {
1033 xlog_debug = 0;
1034 xlog_target = log->l_targ;
1035 log->l_iclog_bufs = XLOG_MIN_ICLOGS;
1036 } else
1037#endif
1038 {
1039 /*
1040 * This is the normal path. If m_logbufs == -1, then the
1041 * admin has chosen to use the system defaults for logbuffers.
1042 */
1043 if (mp->m_logbufs == -1) {
1044 if (xfs_physmem <= btoc(128*1024*1024)) {
1045 log->l_iclog_bufs = XLOG_MIN_ICLOGS;
1046 } else if (xfs_physmem <= btoc(400*1024*1024)) {
1047 log->l_iclog_bufs = XLOG_MED_ICLOGS;
1048 } else {
1049 /* 256K with 32K bufs */
1050 log->l_iclog_bufs = XLOG_MAX_ICLOGS;
1051 }
1052 } else
1053 log->l_iclog_bufs = mp->m_logbufs;
1054
1055#if defined(DEBUG) || defined(XLOG_NOLOG)
1056 /* We are reactivating a filesystem after it was inactive */
1057 if (log->l_targ == xlog_target) {
1058 xlog_target = NULL;
1059 xlog_debug = 1;
1060 }
1061#endif
1062 }
1063
1064 /*
1065 * Buffer size passed in from mount system call.
1066 */
1067 if (mp->m_logbsize != -1) {
1068 size = log->l_iclog_size = mp->m_logbsize;
1069 log->l_iclog_size_log = 0;
1070 while (size != 1) {
1071 log->l_iclog_size_log++;
1072 size >>= 1;
1073 }
1074
1075 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
1076 /* # headers = size / 32K
1077 * one header holds cycles from 32K of data
1078 */
1079
1080 xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
1081 if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
1082 xhdrs++;
1083 log->l_iclog_hsize = xhdrs << BBSHIFT;
1084 log->l_iclog_heads = xhdrs;
1085 } else {
1086 ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
1087 log->l_iclog_hsize = BBSIZE;
1088 log->l_iclog_heads = 1;
1089 }
1090 return;
1091 }
1092
1093 /*
1094 * Special case machines that have less than 32MB of memory.
1095 * All machines with more memory use 32KB buffers.
1096 */
1097 if (xfs_physmem <= btoc(32*1024*1024)) {
1098 /* Don't change; min configuration */
1099 log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */
1100 log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
1101 } else {
1102 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */
1103 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1104 }
1105
1106 /* the default record buffer size is 16k or 32k, which needs one header sector */
1107 log->l_iclog_hsize = BBSIZE;
1108 log->l_iclog_heads = 1;
1109
1110 /*
1111 * For 16KB block sizes, we use 3 32KB buffers. For 32KB block sizes,
1112 * we use 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers.
1113 */
1114 if (mp->m_sb.sb_blocksize >= 16*1024) {
1115 log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
1116 log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
1117 if (mp->m_logbufs == -1) {
1118 switch (mp->m_sb.sb_blocksize) {
1119 case 16*1024: /* 16 KB */
1120 log->l_iclog_bufs = 3;
1121 break;
1122 case 32*1024: /* 32 KB */
1123 log->l_iclog_bufs = 4;
1124 break;
1125 case 64*1024: /* 64 KB */
1126 log->l_iclog_bufs = 8;
1127 break;
1128 default:
1129 xlog_panic("XFS: Invalid blocksize");
1130 break;
1131 }
1132 }
1133 }
1134} /* xlog_get_iclog_buffer_size */
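
/*
 * Worked example of the mount-specified path above (hypothetical values):
 * with logbsize = 64KB, the shift loop yields l_iclog_size_log = 16. On
 * a v2 log, xhdrs = 64KB / XLOG_HEADER_CYCLE_SIZE (32KB) = 2, so
 * l_iclog_hsize = 2 << BBSHIFT = 1024 bytes and l_iclog_heads = 2; a v1
 * log gets a single BBSIZE (512 byte) header instead.
 */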
1135
1136
1137/*
1138 * This routine initializes some of the log structure for a given mount point.
1139 * Its primary purpose is to fill in enough, so recovery can occur. However,
1140 * some other stuff may be filled in too.
1141 */
1142STATIC xlog_t *
1143xlog_alloc_log(xfs_mount_t *mp,
1144 xfs_buftarg_t *log_target,
1145 xfs_daddr_t blk_offset,
1146 int num_bblks)
1147{
1148 xlog_t *log;
1149 xlog_rec_header_t *head;
1150 xlog_in_core_t **iclogp;
1151 xlog_in_core_t *iclog, *prev_iclog=NULL;
1152 xfs_buf_t *bp;
1153 int i;
1154 int iclogsize;
1155
1156 log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
1157
1158 log->l_mp = mp;
1159 log->l_targ = log_target;
1160 log->l_logsize = BBTOB(num_bblks);
1161 log->l_logBBstart = blk_offset;
1162 log->l_logBBsize = num_bblks;
1163 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1164 log->l_flags |= XLOG_ACTIVE_RECOVERY;
1165
1166 log->l_prev_block = -1;
1167 ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0);
1168 /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
1169 log->l_last_sync_lsn = log->l_tail_lsn;
1170 log->l_curr_cycle = 1; /* 0 is bad since this is initial value */
1171 log->l_grant_reserve_cycle = 1;
1172 log->l_grant_write_cycle = 1;
1173
1174 if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) {
1175 log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
1176 ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
1177 /* for larger sector sizes, must have v2 or external log */
1178 ASSERT(log->l_sectbb_log == 0 ||
1179 log->l_logBBstart == 0 ||
1180 XFS_SB_VERSION_HASLOGV2(&mp->m_sb));
1181 ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
1182 }
1183 log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
1184
1185 xlog_get_iclog_buffer_size(mp, log);
1186
1187 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1188 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1189 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1190 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1191 ASSERT(XFS_BUF_ISBUSY(bp));
1192 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
1193 log->l_xbuf = bp;
1194
1195 spinlock_init(&log->l_icloglock, "iclog");
1196 spinlock_init(&log->l_grant_lock, "grhead_iclog");
1197 initnsema(&log->l_flushsema, 0, "ic-flush");
1198 xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
1199
1200 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1201 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
1202
1203 iclogp = &log->l_iclog;
1204 /*
1205 * The amount of memory to allocate for the iclog structure is
1206 * rather funky due to the way the structure is defined. It is
1207 * done this way so that we can use different sizes for machines
1208 * with different amounts of memory. See the definition of
1209 * xlog_in_core_t in xfs_log_priv.h for details.
1210 */
1211 iclogsize = log->l_iclog_size;
1212 ASSERT(log->l_iclog_size >= 4096);
1213 for (i=0; i < log->l_iclog_bufs; i++) {
1214 *iclogp = (xlog_in_core_t *)
1215 kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
1216 iclog = *iclogp;
1217 iclog->hic_data = (xlog_in_core_2_t *)
1218 kmem_zalloc(iclogsize, KM_SLEEP);
1219
1220 iclog->ic_prev = prev_iclog;
1221 prev_iclog = iclog;
1222 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1223
1224 head = &iclog->ic_header;
1225 memset(head, 0, sizeof(xlog_rec_header_t));
1226 INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1227 INT_SET(head->h_version, ARCH_CONVERT,
1228 XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1229 INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
1230 /* new fields */
1231 INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
1232 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1233
1234 bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
1235 XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
1236 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1237 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1238 iclog->ic_bp = bp;
1239
1240 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
1241 iclog->ic_state = XLOG_STATE_ACTIVE;
1242 iclog->ic_log = log;
1243 iclog->ic_callback_tail = &(iclog->ic_callback);
1244 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
1245
1246 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1247 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
1248 sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
1249 sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
1250
1251 iclogp = &iclog->ic_next;
1252 }
1253 *iclogp = log->l_iclog; /* complete ring */
1254 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1255
1256 return log;
1257} /* xlog_alloc_log */
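
/*
 * To make the ring construction above concrete: with l_iclog_bufs == 3,
 * the loop produces ic_next links A -> B -> C, the final *iclogp
 * assignment closes the circle with C -> A, and the last line rewrites
 * A->ic_prev to point at C so the ic_prev pointers form a ring as well.
 */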
1258
1259
1260/*
1261 * Write out the commit record of a transaction associated with the given
1262 * ticket. Return the lsn of the commit record.
1263 */
1264STATIC int
1265xlog_commit_record(xfs_mount_t *mp,
1266 xlog_ticket_t *ticket,
1267 xlog_in_core_t **iclog,
1268 xfs_lsn_t *commitlsnp)
1269{
1270 int error;
1271 xfs_log_iovec_t reg[1];
1272
1273 reg[0].i_addr = NULL;
1274 reg[0].i_len = 0;
1275
1276 ASSERT_ALWAYS(iclog);
1277 if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
1278 iclog, XLOG_COMMIT_TRANS))) {
1279 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
1280 }
1281 return (error);
1282} /* xlog_commit_record */
1283
1284
1285/*
1286 * Push on the buffer cache code if we ever use more than 75% of the on-disk
1287 * log space. This code pushes on the lsn which would supposedly free up
1288 * the 25% which we want to leave free. We may need to adopt a policy which
1289 * pushes on an lsn which is further along in the log once we reach the high
1290 * water mark. In this manner, we would be creating a low water mark.
1291 */
1292void
1293xlog_grant_push_ail(xfs_mount_t *mp,
1294 int need_bytes)
1295{
1296 xlog_t *log = mp->m_log; /* pointer to the log */
1297 xfs_lsn_t tail_lsn; /* lsn of the log tail */
1298 xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */
1299 int free_blocks; /* free blocks left to write to */
1300 int free_bytes; /* free bytes left to write to */
1301 int threshold_block; /* block in lsn we'd like to be at */
1302 int threshold_cycle; /* lsn cycle we'd like to be at */
1303 int free_threshold;
1304 SPLDECL(s);
1305
1306 ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
1307
1308 s = GRANT_LOCK(log);
1309 free_bytes = xlog_space_left(log,
1310 log->l_grant_reserve_cycle,
1311 log->l_grant_reserve_bytes);
1312 tail_lsn = log->l_tail_lsn;
1313 free_blocks = BTOBBT(free_bytes);
1314
1315 /*
1316 * Set the threshold for the minimum number of free blocks in the
1317 * log to the maximum of what the caller needs, one quarter of the
1318 * log, and 256 blocks.
1319 */
1320 free_threshold = BTOBB(need_bytes);
1321 free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
1322 free_threshold = MAX(free_threshold, 256);
1323 if (free_blocks < free_threshold) {
1324 threshold_block = BLOCK_LSN(tail_lsn) + free_threshold;
1325 threshold_cycle = CYCLE_LSN(tail_lsn);
1326 if (threshold_block >= log->l_logBBsize) {
1327 threshold_block -= log->l_logBBsize;
1328 threshold_cycle += 1;
1329 }
1330 ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle,
1331 threshold_block);
1332
1333 /* Don't pass in an lsn greater than the lsn of the last
1334 * log record known to be on disk.
1335 */
1336 if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0)
1337 threshold_lsn = log->l_last_sync_lsn;
1338 }
1339 GRANT_UNLOCK(log, s);
1340
1341 /*
1342 * Get the transaction layer to kick the dirty buffers out to
1343 * disk asynchronously. No point in trying to do this if
1344 * the filesystem is shutting down.
1345 */
1346 if (threshold_lsn &&
1347 !XLOG_FORCED_SHUTDOWN(log))
1348 xfs_trans_push_ail(mp, threshold_lsn);
1349} /* xlog_grant_push_ail */
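
/*
 * Worked example of the threshold logic above (hypothetical numbers):
 * for a log of 8192 basic blocks and a small need_bytes, free_threshold
 * = MAX(BTOBB(need_bytes), 8192 >> 2, 256) = 2048 blocks. With the tail
 * at cycle 3, block 7000, the push target becomes block 9048, which
 * wraps to cycle 4, block 856 -- and is then clamped to l_last_sync_lsn
 * if that lsn is older.
 */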
1350
1351
1352/*
1353 * Flush out the in-core log (iclog) to the on-disk log in an asynchronous
1354 * fashion. Before this is called, the current iclog ptr in the log should
1355 * have been moved to point to the next available iclog. This allows further
1356 * writes to continue while this code syncs out an iclog ready to go.
1357 * Before an in-core log can be written out, the data section must be scanned
1358 * to save away the 1st word of each BBSIZE block into the header. We replace
1359 * it with the current cycle count. Each BBSIZE block is tagged with the
1360 * cycle count because there is an implicit assumption that drives will
1361 * guarantee that entire 512 byte blocks get written at once. In other words,
1362 * we can't have part of a 512 byte block written and part not written. By
1363 * tagging each block, we will know which blocks are valid when recovering
1364 * after an unclean shutdown.
1365 *
1366 * This routine is single threaded on the iclog. No other thread can be in
1367 * this routine with the same iclog. Changing the contents of the iclog can
1368 * therefore be done without grabbing the state machine lock. Updating the global
1369 * log will require grabbing the lock though.
1370 *
1371 * The entire log manager uses a logical block numbering scheme. Only
1372 * log_sync (and then only bwrite()) know about the fact that the log may
1373 * not start with block zero on a given device. The log block start offset
1374 * is added immediately before calling bwrite().
1375 */
1376
1377int
1378xlog_sync(xlog_t *log,
1379 xlog_in_core_t *iclog)
1380{
1381 xfs_caddr_t dptr; /* pointer to byte sized element */
1382 xfs_buf_t *bp;
1383 int i, ops;
1384 uint count; /* byte count of bwrite */
1385 uint count_init; /* initial count before roundup */
1386 int roundoff; /* roundoff to BB or stripe */
1387 int split = 0; /* split write into two regions */
1388 int error;
1389 SPLDECL(s);
1390 int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb);
1391
1392 XFS_STATS_INC(xs_log_writes);
1393 ASSERT(iclog->ic_refcnt == 0);
1394
1395 /* Add for LR header */
1396 count_init = log->l_iclog_hsize + iclog->ic_offset;
1397
1398 /* Round out the log write size */
1399 if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
1400 /* we have a v2 stripe unit to use */
1401 count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
1402 } else {
1403 count = BBTOB(BTOBB(count_init));
1404 }
1405 roundoff = count - count_init;
1406 ASSERT(roundoff >= 0);
1407 ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
1408 roundoff < log->l_mp->m_sb.sb_logsunit)
1409 ||
1410 (log->l_mp->m_sb.sb_logsunit <= 1 &&
1411 roundoff < BBTOB(1)));
1412
1413 /* move grant heads by roundoff in sync */
1414 s = GRANT_LOCK(log);
1415 XLOG_GRANT_ADD_SPACE(log, roundoff, 'w');
1416 XLOG_GRANT_ADD_SPACE(log, roundoff, 'r');
1417 GRANT_UNLOCK(log, s);
1418
1419 /* put cycle number in every block */
1420 xlog_pack_data(log, iclog, roundoff);
1421
1422 /* real byte length */
1423 if (v2) {
1424 INT_SET(iclog->ic_header.h_len,
1425 ARCH_CONVERT,
1426 iclog->ic_offset + roundoff);
1427 } else {
1428 INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);
1429 }
1430
1431 /* put ops count in correct order */
1432 ops = iclog->ic_header.h_num_logops;
1433 INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops);
1434
1435 bp = iclog->ic_bp;
1436 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
1437 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
1438 XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)));
1439
1440 XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
1441
1442 /* Do we need to split this write into 2 parts? */
1443 if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
1444 split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
1445 count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
1446 iclog->ic_bwritecnt = 2; /* split into 2 writes */
1447 } else {
1448 iclog->ic_bwritecnt = 1;
1449 }
1450 XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
1451 XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */
1452 XFS_BUF_BUSY(bp);
1453 XFS_BUF_ASYNC(bp);
1454 /*
1455 * Do a disk write cache flush for the log block.
1456 * This is a bit of a sledgehammer, it would be better
1457 * to use a tag barrier here that just prevents reordering.
1458 * It may not be needed to flush the first split block in the log wrap
1459 * case, but do it anyway to be safe. -AK
1460 */
1461 if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
1462 XFS_BUF_FLUSH(bp);
1463
1464 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1465 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1466
1467 xlog_verify_iclog(log, iclog, count, B_TRUE);
1468
1469 /* account for log which doesn't start at block #0 */
1470 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1471 /*
1472 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
1473 * is shutting down.
1474 */
1475 XFS_BUF_WRITE(bp);
1476
1477 if ((error = XFS_bwrite(bp))) {
1478 xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
1479 XFS_BUF_ADDR(bp));
1480 return (error);
1481 }
1482 if (split) {
1483 bp = iclog->ic_log->l_xbuf;
1484 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
1485 (unsigned long)1);
1486 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
1487 XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
1488 XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
1489 (__psint_t)count), split);
1490 XFS_BUF_SET_FSPRIVATE(bp, iclog);
1491 XFS_BUF_BUSY(bp);
1492 XFS_BUF_ASYNC(bp);
1493 if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
1494 XFS_BUF_FLUSH(bp);
1495 dptr = XFS_BUF_PTR(bp);
1496 /*
1497 * Bump the cycle numbers at the start of each block
1498 * since this part of the buffer is at the start of
1499 * a new cycle. Watch out for the header magic number
1500 * case, though.
1501 */
1502 for (i=0; i<split; i += BBSIZE) {
1503 INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
1504 if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
1505 INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
1506 dptr += BBSIZE;
1507 }
1508
1509 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1510 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
1511
1512 /* account for internal log which doesn't start at block #0 */
1513 XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
1514 XFS_BUF_WRITE(bp);
1515 if ((error = XFS_bwrite(bp))) {
1516 xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
1517 bp, XFS_BUF_ADDR(bp));
1518 return (error);
1519 }
1520 }
1521 return (0);
1522} /* xlog_sync */
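
/*
 * Worked example of the round-up above (hypothetical numbers): on a v2
 * log with sb_logsunit = 4096, an iclog with l_iclog_hsize + ic_offset =
 * 5000 bytes is rounded up to count = 8192, so roundoff = 3192 bytes is
 * added to both grant heads and folded into h_len. A v1 log only rounds
 * up to the next BBSIZE (512 byte) boundary, giving count = 5120.
 */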
1523
1524
1525/*
1526 * Unallocate a log structure
1527 */
1528void
1529xlog_unalloc_log(xlog_t *log)
1530{
1531 xlog_in_core_t *iclog, *next_iclog;
1532 xlog_ticket_t *tic, *next_tic;
1533 int i;
1534
1535
1536 iclog = log->l_iclog;
1537 for (i=0; i<log->l_iclog_bufs; i++) {
1538 sv_destroy(&iclog->ic_forcesema);
1539 sv_destroy(&iclog->ic_writesema);
1540 xfs_buf_free(iclog->ic_bp);
1541#ifdef XFS_LOG_TRACE
1542 if (iclog->ic_trace != NULL) {
1543 ktrace_free(iclog->ic_trace);
1544 }
1545#endif
1546 next_iclog = iclog->ic_next;
1547 kmem_free(iclog->hic_data, log->l_iclog_size);
1548 kmem_free(iclog, sizeof(xlog_in_core_t));
1549 iclog = next_iclog;
1550 }
1551 freesema(&log->l_flushsema);
1552 spinlock_destroy(&log->l_icloglock);
1553 spinlock_destroy(&log->l_grant_lock);
1554
1555 /* XXXsup take a look at this again. */
1556 if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
1557 !XLOG_FORCED_SHUTDOWN(log)) {
1558 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1559 "xlog_unalloc_log: (cnt: %d, total: %d)",
1560 log->l_ticket_cnt, log->l_ticket_tcnt);
1561 /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
1562
1563 } else {
1564 tic = log->l_unmount_free;
1565 while (tic) {
1566 next_tic = tic->t_next;
1567 kmem_free(tic, NBPP);
1568 tic = next_tic;
1569 }
1570 }
1571 xfs_buf_free(log->l_xbuf);
1572#ifdef XFS_LOG_TRACE
1573 if (log->l_trace != NULL) {
1574 ktrace_free(log->l_trace);
1575 }
1576 if (log->l_grant_trace != NULL) {
1577 ktrace_free(log->l_grant_trace);
1578 }
1579#endif
1580 log->l_mp->m_log = NULL;
1581 kmem_free(log, sizeof(xlog_t));
1582} /* xlog_unalloc_log */
1583
1584/*
1585 * Update counters atomically now that memcpy is done.
1586 */
1587/* ARGSUSED */
1588static inline void
1589xlog_state_finish_copy(xlog_t *log,
1590 xlog_in_core_t *iclog,
1591 int record_cnt,
1592 int copy_bytes)
1593{
1594 SPLDECL(s);
1595
1596 s = LOG_LOCK(log);
1597
1598 iclog->ic_header.h_num_logops += record_cnt;
1599 iclog->ic_offset += copy_bytes;
1600
1601 LOG_UNLOCK(log, s);
1602} /* xlog_state_finish_copy */
1603
1604
1605
1606
1607/*
1608 * Write some region out to in-core log
1609 *
1610 * This will be called when writing externally provided regions or when
1611 * writing out a commit record for a given transaction.
1612 *
1613 * General algorithm:
1614 * 1. Find total length of this write. This may include adding to the
1615 * lengths passed in.
1616 * 2. Check whether we violate the ticket's reservation.
1617 * 3. While writing to this iclog
1618 * A. Reserve as much space in this iclog as we can get
1619 * B. If this is first write, save away start lsn
1620 * C. While writing this region:
1621 * 1. If first write of transaction, write start record
1622 * 2. Write log operation header (header per region)
1623 * 3. Find out if we can fit entire region into this iclog
1624 * 4. Potentially, verify destination memcpy ptr
1625 * 5. Memcpy (partial) region
1626 * 6. If partial copy, release iclog; otherwise, continue
1627 * copying more regions into current iclog
1628 * 4. Mark want sync bit (in simulation mode)
1629 * 5. Release iclog for potential flush to on-disk log.
1630 *
1631 * ERRORS:
1632 * 1. Panic if reservation is overrun. This should never happen since
1633 * reservation amounts are generated internal to the filesystem.
1634 * NOTES:
1635 * 1. Tickets are single threaded data structures.
1636 * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
1637 * syncing routine. When a single log_write region needs to span
1638 * multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
1639 * on all log operation writes which don't contain the end of the
1640 * region. The XLOG_END_TRANS bit is used for the in-core log
1641 * operation which contains the end of the continued log_write region.
1642 * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
1643 * we don't really know exactly how much space will be used. As a result,
1644 * we don't update ic_offset until the end when we know exactly how many
1645 * bytes have been written out.
1646 */
1647int
1648xlog_write(xfs_mount_t * mp,
1649 xfs_log_iovec_t reg[],
1650 int nentries,
1651 xfs_log_ticket_t tic,
1652 xfs_lsn_t *start_lsn,
1653 xlog_in_core_t **commit_iclog,
1654 uint flags)
1655{
1656 xlog_t *log = mp->m_log;
1657 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1658 xlog_op_header_t *logop_head; /* ptr to log operation header */
1659 xlog_in_core_t *iclog; /* ptr to current in-core log */
1660 __psint_t ptr; /* copy address into data region */
1661 int len; /* # xlog_write() bytes still to copy */
1662 int index; /* region index currently copying */
1663 int log_offset; /* offset (from 0) into data region */
1664 int start_rec_copy; /* # bytes to copy for start record */
1665 int partial_copy; /* did we split a region? */
1666 int partial_copy_len;/* # bytes copied if split region */
1667 int need_copy; /* # bytes need to memcpy this region */
1668 int copy_len; /* # bytes actually memcpy'ing */
1669 int copy_off; /* # bytes from entry start */
1670 int contwr; /* continued write of in-core log? */
1671 int error;
1672 int record_cnt = 0, data_cnt = 0;
1673
1674 partial_copy_len = partial_copy = 0;
1675
1676 /* Calculate potential maximum space. Each region gets its own
1677 * xlog_op_header_t and may need to be double word aligned.
1678 */
1679 len = 0;
1680 if (ticket->t_flags & XLOG_TIC_INITED) /* acct for start rec of xact */
1681 len += sizeof(xlog_op_header_t);
1682
1683 for (index = 0; index < nentries; index++) {
1684 len += sizeof(xlog_op_header_t); /* each region gets >= 1 */
1685 len += reg[index].i_len;
1686 }
1687 contwr = *start_lsn = 0;
1688
1689 if (ticket->t_curr_res < len) {
1690#ifdef DEBUG
1691 xlog_panic(
1692 "xfs_log_write: reservation ran out. Need to up reservation");
1693#else
1694 /* Customer configurable panic */
1695 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1696 "xfs_log_write: reservation ran out. Need to up reservation");
1697 /* If we did not panic, shutdown the filesystem */
1698 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
1699#endif
1700 } else
1701 ticket->t_curr_res -= len;
1702
1703 for (index = 0; index < nentries; ) {
1704 if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
1705 &contwr, &log_offset)))
1706 return (error);
1707
1708 ASSERT(log_offset <= iclog->ic_size - 1);
1709 ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
1710
1711 /* start_lsn is the first lsn written to. That's all we need. */
1712 if (! *start_lsn)
1713 *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
1714
1715 /* This loop writes out as many regions as can fit in the amount
1716 * of space which was allocated by xlog_state_get_iclog_space().
1717 */
1718 while (index < nentries) {
1719 ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
1720 ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
1721 start_rec_copy = 0;
1722
1723 /* If first write for transaction, insert start record.
1724 * We can't be trying to commit if we are inited. We can't
1725 * have any "partial_copy" if we are inited.
1726 */
1727 if (ticket->t_flags & XLOG_TIC_INITED) {
1728 logop_head = (xlog_op_header_t *)ptr;
1729 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
1730 logop_head->oh_clientid = ticket->t_clientid;
1731 logop_head->oh_len = 0;
1732 logop_head->oh_flags = XLOG_START_TRANS;
1733 logop_head->oh_res2 = 0;
1734 ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */
1735 record_cnt++;
1736
1737 start_rec_copy = sizeof(xlog_op_header_t);
1738 xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
1739 }
1740
1741 /* Copy log operation header directly into data section */
1742 logop_head = (xlog_op_header_t *)ptr;
1743 INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
1744 logop_head->oh_clientid = ticket->t_clientid;
1745 logop_head->oh_res2 = 0;
1746
1747 /* header copied directly */
1748 xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
1749
1750 /* are we copying a commit or unmount record? */
1751 logop_head->oh_flags = flags;
1752
1753 /*
1754 * We've seen logs corrupted with bad transaction client
1755 * ids. This makes sure that XFS doesn't generate them here.
1756 * If a bad one shows up, turn it into an EIO and shut down the filesystem.
1757 */
1758 switch (logop_head->oh_clientid) {
1759 case XFS_TRANSACTION:
1760 case XFS_VOLUME:
1761 case XFS_LOG:
1762 break;
1763 default:
1764 xfs_fs_cmn_err(CE_WARN, mp,
1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1766 logop_head->oh_clientid, tic);
1767 return XFS_ERROR(EIO);
1768 }
1769
1770 /* Partial write last time? => (partial_copy != 0)
1771 * need_copy is the amount we'd like to copy if everything could
1772 * fit in the current memcpy.
1773 */
1774 need_copy = reg[index].i_len - partial_copy_len;
1775
1776 copy_off = partial_copy_len;
1777 if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
1778 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
1779 if (partial_copy)
1780 logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
1781 partial_copy_len = partial_copy = 0;
1782 } else { /* partial write */
1783 copy_len = iclog->ic_size - log_offset;
1784 INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
1785 logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
1786 if (partial_copy)
1787 logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
1788 partial_copy_len += copy_len;
1789 partial_copy++;
1790 len += sizeof(xlog_op_header_t); /* from splitting of region */
1791 /* account for new log op header */
1792 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1793 }
1794 xlog_verify_dest_ptr(log, ptr);
1795
1796 /* copy region */
1797 ASSERT(copy_len >= 0);
1798 memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len);
1799 xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
1800
1801 /* make copy_len total bytes copied, including headers */
1802 copy_len += start_rec_copy + sizeof(xlog_op_header_t);
1803 record_cnt++;
1804 data_cnt += contwr ? copy_len : 0;
1805 if (partial_copy) { /* copied partial region */
1806 /* already marked WANT_SYNC by xlog_state_get_iclog_space */
1807 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1808 record_cnt = data_cnt = 0;
1809 if ((error = xlog_state_release_iclog(log, iclog)))
1810 return (error);
1811 break; /* don't increment index */
1812 } else { /* copied entire region */
1813 index++;
1814 partial_copy_len = partial_copy = 0;
1815
1816 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1817 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1818 record_cnt = data_cnt = 0;
1819 xlog_state_want_sync(log, iclog);
1820 if (commit_iclog) {
1821 ASSERT(flags & XLOG_COMMIT_TRANS);
1822 *commit_iclog = iclog;
1823 } else if ((error = xlog_state_release_iclog(log, iclog)))
1824 return (error);
1825 if (index == nentries)
1826 return 0; /* we are done */
1827 else
1828 break;
1829 }
1830 } /* if (partial_copy) */
1831 } /* while (index < nentries) */
1832 } /* for (index = 0; index < nentries; ) */
1833 ASSERT(len == 0);
1834
1835 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1836 if (commit_iclog) {
1837 ASSERT(flags & XLOG_COMMIT_TRANS);
1838 *commit_iclog = iclog;
1839 return 0;
1840 }
1841 return (xlog_state_release_iclog(log, iclog));
1842} /* xlog_write */
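
/*
 * Worked example of the length calculation above (hypothetical sizes,
 * assuming a 12-byte xlog_op_header_t): for a freshly INITED ticket
 * writing two regions of 128 and 64 bytes,
 *
 *	len = 12 (start record) + 2 * 12 (op headers) + 128 + 64 = 228
 *
 * and every region split across iclogs adds one more op header to len
 * and subtracts its size from the ticket's current reservation.
 */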
1843
1844
1845/*****************************************************************************
1846 *
1847 * State Machine functions
1848 *
1849 *****************************************************************************
1850 */
1851
1852/* Clean iclogs starting from the head. This ordering must be
1853 * maintained, so an iclog doesn't become ACTIVE beyond one that
1854 * is SYNCING. This is also required to maintain the notion that we use
1855 * a counting semaphore to hold off would-be writers to the log when every
1856 * iclog is trying to sync to disk.
1857 *
1858 * State Change: DIRTY -> ACTIVE
1859 */
1860void
1861xlog_state_clean_log(xlog_t *log)
1862{
1863 xlog_in_core_t *iclog;
1864 int changed = 0;
1865
1866 iclog = log->l_iclog;
1867 do {
1868 if (iclog->ic_state == XLOG_STATE_DIRTY) {
1869 iclog->ic_state = XLOG_STATE_ACTIVE;
1870 iclog->ic_offset = 0;
1871 iclog->ic_callback = NULL; /* don't need to free */
1872 /*
1873 * If the number of ops in this iclog indicate it just
1874 * contains the dummy transaction, we can
1875 * change state into IDLE (the second time around).
1876 * Otherwise we should change the state into
1877 * NEED a dummy.
1878 * We don't need to cover the dummy.
1879 */
1880 if (!changed &&
1881 (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
1882 changed = 1;
1883 } else {
1884 /*
1885 * We have two dirty iclogs so start over
1886 * This could also be num of ops indicates
1887 * this is not the dummy going out.
1888 */
1889 changed = 2;
1890 }
1891 iclog->ic_header.h_num_logops = 0;
1892 memset(iclog->ic_header.h_cycle_data, 0,
1893 sizeof(iclog->ic_header.h_cycle_data));
1894 iclog->ic_header.h_lsn = 0;
1895 } else if (iclog->ic_state == XLOG_STATE_ACTIVE)
1896 /* do nothing */;
1897 else
1898 break; /* stop cleaning */
1899 iclog = iclog->ic_next;
1900 } while (iclog != log->l_iclog);
1901
1902 /* log is locked when we are called */
1903 /*
1904 * Change state for the dummy log recording.
1905 * We usually go to NEED. But we go to NEED2 if changed indicates
1906 * that we are done writing the dummy record.
1907 * If we are done with the second dummy record (DONE2), then
1908 * we go to IDLE.
1909 */
1910 if (changed) {
1911 switch (log->l_covered_state) {
1912 case XLOG_STATE_COVER_IDLE:
1913 case XLOG_STATE_COVER_NEED:
1914 case XLOG_STATE_COVER_NEED2:
1915 log->l_covered_state = XLOG_STATE_COVER_NEED;
1916 break;
1917
1918 case XLOG_STATE_COVER_DONE:
1919 if (changed == 1)
1920 log->l_covered_state = XLOG_STATE_COVER_NEED2;
1921 else
1922 log->l_covered_state = XLOG_STATE_COVER_NEED;
1923 break;
1924
1925 case XLOG_STATE_COVER_DONE2:
1926 if (changed == 1)
1927 log->l_covered_state = XLOG_STATE_COVER_IDLE;
1928 else
1929 log->l_covered_state = XLOG_STATE_COVER_NEED;
1930 break;
1931
1932 default:
1933 ASSERT(0);
1934 }
1935 }
1936} /* xlog_state_clean_log */
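
/*
 * Taken together with xfs_log_need_covered() above, the covering state
 * typically walks (on an otherwise idle log): COVER_NEED -> COVER_DONE
 * when the first dummy record goes out, COVER_DONE -> COVER_NEED2 here
 * once that iclog is cleaned with changed == 1, COVER_NEED2 ->
 * COVER_DONE2 for the second dummy record, and COVER_DONE2 ->
 * COVER_IDLE here when changed == 1 again.
 */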
1937
1938STATIC xfs_lsn_t
1939xlog_get_lowest_lsn(
1940 xlog_t *log)
1941{
1942 xlog_in_core_t *lsn_log;
1943 xfs_lsn_t lowest_lsn, lsn;
1944
1945 lsn_log = log->l_iclog;
1946 lowest_lsn = 0;
1947 do {
1948 if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
1949 lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
1950 if ((lsn && !lowest_lsn) ||
1951 (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
1952 lowest_lsn = lsn;
1953 }
1954 }
1955 lsn_log = lsn_log->ic_next;
1956 } while (lsn_log != log->l_iclog);
1957 return(lowest_lsn);
1958}
1959
1960
1961STATIC void
1962xlog_state_do_callback(
1963 xlog_t *log,
1964 int aborted,
1965 xlog_in_core_t *ciclog)
1966{
1967 xlog_in_core_t *iclog;
1968 xlog_in_core_t *first_iclog; /* used to know when we've
1969 * processed all iclogs once */
1970 xfs_log_callback_t *cb, *cb_next;
1971 int flushcnt = 0;
1972 xfs_lsn_t lowest_lsn;
1973 int ioerrors; /* counter: iclogs with errors */
1974 int loopdidcallbacks; /* flag: inner loop did callbacks*/
1975 int funcdidcallbacks; /* flag: function did callbacks */
1976 int repeats; /* for issuing console warnings if
1977 * looping too many times */
1978 SPLDECL(s);
1979
1980 s = LOG_LOCK(log);
1981 first_iclog = iclog = log->l_iclog;
1982 ioerrors = 0;
1983 funcdidcallbacks = 0;
1984 repeats = 0;
1985
1986 do {
1987 /*
1988 * Scan all iclogs starting with the one pointed to by the
1989 * log. Reset this starting point each time the log is
1990 * unlocked (during callbacks).
1991 *
1992 * Keep looping through iclogs until one full pass is made
1993 * without running any callbacks.
1994 */
1995 first_iclog = log->l_iclog;
1996 iclog = log->l_iclog;
1997 loopdidcallbacks = 0;
1998 repeats++;
1999
2000 do {
2001
2002 /* skip all iclogs in the ACTIVE & DIRTY states */
2003 if (iclog->ic_state &
2004 (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
2005 iclog = iclog->ic_next;
2006 continue;
2007 }
2008
2009 /*
2010 * Between marking a filesystem SHUTDOWN and stopping
2011 * the log, we do flush all iclogs to disk (if there
2012 * wasn't a log I/O error). So, we do want things to
2013 * go smoothly in case of just a SHUTDOWN w/o a
2014 * LOG_IO_ERROR.
2015 */
2016 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
2017 /*
2018 * Can only perform callbacks in order. Since
2019 * this iclog is not in the DONE_SYNC/
2020 * DO_CALLBACK state, we skip the rest and
2021 * just try to clean up. If we set our iclog
2022 * to DO_CALLBACK, we will not process it when
2023 * we retry since a previous iclog is in the
2024 * CALLBACK and the state cannot change since
2025 * we are holding the LOG_LOCK.
2026 */
2027 if (!(iclog->ic_state &
2028 (XLOG_STATE_DONE_SYNC |
2029 XLOG_STATE_DO_CALLBACK))) {
2030 if (ciclog && (ciclog->ic_state ==
2031 XLOG_STATE_DONE_SYNC)) {
2032 ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
2033 }
2034 break;
2035 }
2036 /*
2037 * We now have an iclog that is in either the
2038 * DO_CALLBACK or DONE_SYNC state. The other
2039 * states (WANT_SYNC, SYNCING, or CALLBACK) were
2040 * caught by the if above and are skipped here;
2041 * that is, we aren't doing their callbacks (see
2042 * the if above).
2043 */
2044
2045 /*
2046 * We will do one more check here to see if we
2047 * have chased our tail around.
2048 */
2049
2050 lowest_lsn = xlog_get_lowest_lsn(log);
2051 if (lowest_lsn && (
2052 XFS_LSN_CMP(
2053 lowest_lsn,
2054 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
2055 )<0)) {
2056 iclog = iclog->ic_next;
2057 continue; /* Leave this iclog for
2058 * another thread */
2059 }
2060
2061 iclog->ic_state = XLOG_STATE_CALLBACK;
2062
2063 LOG_UNLOCK(log, s);
2064
2065 /* l_last_sync_lsn field protected by
2066 * GRANT_LOCK. Don't worry about iclog's lsn.
2067 * No one else can be here except us.
2068 */
2069 s = GRANT_LOCK(log);
2070 ASSERT(XFS_LSN_CMP(
2071 log->l_last_sync_lsn,
2072 INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT)
2073 )<=0);
2074 log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
2075 GRANT_UNLOCK(log, s);
2076
2077 /*
2078 * Keep processing entries in the callback list
2079 * until we come around and it is empty. We
2080 * need to atomically see that the list is
2081 * empty and change the state to DIRTY so that
2082 * we don't miss any more callbacks being added.
2083 */
2084 s = LOG_LOCK(log);
2085 } else {
2086 ioerrors++;
2087 }
2088 cb = iclog->ic_callback;
2089
2090 while (cb != 0) {
2091 iclog->ic_callback_tail = &(iclog->ic_callback);
2092 iclog->ic_callback = NULL;
2093 LOG_UNLOCK(log, s);
2094
2095 /* perform callbacks in the order given */
2096 for (; cb != 0; cb = cb_next) {
2097 cb_next = cb->cb_next;
2098 cb->cb_func(cb->cb_arg, aborted);
2099 }
2100 s = LOG_LOCK(log);
2101 cb = iclog->ic_callback;
2102 }
2103
2104 loopdidcallbacks++;
2105 funcdidcallbacks++;
2106
2107 ASSERT(iclog->ic_callback == 0);
2108 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2109 iclog->ic_state = XLOG_STATE_DIRTY;
2110
2111 /*
2112 * Transition from DIRTY to ACTIVE if applicable.
2113 * NOP if STATE_IOERROR.
2114 */
2115 xlog_state_clean_log(log);
2116
2117 /* wake up threads waiting in xfs_log_force() */
2118 sv_broadcast(&iclog->ic_forcesema);
2119
2120 iclog = iclog->ic_next;
2121 } while (first_iclog != iclog);
2122 if (repeats && (repeats % 10) == 0) {
2123 xfs_fs_cmn_err(CE_WARN, log->l_mp,
2124 "xlog_state_do_callback: looping %d", repeats);
2125 }
2126 } while (!ioerrors && loopdidcallbacks);
2127
2128 /*
2129 * make one last gasp attempt to see if iclogs are being left in
2130 * limbo.
2131 */
2132#ifdef DEBUG
2133 if (funcdidcallbacks) {
2134 first_iclog = iclog = log->l_iclog;
2135 do {
2136 ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
2137 /*
2138 * Terminate the loop if iclogs are found in states
2139 * which will cause other threads to clean up iclogs.
2140 *
2141 * SYNCING - i/o completion will go through logs
2142 * DONE_SYNC - interrupt thread should be waiting for
2143 * LOG_LOCK
2144 * IOERROR - give up hope all ye who enter here
2145 */
2146 if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
2147 iclog->ic_state == XLOG_STATE_SYNCING ||
2148 iclog->ic_state == XLOG_STATE_DONE_SYNC ||
2149 iclog->ic_state == XLOG_STATE_IOERROR )
2150 break;
2151 iclog = iclog->ic_next;
2152 } while (first_iclog != iclog);
2153 }
2154#endif
2155
2156 if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
2157 flushcnt = log->l_flushcnt;
2158 log->l_flushcnt = 0;
2159 }
2160 LOG_UNLOCK(log, s);
2161 while (flushcnt--)
2162 vsema(&log->l_flushsema);
2163} /* xlog_state_do_callback */
2164
2165
2166/*
2167 * Finish transitioning this iclog to the dirty state.
2168 *
2169 * Make sure that we completely execute this routine only when this is
2170 * the last call to the iclog. There is a good chance that iclog flushes,
2171 * when we reach the end of the physical log, get turned into 2 separate
2172 * calls to bwrite. Hence, one iclog flush could generate two calls to this
2173 * routine. By using the reference count bwritecnt, we guarantee that only
2174 * the second completion goes through.
2175 *
2176 * Callbacks could take time, so they are done outside the scope of the
2177 * global state machine log lock. Assume that the calls to cvsema won't
2178	 * take a long time. At least we know they won't sleep.
2179 */
2180void
2181xlog_state_done_syncing(
2182 xlog_in_core_t *iclog,
2183 int aborted)
2184{
2185 xlog_t *log = iclog->ic_log;
2186 SPLDECL(s);
2187
2188 s = LOG_LOCK(log);
2189
2190 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
2191 iclog->ic_state == XLOG_STATE_IOERROR);
2192 ASSERT(iclog->ic_refcnt == 0);
2193 ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
2194
2195
2196 /*
2197 * If we got an error, either on the first buffer, or in the case of
2198 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
2199 * and none should ever be attempted to be written to disk
2200 * again.
2201 */
2202 if (iclog->ic_state != XLOG_STATE_IOERROR) {
2203 if (--iclog->ic_bwritecnt == 1) {
2204 LOG_UNLOCK(log, s);
2205 return;
2206 }
2207 iclog->ic_state = XLOG_STATE_DONE_SYNC;
2208 }
2209
2210 /*
2211 * Someone could be sleeping prior to writing out the next
2212	 * iclog buffer; we wake them all. One will get to do the
2213	 * I/O, the others get to wait for the result.
2214 */
2215 sv_broadcast(&iclog->ic_writesema);
2216 LOG_UNLOCK(log, s);
2217 xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
2218} /* xlog_state_done_syncing */
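
The bwritecnt dance above is easy to get wrong, so here is a minimal userspace sketch of the same idea, under simplified assumptions (a bare struct, no locking, no state machine); the names are illustrative stand-ins, not the kernel's:

#include <assert.h>

struct split_write {
	int bwritecnt;	/* 1 for a normal write, 2 when split at the
			 * physical end of the log */
};

/* Called once per buffer I/O completion; returns 1 only for the
 * completion that should drive the callback state machine. */
static int split_write_done(struct split_write *w)
{
	assert(w->bwritecnt == 1 || w->bwritecnt == 2);
	if (--w->bwritecnt > 0)
		return 0;	/* first half of a split write: wait */
	return 1;		/* last (or only) completion proceeds */
}
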
2219
2220
2221/*
2222 * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
2223 * sleep. The flush semaphore is set to the number of in-core buffers and
2224 * decremented around disk syncing. Therefore, if all buffers are syncing,
2225 * this semaphore will cause new writes to sleep until a sync completes.
2226 * Otherwise, this code just does p() followed by v(). This approximates
2227 * a sleep/wakeup except we can't race.
2228 *
2229 * The in-core logs are used in a circular fashion. They are not used
2230 * out-of-order even when an iclog past the head is free.
2231 *
2232 * return:
2233 * * log_offset where xlog_write() can start writing into the in-core
2234 * log's data space.
2235 * * in-core log pointer to which xlog_write() should write.
2236 * * boolean indicating this is a continued write to an in-core log.
2237 * If this is the last write, then the in-core log's offset field
2238 * needs to be incremented, depending on the amount of data which
2239 * is copied.
2240 */
2241int
2242xlog_state_get_iclog_space(xlog_t *log,
2243 int len,
2244 xlog_in_core_t **iclogp,
2245 xlog_ticket_t *ticket,
2246 int *continued_write,
2247 int *logoffsetp)
2248{
2249 SPLDECL(s);
2250 int log_offset;
2251 xlog_rec_header_t *head;
2252 xlog_in_core_t *iclog;
2253 int error;
2254
2255restart:
2256 s = LOG_LOCK(log);
2257 if (XLOG_FORCED_SHUTDOWN(log)) {
2258 LOG_UNLOCK(log, s);
2259 return XFS_ERROR(EIO);
2260 }
2261
2262 iclog = log->l_iclog;
2263 if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
2264 log->l_flushcnt++;
2265 LOG_UNLOCK(log, s);
2266 xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
2267 XFS_STATS_INC(xs_log_noiclogs);
2268 /* Ensure that log writes happen */
2269 psema(&log->l_flushsema, PINOD);
2270 goto restart;
2271 }
2272 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2273 head = &iclog->ic_header;
2274
2275 iclog->ic_refcnt++; /* prevents sync */
2276 log_offset = iclog->ic_offset;
2277
2278 /* On the 1st write to an iclog, figure out lsn. This works
2279 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
2280 * committing to. If the offset is set, that's how many blocks
2281 * must be written.
2282 */
2283 if (log_offset == 0) {
2284 ticket->t_curr_res -= log->l_iclog_hsize;
2285 INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
2286 ASSIGN_LSN(head->h_lsn, log);
2287 ASSERT(log->l_curr_block >= 0);
2288 }
2289
2290 /* If there is enough room to write everything, then do it. Otherwise,
2291 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
2292 * bit is on, so this will get flushed out. Don't update ic_offset
2293 * until you know exactly how many bytes get copied. Therefore, wait
2294 * until later to update ic_offset.
2295 *
2296 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
2297 * can fit into remaining data section.
2298 */
2299 if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
2300 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2301
2302 /* If I'm the only one writing to this iclog, sync it to disk */
2303 if (iclog->ic_refcnt == 1) {
2304 LOG_UNLOCK(log, s);
2305 if ((error = xlog_state_release_iclog(log, iclog)))
2306 return (error);
2307 } else {
2308 iclog->ic_refcnt--;
2309 LOG_UNLOCK(log, s);
2310 }
2311 goto restart;
2312 }
2313
2314 /* Do we have enough room to write the full amount in the remainder
2315 * of this iclog? Or must we continue a write on the next iclog and
2316 * mark this iclog as completely taken? In the case where we switch
2317 * iclogs (to mark it taken), this particular iclog will release/sync
2318 * to disk in xlog_write().
2319 */
2320 if (len <= iclog->ic_size - iclog->ic_offset) {
2321 *continued_write = 0;
2322 iclog->ic_offset += len;
2323 } else {
2324 *continued_write = 1;
2325 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2326 }
2327 *iclogp = iclog;
2328
2329 ASSERT(iclog->ic_offset <= iclog->ic_size);
2330 LOG_UNLOCK(log, s);
2331
2332 *logoffsetp = log_offset;
2333 return 0;
2334} /* xlog_state_get_iclog_space */
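
The fit-or-continue decision at the end of the function is the crux of the routine; a hedged sketch of just that step, with size/offset as simplified stand-ins for ic_size/ic_offset:

struct iclog_space {
	int size;	/* usable bytes in the iclog */
	int offset;	/* bytes already claimed */
};

/* Returns the continued_write flag the caller would receive. */
static int claim_space(struct iclog_space *ic, int len)
{
	if (len <= ic->size - ic->offset) {
		ic->offset += len;	/* whole record fits here */
		return 0;
	}
	/* doesn't fit: the caller switches this iclog to WANT_SYNC and
	 * continues the write in the next iclog in the ring */
	return 1;
}
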
2335
2336/*
2337 * Atomically get the log space required for a log ticket.
2338 *
2339 * Once a ticket gets put onto the reserveq, it will only return after
2340 * the needed reservation is satisfied.
2341 */
2342STATIC int
2343xlog_grant_log_space(xlog_t *log,
2344 xlog_ticket_t *tic)
2345{
2346 int free_bytes;
2347 int need_bytes;
2348 SPLDECL(s);
2349#ifdef DEBUG
2350 xfs_lsn_t tail_lsn;
2351#endif
2352
2353
2354#ifdef DEBUG
2355 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2356 panic("grant Recovery problem");
2357#endif
2358
2359 /* Is there space or do we need to sleep? */
2360 s = GRANT_LOCK(log);
2361 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
2362
2363 /* something is already sleeping; insert new transaction at end */
2364 if (log->l_reserve_headq) {
2365 XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
2366 xlog_trace_loggrant(log, tic,
2367 "xlog_grant_log_space: sleep 1");
2368 /*
2369 * Gotta check this before going to sleep, while we're
2370 * holding the grant lock.
2371 */
2372 if (XLOG_FORCED_SHUTDOWN(log))
2373 goto error_return;
2374
2375 XFS_STATS_INC(xs_sleep_logspace);
2376 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2377 /*
2378 * If we got an error, and the filesystem is shutting down,
2379 * we'll catch it down below. So just continue...
2380 */
2381 xlog_trace_loggrant(log, tic,
2382 "xlog_grant_log_space: wake 1");
2383 s = GRANT_LOCK(log);
2384 }
2385 if (tic->t_flags & XFS_LOG_PERM_RESERV)
2386 need_bytes = tic->t_unit_res*tic->t_ocnt;
2387 else
2388 need_bytes = tic->t_unit_res;
2389
2390redo:
2391 if (XLOG_FORCED_SHUTDOWN(log))
2392 goto error_return;
2393
2394 free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
2395 log->l_grant_reserve_bytes);
2396 if (free_bytes < need_bytes) {
2397 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2398 XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
2399 xlog_trace_loggrant(log, tic,
2400 "xlog_grant_log_space: sleep 2");
2401 XFS_STATS_INC(xs_sleep_logspace);
2402 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2403
2404 if (XLOG_FORCED_SHUTDOWN(log)) {
2405 s = GRANT_LOCK(log);
2406 goto error_return;
2407 }
2408
2409 xlog_trace_loggrant(log, tic,
2410 "xlog_grant_log_space: wake 2");
2411 xlog_grant_push_ail(log->l_mp, need_bytes);
2412 s = GRANT_LOCK(log);
2413 goto redo;
2414 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2415 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2416
2417 /* we've got enough space */
2418 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
2419 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
2420#ifdef DEBUG
2421 tail_lsn = log->l_tail_lsn;
2422 /*
2423	 * Check to make sure the grant write head didn't just overlap the
2424 * tail. If the cycles are the same, we can't be overlapping.
2425 * Otherwise, make sure that the cycles differ by exactly one and
2426 * check the byte count.
2427 */
2428 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2429 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2430 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2431 }
2432#endif
2433 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
2434 xlog_verify_grant_head(log, 1);
2435 GRANT_UNLOCK(log, s);
2436 return 0;
2437
2438 error_return:
2439 if (tic->t_flags & XLOG_TIC_IN_Q)
2440 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2441 xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
2442 /*
2443 * If we are failing, make sure the ticket doesn't have any
2444 * current reservations. We don't want to add this back when
2445 * the ticket/transaction gets cancelled.
2446 */
2447 tic->t_curr_res = 0;
2448 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2449 GRANT_UNLOCK(log, s);
2450 return XFS_ERROR(EIO);
2451} /* xlog_grant_log_space */
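
Both grant paths hinge on the free-space computation done by xlog_space_left(). A simplified model of that arithmetic on a circular log, ignoring the sanity checks the real function performs and assuming the head is at most one cycle ahead of the tail:

/* Free bytes between the grant head and the tail of a circular log. */
static int circ_space_left(int logsize, int tail_cycle, int tail_bytes,
			   int head_cycle, int head_bytes)
{
	if (head_cycle == tail_cycle)		/* head ahead, same lap */
		return logsize - (head_bytes - tail_bytes);
	return tail_bytes - head_bytes;		/* head wrapped past end */
}
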
2452
2453
2454/*
2455 * Replenish the byte reservation required by moving the grant write head.
2456 *
2457 *
2458 */
2459STATIC int
2460xlog_regrant_write_log_space(xlog_t *log,
2461 xlog_ticket_t *tic)
2462{
2463 SPLDECL(s);
2464 int free_bytes, need_bytes;
2465 xlog_ticket_t *ntic;
2466#ifdef DEBUG
2467 xfs_lsn_t tail_lsn;
2468#endif
2469
2470 tic->t_curr_res = tic->t_unit_res;
2471
2472 if (tic->t_cnt > 0)
2473 return (0);
2474
2475#ifdef DEBUG
2476 if (log->l_flags & XLOG_ACTIVE_RECOVERY)
2477 panic("regrant Recovery problem");
2478#endif
2479
2480 s = GRANT_LOCK(log);
2481 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
2482
2483 if (XLOG_FORCED_SHUTDOWN(log))
2484 goto error_return;
2485
2486 /* If there are other waiters on the queue then give them a
2487	 * chance at logspace before us. Wake up the first waiters;
2488	 * if we do not wake up all the waiters, go to sleep waiting
2489	 * for more free space; otherwise try to get some space for
2490 * this transaction.
2491 */
2492
2493 if ((ntic = log->l_write_headq)) {
2494 free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
2495 log->l_grant_write_bytes);
2496 do {
2497 ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
2498
2499 if (free_bytes < ntic->t_unit_res)
2500 break;
2501 free_bytes -= ntic->t_unit_res;
2502 sv_signal(&ntic->t_sema);
2503 ntic = ntic->t_next;
2504 } while (ntic != log->l_write_headq);
2505
2506 if (ntic != log->l_write_headq) {
2507 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2508 XLOG_INS_TICKETQ(log->l_write_headq, tic);
2509
2510 xlog_trace_loggrant(log, tic,
2511 "xlog_regrant_write_log_space: sleep 1");
2512 XFS_STATS_INC(xs_sleep_logspace);
2513 sv_wait(&tic->t_sema, PINOD|PLTWAIT,
2514 &log->l_grant_lock, s);
2515
2516 /* If we're shutting down, this tic is already
2517 * off the queue */
2518 if (XLOG_FORCED_SHUTDOWN(log)) {
2519 s = GRANT_LOCK(log);
2520 goto error_return;
2521 }
2522
2523 xlog_trace_loggrant(log, tic,
2524 "xlog_regrant_write_log_space: wake 1");
2525 xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
2526 s = GRANT_LOCK(log);
2527 }
2528 }
2529
2530 need_bytes = tic->t_unit_res;
2531
2532redo:
2533 if (XLOG_FORCED_SHUTDOWN(log))
2534 goto error_return;
2535
2536 free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
2537 log->l_grant_write_bytes);
2538 if (free_bytes < need_bytes) {
2539 if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
2540 XLOG_INS_TICKETQ(log->l_write_headq, tic);
2541 XFS_STATS_INC(xs_sleep_logspace);
2542 sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
2543
2544 /* If we're shutting down, this tic is already off the queue */
2545 if (XLOG_FORCED_SHUTDOWN(log)) {
2546 s = GRANT_LOCK(log);
2547 goto error_return;
2548 }
2549
2550 xlog_trace_loggrant(log, tic,
2551 "xlog_regrant_write_log_space: wake 2");
2552 xlog_grant_push_ail(log->l_mp, need_bytes);
2553 s = GRANT_LOCK(log);
2554 goto redo;
2555 } else if (tic->t_flags & XLOG_TIC_IN_Q)
2556 XLOG_DEL_TICKETQ(log->l_write_headq, tic);
2557
2558 XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
2559#ifdef DEBUG
2560 tail_lsn = log->l_tail_lsn;
2561 if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) {
2562 ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn));
2563 ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn)));
2564 }
2565#endif
2566
2567 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
2568 xlog_verify_grant_head(log, 1);
2569 GRANT_UNLOCK(log, s);
2570 return (0);
2571
2572
2573 error_return:
2574 if (tic->t_flags & XLOG_TIC_IN_Q)
2575 XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
2576 xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
2577 /*
2578 * If we are failing, make sure the ticket doesn't have any
2579 * current reservations. We don't want to add this back when
2580 * the ticket/transaction gets cancelled.
2581 */
2582 tic->t_curr_res = 0;
2583 tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
2584 GRANT_UNLOCK(log, s);
2585 return XFS_ERROR(EIO);
2586} /* xlog_regrant_write_log_space */
2587
2588
2589/* The first cnt-1 times through here we don't need to
2590 * move the grant write head because the permanent
2591 * reservation has reserved cnt times the unit amount.
2592 * Release part of current permanent unit reservation and
2593 * reset current reservation to be one units worth. Also
2594 * move grant reservation head forward.
2595 */
2596STATIC void
2597xlog_regrant_reserve_log_space(xlog_t *log,
2598 xlog_ticket_t *ticket)
2599{
2600 SPLDECL(s);
2601
2602 xlog_trace_loggrant(log, ticket,
2603 "xlog_regrant_reserve_log_space: enter");
2604 if (ticket->t_cnt > 0)
2605 ticket->t_cnt--;
2606
2607 s = GRANT_LOCK(log);
2608 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
2609 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2610 ticket->t_curr_res = ticket->t_unit_res;
2611 xlog_trace_loggrant(log, ticket,
2612 "xlog_regrant_reserve_log_space: sub current res");
2613 xlog_verify_grant_head(log, 1);
2614
2615 /* just return if we still have some of the pre-reserved space */
2616 if (ticket->t_cnt > 0) {
2617 GRANT_UNLOCK(log, s);
2618 return;
2619 }
2620
2621 XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
2622 xlog_trace_loggrant(log, ticket,
2623 "xlog_regrant_reserve_log_space: exit");
2624 xlog_verify_grant_head(log, 0);
2625 GRANT_UNLOCK(log, s);
2626 ticket->t_curr_res = ticket->t_unit_res;
2627} /* xlog_regrant_reserve_log_space */
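
A compact model of the refill rule described above: with a permanent reservation of count N, the first N-1 regrants are already paid for, and only after the count is exhausted does a refill consume new grant space. The field names mirror t_cnt/t_curr_res/t_unit_res, but the function is a hypothetical simplification:

struct perm_resv {
	int cnt;	/* refills still pre-reserved */
	int curr_res;	/* bytes left in the current reservation */
	int unit_res;	/* bytes per refill */
};

/* Returns the bytes of new grant space this refill consumes. */
static int refill_cost(struct perm_resv *t)
{
	if (t->cnt > 0)
		t->cnt--;
	t->curr_res = t->unit_res;
	return t->cnt > 0 ? 0 : t->unit_res;
}
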
2628
2629
2630/*
2631 * Give back the space left from a reservation.
2632 *
2633 * All the information we need to make a correct determination of space left
2634 * is present. For non-permanent reservations, things are quite easy. The
2635 * count should have been decremented to zero. We only need to deal with the
2636 * space remaining in the current reservation part of the ticket. If the
2637 * ticket contains a permanent reservation, there may be left over space which
2638 * needs to be released. A count of N means that N-1 refills of the current
2639 * reservation can be done before we need to ask for more space. The first
2640 * one goes to fill up the first current reservation. Once we run out of
2641 * space, the count will stay at zero and the only space remaining will be
2642 * in the current reservation field.
2643 */
2644STATIC void
2645xlog_ungrant_log_space(xlog_t *log,
2646 xlog_ticket_t *ticket)
2647{
2648 SPLDECL(s);
2649
2650 if (ticket->t_cnt > 0)
2651 ticket->t_cnt--;
2652
2653 s = GRANT_LOCK(log);
2654 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
2655
2656 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
2657 XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
2658
2659 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
2660
2661 /* If this is a permanent reservation ticket, we may be able to free
2662 * up more space based on the remaining count.
2663 */
2664 if (ticket->t_cnt > 0) {
2665 ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
2666 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
2667 XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
2668 }
2669
2670 xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
2671 xlog_verify_grant_head(log, 1);
2672 GRANT_UNLOCK(log, s);
2673 xfs_log_move_tail(log->l_mp, 1);
2674} /* xlog_ungrant_log_space */
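
The give-back arithmetic above reduces to one line; a sketch with a worked example (hypothetical helper, not part of the source):

/* Bytes returned to the grant heads when a ticket is ungranted. */
static int ungrant_bytes(int curr_res, int unit_res, int cnt)
{
	if (cnt > 0)
		cnt--;			/* the refill being finished */
	return curr_res + cnt * unit_res;
}

/* e.g. a permanent ticket with unit_res=1000, cnt=3, curr_res=400
 * gives back 400 + 2*1000 = 2400 bytes. */
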
2675
2676
2677/*
2678 * Atomically put back used ticket.
2679 */
2680void
2681xlog_state_put_ticket(xlog_t *log,
2682 xlog_ticket_t *tic)
2683{
2684 unsigned long s;
2685
2686 s = LOG_LOCK(log);
2687 xlog_ticket_put(log, tic);
2688 LOG_UNLOCK(log, s);
2689} /* xlog_state_put_ticket */
2690
2691/*
2692 * Flush iclog to disk if this is the last reference to the given iclog and
2693 * the WANT_SYNC bit is set.
2694 *
2695 * When this function is entered, the iclog is not necessarily in the
2696 * WANT_SYNC state. It may be sitting around waiting to get filled.
2697 *
2698 *
2699 */
2700int
2701xlog_state_release_iclog(xlog_t *log,
2702 xlog_in_core_t *iclog)
2703{
2704 SPLDECL(s);
2705 int sync = 0; /* do we sync? */
2706
2707 xlog_assign_tail_lsn(log->l_mp);
2708
2709 s = LOG_LOCK(log);
2710
2711 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2712 LOG_UNLOCK(log, s);
2713 return XFS_ERROR(EIO);
2714 }
2715
2716 ASSERT(iclog->ic_refcnt > 0);
2717 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
2718 iclog->ic_state == XLOG_STATE_WANT_SYNC);
2719
2720 if (--iclog->ic_refcnt == 0 &&
2721 iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2722 sync++;
2723 iclog->ic_state = XLOG_STATE_SYNCING;
2724 INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
2725 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
2726 /* cycle incremented when incrementing curr_block */
2727 }
2728
2729 LOG_UNLOCK(log, s);
2730
2731 /*
2732 * We let the log lock go, so it's possible that we hit a log I/O
2733	 * error or some other SHUTDOWN condition that marks the iclog
2734 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
2735 * this iclog has consistent data, so we ignore IOERROR
2736 * flags after this point.
2737 */
2738 if (sync) {
2739 return xlog_sync(log, iclog);
2740 }
2741 return (0);
2742
2743} /* xlog_state_release_iclog */
2744
2745
2746/*
2747 * This routine will mark the current iclog in the ring as WANT_SYNC
2748 * and move the current iclog pointer to the next iclog in the ring.
2749 * When this routine is called from xlog_state_get_iclog_space(), the
2750 * exact size of the iclog has not yet been determined; all we know
2751 * is that we have run out of space in this log record.
2752 */
2753STATIC void
2754xlog_state_switch_iclogs(xlog_t *log,
2755 xlog_in_core_t *iclog,
2756 int eventual_size)
2757{
2758 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2759 if (!eventual_size)
2760 eventual_size = iclog->ic_offset;
2761 iclog->ic_state = XLOG_STATE_WANT_SYNC;
2762 INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
2763 log->l_prev_block = log->l_curr_block;
2764 log->l_prev_cycle = log->l_curr_cycle;
2765
2766 /* roll log?: ic_offset changed later */
2767 log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
2768
2769 /* Round up to next log-sunit */
2770 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
2771 log->l_mp->m_sb.sb_logsunit > 1) {
2772 __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit);
2773 log->l_curr_block = roundup(log->l_curr_block, sunit_bb);
2774 }
2775
2776 if (log->l_curr_block >= log->l_logBBsize) {
2777 log->l_curr_cycle++;
2778 if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
2779 log->l_curr_cycle++;
2780 log->l_curr_block -= log->l_logBBsize;
2781 ASSERT(log->l_curr_block >= 0);
2782 }
2783 ASSERT(iclog == log->l_iclog);
2784 log->l_iclog = iclog->ic_next;
2785} /* xlog_state_switch_iclogs */
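
The block-advance arithmetic above, restated as a self-contained sketch. 0xFEEDbabe is XLOG_HEADER_MAGIC_NUM from xfs_log_priv.h (an illegal cycle number); everything else is a simplified stand-in:

#define MAGIC_CYCLE 0xFEEDbabeU	/* XLOG_HEADER_MAGIC_NUM */

static void advance_log_head(int *curr_block, unsigned int *curr_cycle,
			     int size_bb, int sunit_bb, int log_bb)
{
	*curr_block += size_bb;
	if (sunit_bb > 1)	/* round up to the log stripe unit */
		*curr_block = ((*curr_block + sunit_bb - 1) / sunit_bb)
				* sunit_bb;
	if (*curr_block >= log_bb) {	/* wrapped the physical log */
		(*curr_cycle)++;
		if (*curr_cycle == MAGIC_CYCLE)
			(*curr_cycle)++;  /* never emit the magic value */
		*curr_block -= log_bb;
	}
}
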
2786
2787
2788/*
2789 * Write out all data in the in-core log as of this exact moment in time.
2790 *
2791 * Data may be written to the in-core log during this call. However,
2792 * we don't guarantee this data will be written out. A change from the
2793 * past implementation means this routine will *not* write out zero-length LRs.
2794 *
2795 * Basically, we try to perform an intelligent scan of the in-core logs.
2796 * If we determine there is no flushable data, we just return. There is no
2797 * flushable data if:
2798 *
2799 * 1. the current iclog is active and has no data; the previous iclog
2800 * is in the active or dirty state.
2801 *	2. the current iclog is dirty, and the previous iclog is in the
2802 * active or dirty state.
2803 *
2804 * We may sleep (call psema) if:
2805 *
2806 *	1. the current iclog is not in the active or dirty state.
2807 *	2. the current iclog is dirty, and the previous iclog is not in the
2808 *		active or dirty state.
2809 * 3. the current iclog is active, and there is another thread writing
2810 * to this particular iclog.
2811 * 4. a) the current iclog is active and has no other writers
2812 * b) when we return from flushing out this iclog, it is still
2813 *		not in the active or dirty state.
2814 */
2815STATIC int
2816xlog_state_sync_all(xlog_t *log, uint flags)
2817{
2818 xlog_in_core_t *iclog;
2819 xfs_lsn_t lsn;
2820 SPLDECL(s);
2821
2822 s = LOG_LOCK(log);
2823
2824 iclog = log->l_iclog;
2825 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2826 LOG_UNLOCK(log, s);
2827 return XFS_ERROR(EIO);
2828 }
2829
2830	/* If the head iclog is neither active nor dirty, we just attach
2831 * ourselves to the head and go to sleep.
2832 */
2833 if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2834 iclog->ic_state == XLOG_STATE_DIRTY) {
2835 /*
2836 * If the head is dirty or (active and empty), then
2837 * we need to look at the previous iclog. If the previous
2838 * iclog is active or dirty we are done. There is nothing
2839 * to sync out. Otherwise, we attach ourselves to the
2840 * previous iclog and go to sleep.
2841 */
2842 if (iclog->ic_state == XLOG_STATE_DIRTY ||
2843 (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
2844 iclog = iclog->ic_prev;
2845 if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2846 iclog->ic_state == XLOG_STATE_DIRTY)
2847 goto no_sleep;
2848 else
2849 goto maybe_sleep;
2850 } else {
2851 if (iclog->ic_refcnt == 0) {
2852 /* We are the only one with access to this
2853 * iclog. Flush it out now. There should
2854 * be a roundoff of zero to show that someone
2855 * has already taken care of the roundoff from
2856 * the previous sync.
2857 */
2858 iclog->ic_refcnt++;
2859 lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
2860 xlog_state_switch_iclogs(log, iclog, 0);
2861 LOG_UNLOCK(log, s);
2862
2863 if (xlog_state_release_iclog(log, iclog))
2864 return XFS_ERROR(EIO);
2865 s = LOG_LOCK(log);
2866 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
2867 iclog->ic_state != XLOG_STATE_DIRTY)
2868 goto maybe_sleep;
2869 else
2870 goto no_sleep;
2871 } else {
2872 /* Someone else is writing to this iclog.
2873 * Use its call to flush out the data. However,
2874 * the other thread may not force out this LR,
2875 * so we mark it WANT_SYNC.
2876 */
2877 xlog_state_switch_iclogs(log, iclog, 0);
2878 goto maybe_sleep;
2879 }
2880 }
2881 }
2882
2883 /* By the time we come around again, the iclog could've been filled
2884 * which would give it another lsn. If we have a new lsn, just
2885 * return because the relevant data has been flushed.
2886 */
2887maybe_sleep:
2888 if (flags & XFS_LOG_SYNC) {
2889 /*
2890 * We must check if we're shutting down here, before
2891 * we wait, while we're holding the LOG_LOCK.
2892 * Then we check again after waking up, in case our
2893		 * sleep was disturbed by bad news.
2894 */
2895 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2896 LOG_UNLOCK(log, s);
2897 return XFS_ERROR(EIO);
2898 }
2899 XFS_STATS_INC(xs_log_force_sleep);
2900 sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
2901 /*
2902 * No need to grab the log lock here since we're
2903 * only deciding whether or not to return EIO
2904 * and the memory read should be atomic.
2905 */
2906 if (iclog->ic_state & XLOG_STATE_IOERROR)
2907 return XFS_ERROR(EIO);
2908
2909 } else {
2910
2911no_sleep:
2912 LOG_UNLOCK(log, s);
2913 }
2914 return 0;
2915} /* xlog_state_sync_all */
2916
2917
2918/*
2919 * Used by code which implements synchronous log forces.
2920 *
2921 * Find in-core log with lsn.
2922 * If it is in the DIRTY state, just return.
2923 * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
2924 * state and go to sleep or return.
2925 * If it is in any other state, go to sleep or return.
2926 *
2927 * If filesystem activity goes to zero, the iclog will get flushed only by
2928 * bdflush().
2929 */
2930int
2931xlog_state_sync(xlog_t *log,
2932 xfs_lsn_t lsn,
2933 uint flags)
2934{
2935 xlog_in_core_t *iclog;
2936 int already_slept = 0;
2937 SPLDECL(s);
2938
2939
2940try_again:
2941 s = LOG_LOCK(log);
2942 iclog = log->l_iclog;
2943
2944 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2945 LOG_UNLOCK(log, s);
2946 return XFS_ERROR(EIO);
2947 }
2948
2949 do {
2950 if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
2951 iclog = iclog->ic_next;
2952 continue;
2953 }
2954
2955 if (iclog->ic_state == XLOG_STATE_DIRTY) {
2956 LOG_UNLOCK(log, s);
2957 return 0;
2958 }
2959
2960 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
2961 /*
2962 * We sleep here if we haven't already slept (e.g.
2963 * this is the first time we've looked at the correct
2964 * iclog buf) and the buffer before us is going to
2965 * be sync'ed. The reason for this is that if we
2966 * are doing sync transactions here, by waiting for
2967 * the previous I/O to complete, we can allow a few
2968 * more transactions into this iclog before we close
2969 * it down.
2970 *
2971 * Otherwise, we mark the buffer WANT_SYNC, and bump
2972 * up the refcnt so we can release the log (which drops
2973 * the ref count). The state switch keeps new transaction
2974 * commits from using this buffer. When the current commits
2975 * finish writing into the buffer, the refcount will drop to
2976 * zero and the buffer will go out then.
2977 */
2978 if (!already_slept &&
2979 (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
2980 XLOG_STATE_SYNCING))) {
2981 ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
2982 XFS_STATS_INC(xs_log_force_sleep);
2983 sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
2984 &log->l_icloglock, s);
2985 already_slept = 1;
2986 goto try_again;
2987 } else {
2988 iclog->ic_refcnt++;
2989 xlog_state_switch_iclogs(log, iclog, 0);
2990 LOG_UNLOCK(log, s);
2991 if (xlog_state_release_iclog(log, iclog))
2992 return XFS_ERROR(EIO);
2993 s = LOG_LOCK(log);
2994 }
2995 }
2996
2997 if ((flags & XFS_LOG_SYNC) && /* sleep */
2998 !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
2999
3000 /*
3001 * Don't wait on the forcesema if we know that we've
3002 * gotten a log write error.
3003 */
3004 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3005 LOG_UNLOCK(log, s);
3006 return XFS_ERROR(EIO);
3007 }
3008 XFS_STATS_INC(xs_log_force_sleep);
3009 sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
3010 /*
3011 * No need to grab the log lock here since we're
3012 * only deciding whether or not to return EIO
3013 * and the memory read should be atomic.
3014 */
3015 if (iclog->ic_state & XLOG_STATE_IOERROR)
3016 return XFS_ERROR(EIO);
3017 } else { /* just return */
3018 LOG_UNLOCK(log, s);
3019 }
3020 return 0;
3021
3022 } while (iclog != log->l_iclog);
3023
3024 LOG_UNLOCK(log, s);
3025 return (0);
3026} /* xlog_state_sync */
3027
3028
3029/*
3030 * Called when we want to mark the current iclog as being ready to sync to
3031 * disk.
3032 */
3033void
3034xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3035{
3036 SPLDECL(s);
3037
3038 s = LOG_LOCK(log);
3039
3040 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3041 xlog_state_switch_iclogs(log, iclog, 0);
3042 } else {
3043 ASSERT(iclog->ic_state &
3044 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3045 }
3046
3047 LOG_UNLOCK(log, s);
3048} /* xlog_state_want_sync */
3049
3050
3051
3052/*****************************************************************************
3053 *
3054 * TICKET functions
3055 *
3056 *****************************************************************************
3057 */
3058
3059/*
3060 * Algorithm doesn't take into account page size. ;-(
3061 */
3062STATIC void
3063xlog_state_ticket_alloc(xlog_t *log)
3064{
3065 xlog_ticket_t *t_list;
3066 xlog_ticket_t *next;
3067 xfs_caddr_t buf;
3068 uint i = (NBPP / sizeof(xlog_ticket_t)) - 2;
3069 SPLDECL(s);
3070
3071 /*
3072 * The kmem_zalloc may sleep, so we shouldn't be holding the
3073 * global lock. XXXmiken: may want to use zone allocator.
3074 */
3075 buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP);
3076
3077 s = LOG_LOCK(log);
3078
3079 /* Attach 1st ticket to Q, so we can keep track of allocated memory */
3080 t_list = (xlog_ticket_t *)buf;
3081 t_list->t_next = log->l_unmount_free;
3082 log->l_unmount_free = t_list++;
3083 log->l_ticket_cnt++;
3084 log->l_ticket_tcnt++;
3085
3086 /* Next ticket becomes first ticket attached to ticket free list */
3087 if (log->l_freelist != NULL) {
3088 ASSERT(log->l_tail != NULL);
3089 log->l_tail->t_next = t_list;
3090 } else {
3091 log->l_freelist = t_list;
3092 }
3093 log->l_ticket_cnt++;
3094 log->l_ticket_tcnt++;
3095
3096 /* Cycle through rest of alloc'ed memory, building up free Q */
3097 for ( ; i > 0; i--) {
3098 next = t_list + 1;
3099 t_list->t_next = next;
3100 t_list = next;
3101 log->l_ticket_cnt++;
3102 log->l_ticket_tcnt++;
3103 }
3104 t_list->t_next = NULL;
3105 log->l_tail = t_list;
3106 LOG_UNLOCK(log, s);
3107} /* xlog_state_ticket_alloc */
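
What the function does to its page of memory is the classic carve-a-buffer-into-a-free-list pattern; a sketch under simplified assumptions (one generic node type, count >= 1, and none of the accounting for the first reserved ticket):

#include <stddef.h>

struct freenode { struct freenode *next; };

/* Thread 'count' fixed-size nodes carved from 'buf' into a list. */
static struct freenode *carve_freelist(void *buf, size_t nodesize,
				       size_t count)
{
	struct freenode *head = buf, *n = head;
	size_t i;

	for (i = 1; i < count; i++) {
		n->next = (struct freenode *)((char *)buf + i * nodesize);
		n = n->next;
	}
	n->next = NULL;
	return head;
}
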
3108
3109
3110/*
3111 * Put ticket into free list
3112 *
3113 * Assumption: log lock is held around this call.
3114 */
3115STATIC void
3116xlog_ticket_put(xlog_t *log,
3117 xlog_ticket_t *ticket)
3118{
3119 sv_destroy(&ticket->t_sema);
3120
3121 /*
3122 * Don't think caching will make that much difference. It's
3123	 * more important to make debugging easier.
3124 */
3125#if 0
3126 /* real code will want to use LIFO for caching */
3127 ticket->t_next = log->l_freelist;
3128 log->l_freelist = ticket;
3129 /* no need to clear fields */
3130#else
3131 /* When we debug, it is easier if tickets are cycled */
3132 ticket->t_next = NULL;
3133 if (log->l_tail != 0) {
3134 log->l_tail->t_next = ticket;
3135 } else {
3136 ASSERT(log->l_freelist == 0);
3137 log->l_freelist = ticket;
3138 }
3139 log->l_tail = ticket;
3140#endif
3141 log->l_ticket_cnt++;
3142} /* xlog_ticket_put */
3143
3144
3145/*
3146 * Grab a ticket off the freelist or allocate some more
3147 */
3148xlog_ticket_t *
3149xlog_ticket_get(xlog_t *log,
3150 int unit_bytes,
3151 int cnt,
3152 char client,
3153 uint xflags)
3154{
3155 xlog_ticket_t *tic;
3156 uint num_headers;
3157 SPLDECL(s);
3158
3159 alloc:
3160 if (log->l_freelist == NULL)
3161 xlog_state_ticket_alloc(log); /* potentially sleep */
3162
3163 s = LOG_LOCK(log);
3164 if (log->l_freelist == NULL) {
3165 LOG_UNLOCK(log, s);
3166 goto alloc;
3167 }
3168 tic = log->l_freelist;
3169 log->l_freelist = tic->t_next;
3170 if (log->l_freelist == NULL)
3171 log->l_tail = NULL;
3172 log->l_ticket_cnt--;
3173 LOG_UNLOCK(log, s);
3174
3175 /*
3176 * Permanent reservations have up to 'cnt'-1 active log operations
3177 * in the log. A unit in this case is the amount of space for one
3178 * of these log operations. Normal reservations have a cnt of 1
3179 * and their unit amount is the total amount of space required.
3180 *
3181 * The following lines of code account for non-transaction data
3182 * which occupy space in the on-disk log.
3183 */
3184
3185 /* for start-rec */
3186 unit_bytes += sizeof(xlog_op_header_t);
3187
3188 /* for padding */
3189 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
3190 log->l_mp->m_sb.sb_logsunit > 1) {
3191 /* log su roundoff */
3192 unit_bytes += log->l_mp->m_sb.sb_logsunit;
3193 } else {
3194 /* BB roundoff */
3195 unit_bytes += BBSIZE;
3196 }
3197
3198 /* for commit-rec */
3199 unit_bytes += sizeof(xlog_op_header_t);
3200
3201 /* for LR headers */
3202 num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
3203 unit_bytes += log->l_iclog_hsize * num_headers;
3204
3205 tic->t_unit_res = unit_bytes;
3206 tic->t_curr_res = unit_bytes;
3207 tic->t_cnt = cnt;
3208 tic->t_ocnt = cnt;
3209 tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
3210 tic->t_clientid = client;
3211 tic->t_flags = XLOG_TIC_INITED;
3212 if (xflags & XFS_LOG_PERM_RESERV)
3213 tic->t_flags |= XLOG_TIC_PERM_RESERV;
3214 sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
3215
3216 return tic;
3217} /* xlog_ticket_get */
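
The reservation padding computed above, collected into one hypothetical helper so the pieces are visible side by side (the parameter values are whatever the caller's log geometry dictates; nothing here is the real signature):

/* Total reservation for 'payload' bytes of transaction data. */
static int ticket_unit_bytes(int payload, int op_hdr_size, int roundoff,
			     int iclog_size, int iclog_hsize)
{
	int bytes = payload;

	bytes += op_hdr_size;		/* start record */
	bytes += roundoff;		/* BB or stripe-unit padding */
	bytes += op_hdr_size;		/* commit record */
	/* one log record header per iclog the write may span */
	bytes += iclog_hsize * ((bytes + iclog_size - 1) / iclog_size);
	return bytes;
}
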
3218
3219
3220/******************************************************************************
3221 *
3222 * Log debug routines
3223 *
3224 ******************************************************************************
3225 */
3226#if defined(DEBUG) && !defined(XLOG_NOLOG)
3227/*
3228 * Make sure that the destination ptr is within the valid data region of
3229 * one of the iclogs. This uses backup pointers stored in a different
3230 * part of the log in case we trash the log structure.
3231 */
3232void
3233xlog_verify_dest_ptr(xlog_t *log,
3234 __psint_t ptr)
3235{
3236 int i;
3237 int good_ptr = 0;
3238
3239 for (i=0; i < log->l_iclog_bufs; i++) {
3240 if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
3241 ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
3242 good_ptr++;
3243 }
3244 if (! good_ptr)
3245 xlog_panic("xlog_verify_dest_ptr: invalid ptr");
3246} /* xlog_verify_dest_ptr */
3247
3248STATIC void
3249xlog_verify_grant_head(xlog_t *log, int equals)
3250{
3251 if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
3252 if (equals)
3253 ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
3254 else
3255 ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
3256 } else {
3257 ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
3258 ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
3259 }
3260} /* xlog_verify_grant_head */
3261
3262/* check if it will fit */
3263STATIC void
3264xlog_verify_tail_lsn(xlog_t *log,
3265 xlog_in_core_t *iclog,
3266 xfs_lsn_t tail_lsn)
3267{
3268 int blocks;
3269
3270 if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
3271 blocks =
3272 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3273 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3274 xlog_panic("xlog_verify_tail_lsn: ran out of log space");
3275 } else {
3276 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3277
3278 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3279 xlog_panic("xlog_verify_tail_lsn: tail wrapped");
3280
3281 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3282 if (blocks < BTOBB(iclog->ic_offset) + 1)
3283 xlog_panic("xlog_verify_tail_lsn: ran out of log space");
3284 }
3285} /* xlog_verify_tail_lsn */
3286
3287/*
3288 * Perform a number of checks on the iclog before writing to disk.
3289 *
3290 * 1. Make sure the iclogs are still circular
3291 * 2. Make sure we have a good magic number
3292 * 3. Make sure we don't have magic numbers in the data
3293 * 4. Check fields of each log operation header for:
3294 * A. Valid client identifier
3295 * B. tid ptr value falls in valid ptr space (user space code)
3296 * C. Length in log record header is correct according to the
3297 * individual operation headers within record.
3298 * 5. When a bwrite will occur within 5 blocks of the front of the physical
3299 * log, check the preceding blocks of the physical log to make sure all
3300 * the cycle numbers agree with the current cycle number.
3301 */
3302STATIC void
3303xlog_verify_iclog(xlog_t *log,
3304 xlog_in_core_t *iclog,
3305 int count,
3306 boolean_t syncing)
3307{
3308 xlog_op_header_t *ophead;
3309 xlog_in_core_t *icptr;
3310 xlog_in_core_2_t *xhdr;
3311 xfs_caddr_t ptr;
3312 xfs_caddr_t base_ptr;
3313 __psint_t field_offset;
3314 __uint8_t clientid;
3315 int len, i, j, k, op_len;
3316 int idx;
3317 SPLDECL(s);
3318
3319 /* check validity of iclog pointers */
3320 s = LOG_LOCK(log);
3321 icptr = log->l_iclog;
3322 for (i=0; i < log->l_iclog_bufs; i++) {
3323 if (icptr == 0)
3324 xlog_panic("xlog_verify_iclog: invalid ptr");
3325 icptr = icptr->ic_next;
3326 }
3327 if (icptr != log->l_iclog)
3328 xlog_panic("xlog_verify_iclog: corrupt iclog ring");
3329 LOG_UNLOCK(log, s);
3330
3331 /* check log magic numbers */
3332 ptr = (xfs_caddr_t) &(iclog->ic_header);
3333 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
3334 xlog_panic("xlog_verify_iclog: invalid magic num");
3335
3336 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
3337 ptr += BBSIZE) {
3338 if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
3339 xlog_panic("xlog_verify_iclog: unexpected magic num");
3340 }
3341
3342 /* check fields */
3343 len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
3344 ptr = iclog->ic_datap;
3345 base_ptr = ptr;
3346 ophead = (xlog_op_header_t *)ptr;
3347 xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3348 for (i = 0; i < len; i++) {
3349 ophead = (xlog_op_header_t *)ptr;
3350
3351 /* clientid is only 1 byte */
3352 field_offset = (__psint_t)
3353 ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
3354 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3355 clientid = ophead->oh_clientid;
3356 } else {
3357 idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
3358 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3359 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3360 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3361 clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
3362 } else {
3363 clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
3364 }
3365 }
3366 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3367 cmn_err(CE_WARN, "xlog_verify_iclog: invalid clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset);
3368
3369 /* check length */
3370 field_offset = (__psint_t)
3371 ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
3372 if (syncing == B_FALSE || (field_offset & 0x1ff)) {
3373 op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
3374 } else {
3375 idx = BTOBBT((__psint_t)&ophead->oh_len -
3376 (__psint_t)iclog->ic_datap);
3377 if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
3378 j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3379 k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3380 op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
3381 } else {
3382 op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
3383 }
3384 }
3385 ptr += sizeof(xlog_op_header_t) + op_len;
3386 }
3387} /* xlog_verify_iclog */
3388#endif /* DEBUG && !XLOG_NOLOG */
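
For reference, the op-header walk that xlog_verify_iclog() performs is the standard hop-by-length traversal of variable-sized records; a sketch with simplified, native-endian types (the real code must also handle fields that straddle the packed 512-byte block boundaries):

struct op_hdr {
	unsigned int len;	/* payload bytes following this header */
	unsigned char clientid;
};

/* Walk num_logops records and return the total bytes consumed. */
static long walk_ops(char *data, int num_logops)
{
	char *p = data;
	int i;

	for (i = 0; i < num_logops; i++) {
		struct op_hdr *oh = (struct op_hdr *)p;
		p += sizeof(*oh) + oh->len;
	}
	return (long)(p - data);
}
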
3389
3390/*
3391 * Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
3392 */
3393STATIC int
3394xlog_state_ioerror(
3395 xlog_t *log)
3396{
3397 xlog_in_core_t *iclog, *ic;
3398
3399 iclog = log->l_iclog;
3400 if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
3401 /*
3402 * Mark all the incore logs IOERROR.
3403 * From now on, no log flushes will result.
3404 */
3405 ic = iclog;
3406 do {
3407 ic->ic_state = XLOG_STATE_IOERROR;
3408 ic = ic->ic_next;
3409 } while (ic != iclog);
3410 return (0);
3411 }
3412 /*
3413 * Return non-zero, if state transition has already happened.
3414 */
3415 return (1);
3416}
3417
3418/*
3419 * This is called from xfs_force_shutdown, when we're forcibly
3420 * shutting down the filesystem, typically because of an IO error.
3421 * Our main objectives here are to make sure that:
3422 * a. the filesystem gets marked 'SHUTDOWN' for all interested
3423 * parties to find out, 'atomically'.
3424 * b. those who're sleeping on log reservations, pinned objects and
3425 *	   other resources get woken up and told the bad news.
3426 * c. nothing new gets queued up after (a) and (b) are done.
3427 * d. if !logerror, flush the iclogs to disk, then seal them off
3428 * for business.
3429 */
3430int
3431xfs_log_force_umount(
3432 struct xfs_mount *mp,
3433 int logerror)
3434{
3435 xlog_ticket_t *tic;
3436 xlog_t *log;
3437 int retval;
3438 SPLDECL(s);
3439 SPLDECL(s2);
3440
3441 log = mp->m_log;
3442
3443 /*
3444 * If this happens during log recovery, don't worry about
3445 * locking; the log isn't open for business yet.
3446 */
3447 if (!log ||
3448 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3449 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3450 XFS_BUF_DONE(mp->m_sb_bp);
3451 return (0);
3452 }
3453
3454 /*
3455 * Somebody could've already done the hard work for us.
3456 * No need to get locks for this.
3457 */
3458 if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
3459 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3460 return (1);
3461 }
3462 retval = 0;
3463 /*
3464 * We must hold both the GRANT lock and the LOG lock,
3465 * before we mark the filesystem SHUTDOWN and wake
3466 * everybody up to tell the bad news.
3467 */
3468 s = GRANT_LOCK(log);
3469 s2 = LOG_LOCK(log);
3470 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3471 XFS_BUF_DONE(mp->m_sb_bp);
3472 /*
3473 * This flag is sort of redundant because of the mount flag, but
3474 * it's good to maintain the separation between the log and the rest
3475 * of XFS.
3476 */
3477 log->l_flags |= XLOG_IO_ERROR;
3478
3479 /*
3480 * If we hit a log error, we want to mark all the iclogs IOERROR
3481 * while we're still holding the loglock.
3482 */
3483 if (logerror)
3484 retval = xlog_state_ioerror(log);
3485 LOG_UNLOCK(log, s2);
3486
3487 /*
3488 * We don't want anybody waiting for log reservations
3489 * after this. That means we have to wake up everybody
3490 * queued up on reserve_headq as well as write_headq.
3491 * In addition, we make sure in xlog_{re}grant_log_space
3492 * that we don't enqueue anything once the SHUTDOWN flag
3493 * is set, and this action is protected by the GRANTLOCK.
3494 */
3495 if ((tic = log->l_reserve_headq)) {
3496 do {
3497 sv_signal(&tic->t_sema);
3498 tic = tic->t_next;
3499 } while (tic != log->l_reserve_headq);
3500 }
3501
3502 if ((tic = log->l_write_headq)) {
3503 do {
3504 sv_signal(&tic->t_sema);
3505 tic = tic->t_next;
3506 } while (tic != log->l_write_headq);
3507 }
3508 GRANT_UNLOCK(log, s);
3509
3510 if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3511 ASSERT(!logerror);
3512 /*
3513 * Force the incore logs to disk before shutting the
3514 * log down completely.
3515 */
3516 xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
3517 s2 = LOG_LOCK(log);
3518 retval = xlog_state_ioerror(log);
3519 LOG_UNLOCK(log, s2);
3520 }
3521 /*
3522 * Wake up everybody waiting on xfs_log_force.
3523 * Callback all log item committed functions as if the
3524 * log writes were completed.
3525 */
3526 xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
3527
3528#ifdef XFSERRORDEBUG
3529 {
3530 xlog_in_core_t *iclog;
3531
3532 s = LOG_LOCK(log);
3533 iclog = log->l_iclog;
3534 do {
3535 ASSERT(iclog->ic_callback == 0);
3536 iclog = iclog->ic_next;
3537 } while (iclog != log->l_iclog);
3538 LOG_UNLOCK(log, s);
3539 }
3540#endif
3541 /* return non-zero if log IOERROR transition had already happened */
3542 return (retval);
3543}
3544
3545int
3546xlog_iclogs_empty(xlog_t *log)
3547{
3548 xlog_in_core_t *iclog;
3549
3550 iclog = log->l_iclog;
3551 do {
3552 /* endianness does not matter here, zero is zero in
3553 * any language.
3554 */
3555 if (iclog->ic_header.h_num_logops)
3556 return(0);
3557 iclog = iclog->ic_next;
3558 } while (iclog != log->l_iclog);
3559 return(1);
3560}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
new file mode 100644
index 000000000000..0db122ddda3f
--- /dev/null
+++ b/fs/xfs/xfs_log.h
@@ -0,0 +1,182 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_H__
33#define __XFS_LOG_H__
34
35/* get lsn fields */
36
37#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
38#define BLOCK_LSN(lsn) ((uint)(lsn))
39/* this is used in a spot where we might otherwise double-endian-flip */
40#define CYCLE_LSN_DISK(lsn) (((uint *)&(lsn))[0])
41
42#ifdef __KERNEL__
43/*
44 * By comparing each component, we don't have to worry about extra
45 * endian issues in treating two 32 bit numbers as one 64 bit number
46 */
47static
48#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96))
49__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */
50#else
51__inline__
52#endif
53xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
54{
55 if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2))
56 return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999;
57
58 if (BLOCK_LSN(lsn1) != BLOCK_LSN(lsn2))
59 return (BLOCK_LSN(lsn1)<BLOCK_LSN(lsn2))? -999 : 999;
60
61 return 0;
62}
63
64#define XFS_LSN_CMP(x,y) _lsn_cmp(x,y)
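
The comparator returns a negative, zero, or positive value, memcmp()-style, so it composes naturally; one hypothetical use, not part of this header:

static __inline__ xfs_lsn_t older_lsn(xfs_lsn_t a, xfs_lsn_t b)
{
	return XFS_LSN_CMP(a, b) < 0 ? a : b;	/* smaller LSN is older */
}
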
65
66/*
67 * Macros, structures, prototypes for interface to the log manager.
68 */
69
70/*
71 * Flags to xfs_log_mount
72 */
73#define XFS_LOG_RECOVER 0x1
74
75/*
76 * Flags to xfs_log_done()
77 */
78#define XFS_LOG_REL_PERM_RESERV 0x1
79
80
81/*
82 * Flags to xfs_log_reserve()
83 *
84 * XFS_LOG_SLEEP: If space is not available, sleep (default)
85 * XFS_LOG_NOSLEEP: If space is not available, return error
86 * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are
87 * performed against this type of reservation, the reservation
88 * is not decreased. Long running transactions should use this.
89 */
90#define XFS_LOG_SLEEP 0x0
91#define XFS_LOG_NOSLEEP 0x1
92#define XFS_LOG_PERM_RESERV 0x2
93#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
94
95
96/*
97 * Flags to xfs_log_force()
98 *
99 * XFS_LOG_SYNC: Synchronous force in-core log to disk
100 * XFS_LOG_FORCE: Start in-core log write now.
101 * XFS_LOG_URGE: Start write within some window of time.
102 *
103 * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
104 */
105#define XFS_LOG_SYNC 0x1
106#define XFS_LOG_FORCE 0x2
107#define XFS_LOG_URGE 0x4
108
109#endif /* __KERNEL__ */
110
111
112/* Log Clients */
113#define XFS_TRANSACTION 0x69
114#define XFS_VOLUME 0x2
115#define XFS_LOG 0xaa
116
117typedef struct xfs_log_iovec {
118 xfs_caddr_t i_addr; /* beginning address of region */
119 int i_len; /* length in bytes of region */
120} xfs_log_iovec_t;
121
122typedef void* xfs_log_ticket_t;
123
124/*
125 * Structure used to pass callback function and the function's argument
126 * to the log manager.
127 */
128typedef struct xfs_log_callback {
129 struct xfs_log_callback *cb_next;
130 void (*cb_func)(void *, int);
131 void *cb_arg;
132} xfs_log_callback_t;
133
134
135#ifdef __KERNEL__
136/* Log manager interfaces */
137struct xfs_mount;
138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
139 xfs_log_ticket_t ticket,
140 void **iclog,
141 uint flags);
142int xfs_log_force(struct xfs_mount *mp,
143 xfs_lsn_t lsn,
144 uint flags);
145int xfs_log_mount(struct xfs_mount *mp,
146 struct xfs_buftarg *log_target,
147 xfs_daddr_t start_block,
148 int num_bblocks);
149int xfs_log_mount_finish(struct xfs_mount *mp, int);
150void xfs_log_move_tail(struct xfs_mount *mp,
151 xfs_lsn_t tail_lsn);
152int xfs_log_notify(struct xfs_mount *mp,
153 void *iclog,
154 xfs_log_callback_t *callback_entry);
155int xfs_log_release_iclog(struct xfs_mount *mp,
156 void *iclog_hndl);
157int xfs_log_reserve(struct xfs_mount *mp,
158 int length,
159 int count,
160 xfs_log_ticket_t *ticket,
161 __uint8_t clientid,
162 uint flags);
163int xfs_log_write(struct xfs_mount *mp,
164 xfs_log_iovec_t region[],
165 int nentries,
166 xfs_log_ticket_t ticket,
167 xfs_lsn_t *start_lsn);
168int xfs_log_unmount(struct xfs_mount *mp);
169int xfs_log_unmount_write(struct xfs_mount *mp);
170void xfs_log_unmount_dealloc(struct xfs_mount *mp);
171int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
172int xfs_log_need_covered(struct xfs_mount *mp);
173
174void xlog_iodone(struct xfs_buf *);
175
176#endif
177
178
179extern int xlog_debug; /* set to 1 to enable real log */
180
181
182#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
new file mode 100644
index 000000000000..c31e3ce3be66
--- /dev/null
+++ b/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,561 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_PRIV_H__
33#define __XFS_LOG_PRIV_H__
34
35struct xfs_buf;
36struct ktrace;
37struct log;
38struct xfs_buf_cancel;
39struct xfs_mount;
40
41/*
42 * Macros, structures, prototypes for internal log manager use.
43 */
44
45#define XLOG_MIN_ICLOGS 2
46#define XLOG_MED_ICLOGS 4
47#define XLOG_MAX_ICLOGS 8
48#define XLOG_CALLBACK_SIZE 10
49#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
50#define XLOG_VERSION_1 1
51#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
52#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
53#define XLOG_RECORD_BSIZE (16*1024) /* eventually 32k */
54#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
55#define XLOG_MAX_RECORD_BSIZE (256*1024)
56#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
57#define XLOG_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
58#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
59#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
60#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
61 (log)->l_mp->m_sb.sb_logsunit)
62#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
63
64#define XLOG_HEADER_SIZE 512
65
66#define XLOG_REC_SHIFT(log) \
67 BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
68 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
69#define XLOG_TOTAL_REC_SHIFT(log) \
70 BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
71 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
72
73/*
74 * set lsns
75 */
76
77#define ASSIGN_ANY_LSN_HOST(lsn,cycle,block) \
78 { \
79 (lsn) = ((xfs_lsn_t)(cycle)<<32)|(block); \
80 }
81#define ASSIGN_ANY_LSN_DISK(lsn,cycle,block) \
82 { \
83 INT_SET(((uint *)&(lsn))[0], ARCH_CONVERT, (cycle)); \
84 INT_SET(((uint *)&(lsn))[1], ARCH_CONVERT, (block)); \
85 }
86#define ASSIGN_LSN(lsn,log) \
87 ASSIGN_ANY_LSN_DISK(lsn,(log)->l_curr_cycle,(log)->l_curr_block);
88
89#define XLOG_SET(f,b) (((f) & (b)) == (b))
90
91#define GET_CYCLE(ptr, arch) \
92 (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \
93 INT_GET(*((uint *)(ptr)+1), arch) : \
94 INT_GET(*(uint *)(ptr), arch) \
95 )
96
97#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
98
99
100#ifdef __KERNEL__
101
102/*
103 * get client id from packed copy.
104 *
105 * this hack is here because the xlog_pack code copies four bytes
106 * of xlog_op_header containing the fields oh_clientid, oh_flags
107 * and oh_res2 into the packed copy.
108 *
109 * later on this four byte chunk is treated as an int and the
110 * client id is pulled out.
111 *
112 * this has endian issues, of course.
113 */
114
115#if __BYTE_ORDER == __LITTLE_ENDIAN
116#define GET_CLIENT_ID(i,arch) \
117 ((i) & 0xff)
118#else
119#define GET_CLIENT_ID(i,arch) \
120 ((i) >> 24)
121#endif
122
123#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_SUB_SPACE)
124void xlog_grant_sub_space(struct log *log, int bytes, int type);
125#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \
126 xlog_grant_sub_space(log,bytes,type)
127#else
128#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \
129 { \
130 if (type == 'w') { \
131 (log)->l_grant_write_bytes -= (bytes); \
132 if ((log)->l_grant_write_bytes < 0) { \
133 (log)->l_grant_write_bytes += (log)->l_logsize; \
134 (log)->l_grant_write_cycle--; \
135 } \
136 } else { \
137 (log)->l_grant_reserve_bytes -= (bytes); \
138 if ((log)->l_grant_reserve_bytes < 0) { \
139 (log)->l_grant_reserve_bytes += (log)->l_logsize;\
140 (log)->l_grant_reserve_cycle--; \
141 } \
142 } \
143 }
144#endif
145#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_ADD_SPACE)
146void xlog_grant_add_space(struct log *log, int bytes, int type);
147#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \
148 xlog_grant_add_space(log,bytes,type)
149#else
150#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \
151 { \
152 if (type == 'w') { \
153 (log)->l_grant_write_bytes += (bytes); \
154 if ((log)->l_grant_write_bytes > (log)->l_logsize) { \
155 (log)->l_grant_write_bytes -= (log)->l_logsize; \
156 (log)->l_grant_write_cycle++; \
157 } \
158 } else { \
159 (log)->l_grant_reserve_bytes += (bytes); \
160 if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \
161 (log)->l_grant_reserve_bytes -= (log)->l_logsize;\
162 (log)->l_grant_reserve_cycle++; \
163 } \
164 } \
165 }
166#endif
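
Stripped of the write/reserve selection, both macro pairs implement the same wrap-around move of a (cycle, bytes) head; a sketch in that spirit (the real ADD macro tests '>' rather than '>=', and a negative delta models the SUB case):

static void grant_head_move(int *bytes, int *cycle, int delta, int logsize)
{
	*bytes += delta;
	if (*bytes > logsize) {		/* wrapped forward */
		*bytes -= logsize;
		(*cycle)++;
	} else if (*bytes < 0) {	/* wrapped backward */
		*bytes += logsize;
		(*cycle)--;
	}
}
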
167#define XLOG_INS_TICKETQ(q,tic) \
168 { \
169 if (q) { \
170 (tic)->t_next = (q); \
171 (tic)->t_prev = (q)->t_prev; \
172 (q)->t_prev->t_next = (tic); \
173 (q)->t_prev = (tic); \
174 } else { \
175 (tic)->t_prev = (tic)->t_next = (tic); \
176 (q) = (tic); \
177 } \
178 (tic)->t_flags |= XLOG_TIC_IN_Q; \
179 }
180#define XLOG_DEL_TICKETQ(q,tic) \
181 { \
182 if ((tic) == (tic)->t_next) { \
183 (q) = NULL; \
184 } else { \
185 (q) = (tic)->t_next; \
186 (tic)->t_next->t_prev = (tic)->t_prev; \
187 (tic)->t_prev->t_next = (tic)->t_next; \
188 } \
189 (tic)->t_next = (tic)->t_prev = NULL; \
190 (tic)->t_flags &= ~XLOG_TIC_IN_Q; \
191 }
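
The two queue macros above maintain a circular doubly linked list whose head's prev is the tail; the same logic as plain functions (hypothetical node type, XLOG_TIC_IN_Q flag handling omitted):

struct tick { struct tick *next, *prev; };

static void ticketq_insert(struct tick **q, struct tick *t)
{
	if (*q) {			/* append at the tail */
		t->next = *q;
		t->prev = (*q)->prev;
		(*q)->prev->next = t;
		(*q)->prev = t;
	} else {			/* first element: self-linked */
		t->prev = t->next = t;
		*q = t;
	}
}

static void ticketq_delete(struct tick **q, struct tick *t)
{
	if (t == t->next) {
		*q = NULL;		/* last element */
	} else {
		*q = t->next;		/* the macro re-points the head */
		t->next->prev = t->prev;
		t->prev->next = t->next;
	}
	t->next = t->prev = NULL;
}
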
192
193
194#define GRANT_LOCK(log) mutex_spinlock(&(log)->l_grant_lock)
195#define GRANT_UNLOCK(log, s) mutex_spinunlock(&(log)->l_grant_lock, s)
196#define LOG_LOCK(log) mutex_spinlock(&(log)->l_icloglock)
197#define LOG_UNLOCK(log, s) mutex_spinunlock(&(log)->l_icloglock, s)
198
199#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
200#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
201#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
202
203/*
204 * In core log state
205 */
206#define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */
207#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
208#define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */
209#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
210#define XLOG_STATE_DO_CALLBACK \
211 0x0010 /* Process callback functions */
212#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
213#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
214#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
215#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
216#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
217#endif /* __KERNEL__ */
218
219/*
220 * Flags to log operation header
221 *
222 * The first write of a new transaction will be preceded with a start
223 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
224 * record is written, XLOG_COMMIT_TRANS. If a single region cannot fit into
225 * the remainder of the current active in-core log, it is split up into
226 * multiple regions. Each partial region will be marked with
227 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
228 *
229 */
230#define XLOG_START_TRANS 0x01 /* Start a new transaction */
231#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
232#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
233#define XLOG_WAS_CONT_TRANS 0x08 /* Trans continued from prev region */
234#define XLOG_END_TRANS 0x10 /* End a continued transaction */
235#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
236#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
237 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
238 XLOG_UNMOUNT_TRANS)
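/*
 * Example (illustrative only): a transaction whose single region is
 * split across two in-core logs is written as a sequence of op
 * headers flagged roughly as
 *
 *	XLOG_START_TRANS	(start record)
 *	XLOG_CONTINUE_TRANS	(first partial region)
 *	XLOG_END_TRANS		(final partial region)
 *	XLOG_COMMIT_TRANS	(commit record, once committed)
 */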
239
240#ifdef __KERNEL__
241/*
242 * Flags to log ticket
243 */
244#define XLOG_TIC_INITED 0x1 /* has been initialized */
245#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
246#define XLOG_TIC_IN_Q 0x4 /* ticket is on a reserve/write queue */
247#endif /* __KERNEL__ */
248
249#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
250
251/*
252 * Flags for log structure
253 */
254#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
255#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
256#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
257#define XLOG_IO_ERROR 0x8 /* log hit an I/O error and is being
258 shut down */
259typedef __uint32_t xlog_tid_t;
260
261
262#ifdef __KERNEL__
263/*
264 * Below are states for covering allocation transactions.
265 * By covering, we mean changing the h_tail_lsn in the last on-disk
266 * log write such that no allocation transactions will be re-done during
267 * recovery after a system crash. Recovery starts at the last on-disk
268 * log write.
269 *
270 * These states are used to insert dummy log entries to cover
271 * space allocation transactions which can undo non-transactional changes
272 * after a crash. Writes to a file with space
273 * already allocated do not result in any transactions. Allocations
274 * might include space beyond the EOF. So if we just push the EOF a
275 * little, the last transaction for the file could contain the wrong
276 * size. If there is no file system activity after an allocation
277 * transaction and the system then crashes, the allocation transaction
278 * will get replayed and the file will be truncated. This could
279 * be hours/days/... after the allocation occurred.
280 *
281 * The fix for this is to do two dummy transactions when the
282 * system is idle. We need two dummy transactions because the h_tail_lsn
283 * in the log record header needs to point beyond the last possible
284 * non-dummy transaction. The first dummy changes the h_tail_lsn to
285 * the first transaction before the dummy. The second dummy causes
286 * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
287 *
288 * These dummy transactions get committed when everything
289 * is idle (after there has been some activity).
290 *
291 * There are 5 states used to control this.
292 *
293 * IDLE -- no logging has been done on the file system or
294 * we are done covering previous transactions.
295 * NEED -- logging has occurred and we need a dummy transaction
296 * when the log becomes idle.
297 * DONE -- we were in the NEED state and have committed a dummy
298 * transaction.
299 * NEED2 -- we detected that a dummy transaction has gone to the
300 * on-disk log with no other transactions.
301 * DONE2 -- we committed a dummy transaction when in the NEED2 state.
302 *
303 * There are two places where we switch states:
304 *
305 * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
306 * We commit the dummy transaction and switch to DONE or DONE2,
307 * respectively. In all other states, we don't do anything.
308 *
309 * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
310 *
311 * No matter what state we are in, if this isn't the dummy
312 * transaction going out, the next state is NEED.
313 * So, if we aren't in the DONE or DONE2 states, the next state
314 * is NEED. We can't be finishing a write of the dummy record
315 * unless it was committed and the state switched to DONE or DONE2.
316 *
317 * If we are in the DONE state and this was a write of the
318 * dummy transaction, we move to NEED2.
319 *
320 * If we are in the DONE2 state and this was a write of the
321 * dummy transaction, we move to IDLE.
322 *
323 *
324 * If we write only one dummy transaction, it can end up in the same
325 * log record as a file space allocation. When this happens, the log
326 * recovery code replays the space allocation and a file could be truncated.
327 * This is why we have the NEED2 and DONE2 states before going idle.
328 */
329
330#define XLOG_STATE_COVER_IDLE 0
331#define XLOG_STATE_COVER_NEED 1
332#define XLOG_STATE_COVER_DONE 2
333#define XLOG_STATE_COVER_NEED2 3
334#define XLOG_STATE_COVER_DONE2 4
335
336#define XLOG_COVER_OPS 5
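/*
 * Minimal sketch (illustrative only; this helper is not part of the
 * original header) of the state transitions described above, as seen
 * when a log write finishes. 'dummy' says whether the record that
 * just went to disk was the dummy transaction.
 */
static inline int
xlog_example_next_cover_state(int cur_state, int dummy)
{
	if (dummy && cur_state == XLOG_STATE_COVER_DONE)
		return XLOG_STATE_COVER_NEED2;
	if (dummy && cur_state == XLOG_STATE_COVER_DONE2)
		return XLOG_STATE_COVER_IDLE;
	/* any non-dummy write, from any state, puts us back to NEED */
	return XLOG_STATE_COVER_NEED;
}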
337
338typedef struct xlog_ticket {
339 sv_t t_sema; /* sleep on this semaphore :20 */
340 struct xlog_ticket *t_next; /* : 4 */
341 struct xlog_ticket *t_prev; /* : 4 */
342 xlog_tid_t t_tid; /* transaction identifier : 4 */
343 int t_curr_res; /* current reservation in bytes : 4 */
344 int t_unit_res; /* unit reservation in bytes : 4 */
345 __uint8_t t_ocnt; /* original count : 1 */
346 __uint8_t t_cnt; /* current count : 1 */
347 __uint8_t t_clientid; /* who does this belong to; : 1 */
348 __uint8_t t_flags; /* properties of reservation : 1 */
349} xlog_ticket_t;
350#endif
351
352
353typedef struct xlog_op_header {
354 xlog_tid_t oh_tid; /* transaction id of operation : 4 b */
355 int oh_len; /* bytes in data region : 4 b */
356 __uint8_t oh_clientid; /* who sent me this : 1 b */
357 __uint8_t oh_flags; /* : 1 b */
358 ushort oh_res2; /* 32 bit align : 2 b */
359} xlog_op_header_t;
360
361
362/* valid values for h_fmt */
363#define XLOG_FMT_UNKNOWN 0
364#define XLOG_FMT_LINUX_LE 1
365#define XLOG_FMT_LINUX_BE 2
366#define XLOG_FMT_IRIX_BE 3
367
368/* our fmt */
369#if __BYTE_ORDER == __LITTLE_ENDIAN
370#define XLOG_FMT XLOG_FMT_LINUX_LE
371#else
372#if __BYTE_ORDER == __BIG_ENDIAN
373#define XLOG_FMT XLOG_FMT_LINUX_BE
374#else
375#error unknown byte order
376#endif
377#endif
378
379typedef struct xlog_rec_header {
380 uint h_magicno; /* log record (LR) identifier : 4 */
381 uint h_cycle; /* write cycle of log : 4 */
382 int h_version; /* LR version : 4 */
383 int h_len; /* len in bytes; should be 64-bit aligned: 4 */
384 xfs_lsn_t h_lsn; /* lsn of this LR : 8 */
385 xfs_lsn_t h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
386 uint h_chksum; /* may not be used; non-zero if used : 4 */
387 int h_prev_block; /* block number to previous LR : 4 */
388 int h_num_logops; /* number of log operations in this LR : 4 */
389 uint h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
390 /* new fields */
391 int h_fmt; /* format of log record : 4 */
392 uuid_t h_fs_uuid; /* uuid of FS : 16 */
393 int h_size; /* iclog size : 4 */
394} xlog_rec_header_t;
395
396typedef struct xlog_rec_ext_header {
397 uint xh_cycle; /* write cycle of log : 4 */
398 uint xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
399} xlog_rec_ext_header_t;
400
401#ifdef __KERNEL__
402/*
403 * - A log record header is 512 bytes. There is plenty of room to grow the
404 * xlog_rec_header_t into the reserved space.
405 * - ic_data follows, so a write to disk can start at the beginning of
406 * the iclog.
407 * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
408 * - ic_next is the pointer to the next iclog in the ring.
409 * - ic_bp is a pointer to the buffer used to write this incore log to disk.
410 * - ic_log is a pointer back to the global log structure.
411 * - ic_callback is a linked list of callback function/argument pairs to be
412 * called after an iclog finishes writing.
413 * - ic_size is the full size of the header plus data.
414 * - ic_offset is the current number of bytes written to in this iclog.
415 * - ic_refcnt is bumped when someone is writing to the log.
416 * - ic_state is the state of the iclog.
417 */
418typedef struct xlog_iclog_fields {
419 sv_t ic_forcesema;
420 sv_t ic_writesema;
421 struct xlog_in_core *ic_next;
422 struct xlog_in_core *ic_prev;
423 struct xfs_buf *ic_bp;
424 struct log *ic_log;
425 xfs_log_callback_t *ic_callback;
426 xfs_log_callback_t **ic_callback_tail;
427#ifdef XFS_LOG_TRACE
428 struct ktrace *ic_trace;
429#endif
430 int ic_size;
431 int ic_offset;
432 int ic_refcnt;
433 int ic_bwritecnt;
434 ushort_t ic_state;
435 char *ic_datap; /* pointer to iclog data */
436} xlog_iclog_fields_t;
437
438typedef union xlog_in_core2 {
439 xlog_rec_header_t hic_header;
440 xlog_rec_ext_header_t hic_xheader;
441 char hic_sector[XLOG_HEADER_SIZE];
442} xlog_in_core_2_t;
443
444typedef struct xlog_in_core {
445 xlog_iclog_fields_t hic_fields;
446 xlog_in_core_2_t *hic_data;
447} xlog_in_core_t;
448
449/*
450 * Defines to save our code from this glop.
451 */
452#define ic_forcesema hic_fields.ic_forcesema
453#define ic_writesema hic_fields.ic_writesema
454#define ic_next hic_fields.ic_next
455#define ic_prev hic_fields.ic_prev
456#define ic_bp hic_fields.ic_bp
457#define ic_log hic_fields.ic_log
458#define ic_callback hic_fields.ic_callback
459#define ic_callback_tail hic_fields.ic_callback_tail
460#define ic_trace hic_fields.ic_trace
461#define ic_size hic_fields.ic_size
462#define ic_offset hic_fields.ic_offset
463#define ic_refcnt hic_fields.ic_refcnt
464#define ic_bwritecnt hic_fields.ic_bwritecnt
465#define ic_state hic_fields.ic_state
466#define ic_datap hic_fields.ic_datap
467#define ic_header hic_data->hic_header
468
469/*
470 * The reservation head lsn is not made up of a cycle number and block number.
471 * Instead, it uses a cycle number and byte number. Logs don't expect to
472 * overflow 31 bits worth of byte offset, so using a byte number will mean
473 * that round-off problems won't occur when releasing partial reservations.
474 */
475typedef struct log {
476 /* The following block of fields are changed while holding icloglock */
477 sema_t l_flushsema; /* iclog flushing semaphore */
478 int l_flushcnt; /* # of procs waiting on this
479 * sema */
480 int l_ticket_cnt; /* free ticket count */
481 int l_ticket_tcnt; /* total ticket count */
482 int l_covered_state;/* state of "covering disk
483 * log entries" */
484 xlog_ticket_t *l_freelist; /* free list of tickets */
485 xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */
486 xlog_ticket_t *l_tail; /* free list of tickets */
487 xlog_in_core_t *l_iclog; /* head log queue */
488 lock_t l_icloglock; /* grab to change iclog state */
489 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
490 * buffers */
491 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
492 struct xfs_mount *l_mp; /* mount point */
493 struct xfs_buf *l_xbuf; /* extra buffer for log
494 * wrapping */
495 struct xfs_buftarg *l_targ; /* buftarg of log */
496 xfs_daddr_t l_logBBstart; /* start block of log */
497 int l_logsize; /* size of log in bytes */
498 int l_logBBsize; /* size of log in BB chunks */
499 int l_curr_cycle; /* Cycle number of log writes */
500 int l_prev_cycle; /* Cycle number before last
501 * block increment */
502 int l_curr_block; /* current logical log block */
503 int l_prev_block; /* previous logical log block */
504 int l_iclog_size; /* size of an iclog in bytes */
505 int l_iclog_size_log; /* log2 of iclog size */
506 int l_iclog_bufs; /* number of iclog buffers */
507
508 /* The following field are used for debugging; need to hold icloglock */
509 char *l_iclog_bak[XLOG_MAX_ICLOGS];
510
511 /* The following block of fields are changed while holding grant_lock */
512 lock_t l_grant_lock;
513 xlog_ticket_t *l_reserve_headq;
514 xlog_ticket_t *l_write_headq;
515 int l_grant_reserve_cycle;
516 int l_grant_reserve_bytes;
517 int l_grant_write_cycle;
518 int l_grant_write_bytes;
519
520 /* The following fields don't need locking */
521#ifdef XFS_LOG_TRACE
522 struct ktrace *l_trace;
523 struct ktrace *l_grant_trace;
524#endif
525 uint l_flags;
526 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
527 struct xfs_buf_cancel **l_buf_cancel_table;
528 int l_iclog_hsize; /* size of iclog header */
529 int l_iclog_heads; /* # of iclog header sectors */
530 uint l_sectbb_log; /* log2 of sector size in BBs */
531 uint l_sectbb_mask; /* sector size (in BBs)
532 * alignment mask */
533} xlog_t;
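/*
 * Minimal sketch (illustrative only; this helper is not part of the
 * original header): because the grant heads are kept as (cycle, byte)
 * pairs rather than (cycle, block) lsns, releasing a partial
 * reservation needs no rounding, and building a comparable lsn is
 * just a byte-to-basic-block conversion.
 */
static inline xfs_lsn_t
xlog_example_reserve_head_lsn(xlog_t *log)
{
	xfs_lsn_t lsn;

	ASSIGN_ANY_LSN_HOST(lsn, log->l_grant_reserve_cycle,
			    BTOBB(log->l_grant_reserve_bytes));
	return lsn;
}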
534
535
536/* common routines */
537extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
538extern int xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk);
539extern int xlog_find_tail(xlog_t *log,
540 xfs_daddr_t *head_blk,
541 xfs_daddr_t *tail_blk,
542 int readonly);
543extern int xlog_recover(xlog_t *log, int readonly);
544extern int xlog_recover_finish(xlog_t *log, int mfsi_flags);
545extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
546extern void xlog_recover_process_iunlinks(xlog_t *log);
547
548extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
549extern void xlog_put_bp(struct xfs_buf *);
550extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
551extern xfs_caddr_t xlog_align(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
552
553/* iclog tracing */
554#define XLOG_TRACE_GRAB_FLUSH 1
555#define XLOG_TRACE_REL_FLUSH 2
556#define XLOG_TRACE_SLEEP_FLUSH 3
557#define XLOG_TRACE_WAKE_FLUSH 4
558
559#endif /* __KERNEL__ */
560
561#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
new file mode 100644
index 000000000000..9824b5bf0ec0
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,4098 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_ag.h"
39#include "xfs_sb.h"
40#include "xfs_trans.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_error.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_alloc.h"
48#include "xfs_attr_sf.h"
49#include "xfs_dir_sf.h"
50#include "xfs_dir2_sf.h"
51#include "xfs_dinode.h"
52#include "xfs_imap.h"
53#include "xfs_inode_item.h"
54#include "xfs_inode.h"
55#include "xfs_ialloc_btree.h"
56#include "xfs_ialloc.h"
57#include "xfs_log_priv.h"
58#include "xfs_buf_item.h"
59#include "xfs_alloc_btree.h"
60#include "xfs_log_recover.h"
61#include "xfs_extfree_item.h"
62#include "xfs_trans_priv.h"
63#include "xfs_bit.h"
64#include "xfs_quota.h"
65#include "xfs_rw.h"
66
67STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
68STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
69STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
70 xlog_recover_item_t *item);
71#if defined(DEBUG)
72STATIC void xlog_recover_check_summary(xlog_t *);
73STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
74#else
75#define xlog_recover_check_summary(log)
76#define xlog_recover_check_ail(mp, lip, gen)
77#endif
78
79
80/*
81 * Sector aligned buffer routines for buffer create/read/write/access
82 */
83
84#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
85 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
86 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
87#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
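/*
 * Worked example (illustrative only): with 2048 byte log sectors,
 * l_sectbb_log = 2 and l_sectbb_mask = 3. A request for 5 basic
 * blocks starting at block 6 becomes
 * XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 5) = 8 blocks starting at
 * XLOG_SECTOR_ROUNDDOWN_BLKNO(log, 6) = 4, i.e. whole sectors.
 */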
88
89xfs_buf_t *
90xlog_get_bp(
91 xlog_t *log,
92 int num_bblks)
93{
94 ASSERT(num_bblks > 0);
95
96 if (log->l_sectbb_log) {
97 if (num_bblks > 1)
98 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
99 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
100 }
101 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
102}
103
104void
105xlog_put_bp(
106 xfs_buf_t *bp)
107{
108 xfs_buf_free(bp);
109}
110
111
112/*
113 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
114 */
115int
116xlog_bread(
117 xlog_t *log,
118 xfs_daddr_t blk_no,
119 int nbblks,
120 xfs_buf_t *bp)
121{
122 int error;
123
124 if (log->l_sectbb_log) {
125 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
126 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
127 }
128
129 ASSERT(nbblks > 0);
130 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
131 ASSERT(bp);
132
133 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
134 XFS_BUF_READ(bp);
135 XFS_BUF_BUSY(bp);
136 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
137 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
138
139 xfsbdstrat(log->l_mp, bp);
140 if ((error = xfs_iowait(bp)))
141 xfs_ioerror_alert("xlog_bread", log->l_mp,
142 bp, XFS_BUF_ADDR(bp));
143 return error;
144}
145
146/*
147 * Write out the buffer at the given block for the given number of blocks.
148 * The buffer is kept locked across the write and is returned locked.
149 * This can only be used for synchronous log writes.
150 */
151int
152xlog_bwrite(
153 xlog_t *log,
154 xfs_daddr_t blk_no,
155 int nbblks,
156 xfs_buf_t *bp)
157{
158 int error;
159
160 if (log->l_sectbb_log) {
161 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
162 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
163 }
164
165 ASSERT(nbblks > 0);
166 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
167
168 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
169 XFS_BUF_ZEROFLAGS(bp);
170 XFS_BUF_BUSY(bp);
171 XFS_BUF_HOLD(bp);
172 XFS_BUF_PSEMA(bp, PRIBIO);
173 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
174 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
175
176 if ((error = xfs_bwrite(log->l_mp, bp)))
177 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
178 bp, XFS_BUF_ADDR(bp));
179 return error;
180}
181
182xfs_caddr_t
183xlog_align(
184 xlog_t *log,
185 xfs_daddr_t blk_no,
186 int nbblks,
187 xfs_buf_t *bp)
188{
189 xfs_caddr_t ptr;
190
191 if (!log->l_sectbb_log)
192 return XFS_BUF_PTR(bp);
193
194 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
195 ASSERT(XFS_BUF_SIZE(bp) >=
196 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
197 return ptr;
198}
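/*
 * Typical usage pattern of the buffer helpers above (illustrative
 * sketch only; this function is not part of the original file):
 * allocate, read, align to the requested block, then release.
 */
STATIC int
xlog_example_read_cycle(
	xlog_t		*log,
	xfs_daddr_t	blk_no,
	uint		*cycle)
{
	xfs_buf_t	*bp;
	xfs_caddr_t	offset;
	int		error;

	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	if ((error = xlog_bread(log, blk_no, 1, bp))) {
		xlog_put_bp(bp);
		return error;
	}
	offset = xlog_align(log, blk_no, 1, bp);
	*cycle = GET_CYCLE(offset, ARCH_CONVERT);
	xlog_put_bp(bp);
	return 0;
}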
199
200#ifdef DEBUG
201/*
202 * dump debug superblock and log record information
203 */
204STATIC void
205xlog_header_check_dump(
206 xfs_mount_t *mp,
207 xlog_rec_header_t *head)
208{
209 int b;
210
211 printk("%s: SB : uuid = ", __FUNCTION__);
212 for (b = 0; b < 16; b++)
213 printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
214 printk(", fmt = %d\n", XLOG_FMT);
215 printk(" log : uuid = ");
216 for (b = 0; b < 16; b++)
217 printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
218 printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
219}
220#else
221#define xlog_header_check_dump(mp, head)
222#endif
223
224/*
225 * check log record header for recovery
226 */
227STATIC int
228xlog_header_check_recover(
229 xfs_mount_t *mp,
230 xlog_rec_header_t *head)
231{
232 ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
233
234 /*
235 * IRIX doesn't write the h_fmt field and leaves it zeroed
236 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
237 * a dirty log created in IRIX.
238 */
239 if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) {
240 xlog_warn(
241 "XFS: dirty log written in incompatible format - can't recover");
242 xlog_header_check_dump(mp, head);
243 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
244 XFS_ERRLEVEL_HIGH, mp);
245 return XFS_ERROR(EFSCORRUPTED);
246 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
247 xlog_warn(
248 "XFS: dirty log entry has mismatched uuid - can't recover");
249 xlog_header_check_dump(mp, head);
250 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
251 XFS_ERRLEVEL_HIGH, mp);
252 return XFS_ERROR(EFSCORRUPTED);
253 }
254 return 0;
255}
256
257/*
258 * read the head block of the log and check the header
259 */
260STATIC int
261xlog_header_check_mount(
262 xfs_mount_t *mp,
263 xlog_rec_header_t *head)
264{
265 ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
266
267 if (uuid_is_nil(&head->h_fs_uuid)) {
268 /*
269 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
270 * h_fs_uuid is nil, we assume this log was last mounted
271 * by IRIX and continue.
272 */
273 xlog_warn("XFS: nil uuid in log - IRIX style log");
274 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
275 xlog_warn("XFS: log has mismatched uuid - can't recover");
276 xlog_header_check_dump(mp, head);
277 XFS_ERROR_REPORT("xlog_header_check_mount",
278 XFS_ERRLEVEL_HIGH, mp);
279 return XFS_ERROR(EFSCORRUPTED);
280 }
281 return 0;
282}
283
284STATIC void
285xlog_recover_iodone(
286 struct xfs_buf *bp)
287{
288 xfs_mount_t *mp;
289
290 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
291
292 if (XFS_BUF_GETERROR(bp)) {
293 /*
294 * We're not going to bother about retrying
295 * this during recovery. One strike!
296 */
297 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
298 xfs_ioerror_alert("xlog_recover_iodone",
299 mp, bp, XFS_BUF_ADDR(bp));
300 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
301 }
302 XFS_BUF_SET_FSPRIVATE(bp, NULL);
303 XFS_BUF_CLR_IODONE_FUNC(bp);
304 xfs_biodone(bp);
305}
306
307/*
308 * This routine finds (to an approximation) the first block in the physical
309 * log which contains the given cycle. It uses a binary search algorithm.
310 * Note that the algorithm cannot be perfect because the disk will not
311 * necessarily be perfect.
312 */
313int
314xlog_find_cycle_start(
315 xlog_t *log,
316 xfs_buf_t *bp,
317 xfs_daddr_t first_blk,
318 xfs_daddr_t *last_blk,
319 uint cycle)
320{
321 xfs_caddr_t offset;
322 xfs_daddr_t mid_blk;
323 uint mid_cycle;
324 int error;
325
326 mid_blk = BLK_AVG(first_blk, *last_blk);
327 while (mid_blk != first_blk && mid_blk != *last_blk) {
328 if ((error = xlog_bread(log, mid_blk, 1, bp)))
329 return error;
330 offset = xlog_align(log, mid_blk, 1, bp);
331 mid_cycle = GET_CYCLE(offset, ARCH_CONVERT);
332 if (mid_cycle == cycle) {
333 *last_blk = mid_blk;
334 /* last_half_cycle == mid_cycle */
335 } else {
336 first_blk = mid_blk;
337 /* first_half_cycle == mid_cycle */
338 }
339 mid_blk = BLK_AVG(first_blk, *last_blk);
340 }
341 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
342 (mid_blk == *last_blk && mid_blk-1 == first_blk));
343
344 return 0;
345}
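/*
 * Worked example (illustrative only): for a log laid out as
 *
 *	blocks:  0  1  2  3  4  5  6  7
 *	cycles:  2  2  2  2  1  1  1  1
 *
 * a call with first_blk = 0, *last_blk = 7 and cycle = 1 narrows
 * *last_blk to 4, the first block stamped with the requested cycle.
 */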
346
347/*
348 * Check that the range of blocks does not contain the cycle number
349 * given. The scan needs to occur from front to back and the ptr into the
350 * region must be updated since a later routine will need to perform another
351 * test. If the region is completely good, we end up returning the same
352 * last block number.
353 *
354 * Set new_blk to -1 if we encounter no errors. This is an invalid block number
355 * since we don't ever expect logs to get this large.
356 */
357STATIC int
358xlog_find_verify_cycle(
359 xlog_t *log,
360 xfs_daddr_t start_blk,
361 int nbblks,
362 uint stop_on_cycle_no,
363 xfs_daddr_t *new_blk)
364{
365 xfs_daddr_t i, j;
366 uint cycle;
367 xfs_buf_t *bp;
368 xfs_daddr_t bufblks;
369 xfs_caddr_t buf = NULL;
370 int error = 0;
371
372 bufblks = 1 << ffs(nbblks);
373
374 while (!(bp = xlog_get_bp(log, bufblks))) {
375 /* can't get enough memory to do everything in one big buffer */
376 bufblks >>= 1;
377 if (bufblks <= log->l_sectbb_log)
378 return ENOMEM;
379 }
380
381 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
382 int bcount;
383
384 bcount = min(bufblks, (start_blk + nbblks - i));
385
386 if ((error = xlog_bread(log, i, bcount, bp)))
387 goto out;
388
389 buf = xlog_align(log, i, bcount, bp);
390 for (j = 0; j < bcount; j++) {
391 cycle = GET_CYCLE(buf, ARCH_CONVERT);
392 if (cycle == stop_on_cycle_no) {
393 *new_blk = i+j;
394 goto out;
395 }
396
397 buf += BBSIZE;
398 }
399 }
400
401 *new_blk = -1;
402
403out:
404 xlog_put_bp(bp);
405 return error;
406}
407
408/*
409 * Potentially backup over partial log record write.
410 *
411 * In the typical case, last_blk is the number of the block directly after
412 * a good log record. Therefore, we subtract one to get the block number
413 * of the last block in the given buffer. extra_bblks contains the number
414 * of blocks we would have read on a previous read. This happens when the
415 * last log record is split over the end of the physical log.
416 *
417 * extra_bblks is the number of blocks potentially verified on a previous
418 * call to this routine.
419 */
420STATIC int
421xlog_find_verify_log_record(
422 xlog_t *log,
423 xfs_daddr_t start_blk,
424 xfs_daddr_t *last_blk,
425 int extra_bblks)
426{
427 xfs_daddr_t i;
428 xfs_buf_t *bp;
429 xfs_caddr_t offset = NULL;
430 xlog_rec_header_t *head = NULL;
431 int error = 0;
432 int smallmem = 0;
433 int num_blks = *last_blk - start_blk;
434 int xhdrs;
435
436 ASSERT(start_blk != 0 || *last_blk != start_blk);
437
438 if (!(bp = xlog_get_bp(log, num_blks))) {
439 if (!(bp = xlog_get_bp(log, 1)))
440 return ENOMEM;
441 smallmem = 1;
442 } else {
443 if ((error = xlog_bread(log, start_blk, num_blks, bp)))
444 goto out;
445 offset = xlog_align(log, start_blk, num_blks, bp);
446 offset += ((num_blks - 1) << BBSHIFT);
447 }
448
449 for (i = (*last_blk) - 1; i >= 0; i--) {
450 if (i < start_blk) {
451 /* valid log record not found */
452 xlog_warn(
453 "XFS: Log inconsistent (didn't find previous header)");
454 ASSERT(0);
455 error = XFS_ERROR(EIO);
456 goto out;
457 }
458
459 if (smallmem) {
460 if ((error = xlog_bread(log, i, 1, bp)))
461 goto out;
462 offset = xlog_align(log, i, 1, bp);
463 }
464
465 head = (xlog_rec_header_t *)offset;
466
467 if (XLOG_HEADER_MAGIC_NUM ==
468 INT_GET(head->h_magicno, ARCH_CONVERT))
469 break;
470
471 if (!smallmem)
472 offset -= BBSIZE;
473 }
474
475 /*
476 * We hit the beginning of the physical log & still no header. Return
477 * to caller. If caller can handle a return of -1, then this routine
478 * will be called again for the end of the physical log.
479 */
480 if (i == -1) {
481 error = -1;
482 goto out;
483 }
484
485 /*
486 * We have the final block of the good log (the first block
487 * of the log record _before_ the head). So we check the uuid.
488 */
489 if ((error = xlog_header_check_mount(log->l_mp, head)))
490 goto out;
491
492 /*
493 * We may have found a log record header before we expected one.
494 * last_blk will be the 1st block # with a given cycle #. We may end
495 * up reading an entire log record. In this case, we don't want to
496 * reset last_blk. Only when last_blk points in the middle of a log
497 * record do we update last_blk.
498 */
499 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
500 uint h_size = INT_GET(head->h_size, ARCH_CONVERT);
501
502 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
503 if (h_size % XLOG_HEADER_CYCLE_SIZE)
504 xhdrs++;
505 } else {
506 xhdrs = 1;
507 }
508
509 if (*last_blk - i + extra_bblks
510 != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs)
511 *last_blk = i;
512
513out:
514 xlog_put_bp(bp);
515 return error;
516}
517
518/*
519 * Head is defined to be the point of the log where the next log
520 * write could go. This means that incomplete LR writes at the end are
521 * eliminated when calculating the head. We aren't guaranteed that previous
522 * LRs have complete transactions. We only know that a cycle number of
523 * current cycle number -1 won't be present in the log if we start writing
524 * from our current block number.
525 *
526 * last_blk contains the block number of the first block with a given
527 * cycle number.
528 *
529 * Return: zero if normal, non-zero if error.
530 */
531int
532xlog_find_head(
533 xlog_t *log,
534 xfs_daddr_t *return_head_blk)
535{
536 xfs_buf_t *bp;
537 xfs_caddr_t offset;
538 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
539 int num_scan_bblks;
540 uint first_half_cycle, last_half_cycle;
541 uint stop_on_cycle;
542 int error, log_bbnum = log->l_logBBsize;
543
544 /* Is the end of the log device zeroed? */
545 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
546 *return_head_blk = first_blk;
547
548 /* Is the whole lot zeroed? */
549 if (!first_blk) {
550 /* Linux XFS shouldn't generate totally zeroed logs -
551 * mkfs etc write a dummy unmount record to a fresh
552 * log so we can store the uuid in there
553 */
554 xlog_warn("XFS: totally zeroed log");
555 }
556
557 return 0;
558 } else if (error) {
559 xlog_warn("XFS: empty log check failed");
560 return error;
561 }
562
563 first_blk = 0; /* get cycle # of 1st block */
564 bp = xlog_get_bp(log, 1);
565 if (!bp)
566 return ENOMEM;
567 if ((error = xlog_bread(log, 0, 1, bp)))
568 goto bp_err;
569 offset = xlog_align(log, 0, 1, bp);
570 first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
571
572 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
573 if ((error = xlog_bread(log, last_blk, 1, bp)))
574 goto bp_err;
575 offset = xlog_align(log, last_blk, 1, bp);
576 last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT);
577 ASSERT(last_half_cycle != 0);
578
579 /*
580 * If the 1st half cycle number is equal to the last half cycle number,
581 * then the entire log is stamped with the same cycle number. In this
582 * case, head_blk can't be set to zero (which makes sense). The below
583 * math doesn't work out properly with head_blk equal to zero. Instead,
584 * we set it to log_bbnum which is an invalid block number, but this
585 * value makes the math correct. If head_blk doesn't change through
586 * all the tests below, *head_blk is set to zero at the very end rather
587 * than log_bbnum. In a sense, log_bbnum and zero are the same block
588 * in a circular file.
589 */
590 if (first_half_cycle == last_half_cycle) {
591 /*
592 * In this case we believe that the entire log should have
593 * cycle number last_half_cycle. We need to scan backwards
594 * from the end verifying that there are no holes still
595 * containing last_half_cycle - 1. If we find such a hole,
596 * then the start of that hole will be the new head. The
597 * simple case looks like
598 * x | x ... | x - 1 | x
599 * Another case that fits this picture would be
600 * x | x + 1 | x ... | x
601 * In this case the head really is somewhere at the end of the
602 * log, as one of the latest writes at the beginning was
603 * incomplete.
604 * One more case is
605 * x | x + 1 | x ... | x - 1 | x
606 * This is really the combination of the above two cases, and
607 * the head has to end up at the start of the x-1 hole at the
608 * end of the log.
609 *
610 * In the 256k log case, we will read from the beginning to the
611 * end of the log and search for cycle numbers equal to x-1.
612 * We don't worry about the x+1 blocks that we encounter,
613 * because we know that they cannot be the head since the log
614 * started with x.
615 */
616 head_blk = log_bbnum;
617 stop_on_cycle = last_half_cycle - 1;
618 } else {
619 /*
620 * In this case we want to find the first block with cycle
621 * number matching last_half_cycle. We expect the log to be
622 * some variation on
623 * x + 1 ... | x ...
624 * The first block with cycle number x (last_half_cycle) will
625 * be where the new head belongs. First we do a binary search
626 * for the first occurrence of last_half_cycle. The binary
627 * search may not be totally accurate, so then we scan back
628 * from there looking for occurrences of last_half_cycle before
629 * us. If that backwards scan wraps around the beginning of
630 * the log, then we look for occurrences of last_half_cycle - 1
631 * at the end of the log. The cases we're looking for look
632 * like
633 * x + 1 ... | x | x + 1 | x ...
634 * ^ binary search stopped here
635 * or
636 * x + 1 ... | x ... | x - 1 | x
637 * <---------> less than scan distance
638 */
639 stop_on_cycle = last_half_cycle;
640 if ((error = xlog_find_cycle_start(log, bp, first_blk,
641 &head_blk, last_half_cycle)))
642 goto bp_err;
643 }
644
645 /*
646 * Now validate the answer. Scan back some number of maximum possible
647 * blocks and make sure each one has the expected cycle number. The
648 * maximum is determined by the total possible amount of buffering
649 * in the in-core log. The following number can be made tighter if
650 * we actually look at the block size of the filesystem.
651 */
652 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
653 if (head_blk >= num_scan_bblks) {
654 /*
655 * We are guaranteed that the entire check can be performed
656 * in one buffer.
657 */
658 start_blk = head_blk - num_scan_bblks;
659 if ((error = xlog_find_verify_cycle(log,
660 start_blk, num_scan_bblks,
661 stop_on_cycle, &new_blk)))
662 goto bp_err;
663 if (new_blk != -1)
664 head_blk = new_blk;
665 } else { /* need to read 2 parts of log */
666 /*
667 * We are going to scan backwards in the log in two parts.
668 * First we scan the physical end of the log. In this part
669 * of the log, we are looking for blocks with cycle number
670 * last_half_cycle - 1.
671 * If we find one, then we know that the log starts there, as
672 * we've found a hole that didn't get written in going around
673 * the end of the physical log. The simple case for this is
674 * x + 1 ... | x ... | x - 1 | x
675 * <---------> less than scan distance
676 * If all of the blocks at the end of the log have cycle number
677 * last_half_cycle, then we check the blocks at the start of
678 * the log looking for occurrences of last_half_cycle. If we
679 * find one, then our current estimate for the location of the
680 * first occurrence of last_half_cycle is wrong and we move
681 * back to the hole we've found. This case looks like
682 * x + 1 ... | x | x + 1 | x ...
683 * ^ binary search stopped here
684 * Another case we need to handle that only occurs in 256k
685 * logs is
686 * x + 1 ... | x ... | x+1 | x ...
687 * ^ binary search stops here
688 * In a 256k log, the scan at the end of the log will see the
689 * x + 1 blocks. We need to skip past those since that is
690 * certainly not the head of the log. By searching for
691 * last_half_cycle-1 we accomplish that.
692 */
693 start_blk = log_bbnum - num_scan_bblks + head_blk;
694 ASSERT(head_blk <= INT_MAX &&
695 (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
696 if ((error = xlog_find_verify_cycle(log, start_blk,
697 num_scan_bblks - (int)head_blk,
698 (stop_on_cycle - 1), &new_blk)))
699 goto bp_err;
700 if (new_blk != -1) {
701 head_blk = new_blk;
702 goto bad_blk;
703 }
704
705 /*
706 * Scan beginning of log now. The last part of the physical
707 * log is good. This scan needs to verify that it doesn't find
708 * the last_half_cycle.
709 */
710 start_blk = 0;
711 ASSERT(head_blk <= INT_MAX);
712 if ((error = xlog_find_verify_cycle(log,
713 start_blk, (int)head_blk,
714 stop_on_cycle, &new_blk)))
715 goto bp_err;
716 if (new_blk != -1)
717 head_blk = new_blk;
718 }
719
720 bad_blk:
721 /*
722 * Now we need to make sure head_blk is not pointing to a block in
723 * the middle of a log record.
724 */
725 num_scan_bblks = XLOG_REC_SHIFT(log);
726 if (head_blk >= num_scan_bblks) {
727 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
728
729 /* start ptr at last block ptr before head_blk */
730 if ((error = xlog_find_verify_log_record(log, start_blk,
731 &head_blk, 0)) == -1) {
732 error = XFS_ERROR(EIO);
733 goto bp_err;
734 } else if (error)
735 goto bp_err;
736 } else {
737 start_blk = 0;
738 ASSERT(head_blk <= INT_MAX);
739 if ((error = xlog_find_verify_log_record(log, start_blk,
740 &head_blk, 0)) == -1) {
741 /* We hit the beginning of the log during our search */
742 start_blk = log_bbnum - num_scan_bblks + head_blk;
743 new_blk = log_bbnum;
744 ASSERT(start_blk <= INT_MAX &&
745 (xfs_daddr_t) log_bbnum-start_blk >= 0);
746 ASSERT(head_blk <= INT_MAX);
747 if ((error = xlog_find_verify_log_record(log,
748 start_blk, &new_blk,
749 (int)head_blk)) == -1) {
750 error = XFS_ERROR(EIO);
751 goto bp_err;
752 } else if (error)
753 goto bp_err;
754 if (new_blk != log_bbnum)
755 head_blk = new_blk;
756 } else if (error)
757 goto bp_err;
758 }
759
760 xlog_put_bp(bp);
761 if (head_blk == log_bbnum)
762 *return_head_blk = 0;
763 else
764 *return_head_blk = head_blk;
765 /*
766 * When returning here, we have a good block number. Bad block
767 * means that during a previous crash, we didn't have a clean break
768 * from cycle number N to cycle number N-1. In this case, we need
769 * to find the first block with cycle number N-1.
770 */
771 return 0;
772
773 bp_err:
774 xlog_put_bp(bp);
775
776 if (error)
777 xlog_warn("XFS: failed to find log head");
778 return error;
779}
780
781/*
782 * Find the sync block number or the tail of the log.
783 *
784 * This will be the block number of the last record to have its
785 * associated buffers synced to disk. Every log record header has
786 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
787 * to get a sync block number. The only concern is to figure out which
788 * log record header to believe.
789 *
790 * The following algorithm uses the log record header with the largest
791 * lsn. The entire log record does not need to be valid. We only care
792 * that the header is valid.
793 *
794 * We could speed up the search by using the current head_blk buffer,
795 * but it is not available.
796 */
797int
798xlog_find_tail(
799 xlog_t *log,
800 xfs_daddr_t *head_blk,
801 xfs_daddr_t *tail_blk,
802 int readonly)
803{
804 xlog_rec_header_t *rhead;
805 xlog_op_header_t *op_head;
806 xfs_caddr_t offset = NULL;
807 xfs_buf_t *bp;
808 int error, i, found;
809 xfs_daddr_t umount_data_blk;
810 xfs_daddr_t after_umount_blk;
811 xfs_lsn_t tail_lsn;
812 int hblks;
813
814 found = 0;
815
816 /*
817 * Find previous log record
818 */
819 if ((error = xlog_find_head(log, head_blk)))
820 return error;
821
822 bp = xlog_get_bp(log, 1);
823 if (!bp)
824 return ENOMEM;
825 if (*head_blk == 0) { /* special case */
826 if ((error = xlog_bread(log, 0, 1, bp)))
827 goto bread_err;
828 offset = xlog_align(log, 0, 1, bp);
829 if (GET_CYCLE(offset, ARCH_CONVERT) == 0) {
830 *tail_blk = 0;
831 /* leave all other log inited values alone */
832 goto exit;
833 }
834 }
835
836 /*
837 * Search backwards looking for log record header block
838 */
839 ASSERT(*head_blk < INT_MAX);
840 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
841 if ((error = xlog_bread(log, i, 1, bp)))
842 goto bread_err;
843 offset = xlog_align(log, i, 1, bp);
844 if (XLOG_HEADER_MAGIC_NUM ==
845 INT_GET(*(uint *)offset, ARCH_CONVERT)) {
846 found = 1;
847 break;
848 }
849 }
850 /*
851 * If we haven't found the log record header block, start looking
852 * again from the end of the physical log. XXXmiken: There should be
853 * a check here to make sure we didn't search more than N blocks in
854 * the previous code.
855 */
856 if (!found) {
857 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
858 if ((error = xlog_bread(log, i, 1, bp)))
859 goto bread_err;
860 offset = xlog_align(log, i, 1, bp);
861 if (XLOG_HEADER_MAGIC_NUM ==
862 INT_GET(*(uint*)offset, ARCH_CONVERT)) {
863 found = 2;
864 break;
865 }
866 }
867 }
868 if (!found) {
869 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
870 ASSERT(0);
	xlog_put_bp(bp);	/* release the buffer on this error path */
871 return XFS_ERROR(EIO);
872 }
873
874 /* find blk_no of tail of log */
875 rhead = (xlog_rec_header_t *)offset;
876 *tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT));
877
878 /*
879 * Reset log values according to the state of the log when we
880 * crashed. In the case where head_blk == 0, we bump curr_cycle
881 * one because the next write starts a new cycle rather than
882 * continuing the cycle of the last good log record. At this
883 * point we have guaranteed that all partial log records have been
884 * accounted for. Therefore, we know that the last good log record
885 * written was complete and ended exactly on the end boundary
886 * of the physical log.
887 */
888 log->l_prev_block = i;
889 log->l_curr_block = (int)*head_blk;
890 log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
891 if (found == 2)
892 log->l_curr_cycle++;
893 log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
894 log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
895 log->l_grant_reserve_cycle = log->l_curr_cycle;
896 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
897 log->l_grant_write_cycle = log->l_curr_cycle;
898 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
899
900 /*
901 * Look for unmount record. If we find it, then we know there
902 * was a clean unmount. Since 'i' could be the last block in
903 * the physical log, we convert to a log block before comparing
904 * to the head_blk.
905 *
906 * Save the current tail lsn to use to pass to
907 * xlog_clear_stale_blocks() below. We won't want to clear the
908 * unmount record if there is one, so we pass the lsn of the
909 * unmount record rather than the block after it.
910 */
911 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
912 int h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
913 int h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
914
915 if ((h_version & XLOG_VERSION_2) &&
916 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
917 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
918 if (h_size % XLOG_HEADER_CYCLE_SIZE)
919 hblks++;
920 } else {
921 hblks = 1;
922 }
923 } else {
924 hblks = 1;
925 }
926 after_umount_blk = (i + hblks + (int)
927 BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
928 tail_lsn = log->l_tail_lsn;
929 if (*head_blk == after_umount_blk &&
930 INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
931 umount_data_blk = (i + hblks) % log->l_logBBsize;
932 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
933 goto bread_err;
934 }
935 offset = xlog_align(log, umount_data_blk, 1, bp);
936 op_head = (xlog_op_header_t *)offset;
937 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
938 /*
939 * Set tail and last sync so that newly written
940 * log records will point recovery to after the
941 * current unmount record.
942 */
943 ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle,
944 after_umount_blk);
945 ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle,
946 after_umount_blk);
947 *tail_blk = after_umount_blk;
948 }
949 }
950
951 /*
952 * Make sure that there are no blocks in front of the head
953 * with the same cycle number as the head. This can happen
954 * because we allow multiple outstanding log writes concurrently,
955 * and the later writes might make it out before earlier ones.
956 *
957 * We use the lsn from before modifying it so that we'll never
958 * overwrite the unmount record after a clean unmount.
959 *
960 * Do this only if we are going to recover the filesystem
961 *
962 * NOTE: This used to say "if (!readonly)"
963 * However on Linux, we can & do recover a read-only filesystem.
964 * We only skip recovery if NORECOVERY is specified on mount,
965 * in which case we would not be here.
966 *
967 * But... if the -device- itself is readonly, just skip this.
968 * We can't recover this device anyway, so it won't matter.
969 */
970 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
971 error = xlog_clear_stale_blocks(log, tail_lsn);
972 }
973
974bread_err:
975exit:
976 xlog_put_bp(bp);
977
978 if (error)
979 xlog_warn("XFS: failed to locate log tail");
980 return error;
981}
982
983/*
984 * Is the log zeroed at all?
985 *
986 * The last binary search should be changed to perform an X block read
987 * once X becomes small enough. You can then search linearly through
988 * the X blocks. This will cut down on the number of reads we need to do.
989 *
990 * If the log is partially zeroed, this routine will pass back the blkno
991 * of the first block with cycle number 0. It won't have a complete LR
992 * preceding it.
993 *
994 * Return:
995 * 0 => the log is completely written to
996 * -1 => use *blk_no as the first block of the log
997 * >0 => error has occurred
998 */
999int
1000xlog_find_zeroed(
1001 xlog_t *log,
1002 xfs_daddr_t *blk_no)
1003{
1004 xfs_buf_t *bp;
1005 xfs_caddr_t offset;
1006 uint first_cycle, last_cycle;
1007 xfs_daddr_t new_blk, last_blk, start_blk;
1008 xfs_daddr_t num_scan_bblks;
1009 int error, log_bbnum = log->l_logBBsize;
1010
1011 /* check totally zeroed log */
1012 bp = xlog_get_bp(log, 1);
1013 if (!bp)
1014 return ENOMEM;
1015 if ((error = xlog_bread(log, 0, 1, bp)))
1016 goto bp_err;
1017 offset = xlog_align(log, 0, 1, bp);
1018 first_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1019 if (first_cycle == 0) { /* completely zeroed log */
1020 *blk_no = 0;
1021 xlog_put_bp(bp);
1022 return -1;
1023 }
1024
1025 /* check partially zeroed log */
1026 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1027 goto bp_err;
1028 offset = xlog_align(log, log_bbnum-1, 1, bp);
1029 last_cycle = GET_CYCLE(offset, ARCH_CONVERT);
1030 if (last_cycle != 0) { /* log completely written to */
1031 xlog_put_bp(bp);
1032 return 0;
1033 } else if (first_cycle != 1) {
1034 /*
1035 * If the cycle of the last block is zero, the cycle of
1036 * the first block must be 1. If it's not, maybe we're
1037 * not looking at a log... Bail out.
1038 */
1039 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
	xlog_put_bp(bp);	/* release the buffer on this error path */
1040 return XFS_ERROR(EINVAL);
1041 }
1042
1043 /* we have a partially zeroed log */
1044 last_blk = log_bbnum-1;
1045 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1046 goto bp_err;
1047
1048 /*
1049 * Validate the answer. Because there is no way to guarantee that
1050 * the entire log is made up of log records which are the same size,
1051 * we scan over the defined maximum blocks. At this point, the maximum
1052 * is not chosen to mean anything special. XXXmiken
1053 */
1054 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1055 ASSERT(num_scan_bblks <= INT_MAX);
1056
1057 if (last_blk < num_scan_bblks)
1058 num_scan_bblks = last_blk;
1059 start_blk = last_blk - num_scan_bblks;
1060
1061 /*
1062 * We search for any instances of cycle number 0 that occur before
1063 * our current estimate of the head. What we're trying to detect is
1064 * 1 ... | 0 | 1 | 0...
1065 * ^ binary search ends here
1066 */
1067 if ((error = xlog_find_verify_cycle(log, start_blk,
1068 (int)num_scan_bblks, 0, &new_blk)))
1069 goto bp_err;
1070 if (new_blk != -1)
1071 last_blk = new_blk;
1072
1073 /*
1074 * Potentially backup over partial log record write. We don't need
1075 * to search the end of the log because we know it is zero.
1076 */
1077 if ((error = xlog_find_verify_log_record(log, start_blk,
1078 &last_blk, 0)) == -1) {
1079 error = XFS_ERROR(EIO);
1080 goto bp_err;
1081 } else if (error)
1082 goto bp_err;
1083
1084 *blk_no = last_blk;
1085bp_err:
1086 xlog_put_bp(bp);
1087 if (error)
1088 return error;
1089 return -1;
1090}
1091
1092/*
1093 * These are simple subroutines used by xlog_clear_stale_blocks() below
1094 * to initialize a buffer full of empty log record headers and write
1095 * them into the log.
1096 */
1097STATIC void
1098xlog_add_record(
1099 xlog_t *log,
1100 xfs_caddr_t buf,
1101 int cycle,
1102 int block,
1103 int tail_cycle,
1104 int tail_block)
1105{
1106 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1107
1108 memset(buf, 0, BBSIZE);
1109 INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
1110 INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
1111 INT_SET(recp->h_version, ARCH_CONVERT,
1112 XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
1113 ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block);
1114 ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block);
1115 INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT);
1116 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1117}
1118
1119STATIC int
1120xlog_write_log_records(
1121 xlog_t *log,
1122 int cycle,
1123 int start_block,
1124 int blocks,
1125 int tail_cycle,
1126 int tail_block)
1127{
1128 xfs_caddr_t offset;
1129 xfs_buf_t *bp;
1130 int balign, ealign;
1131 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1132 int end_block = start_block + blocks;
1133 int bufblks;
1134 int error = 0;
1135 int i, j = 0;
1136
1137 bufblks = 1 << ffs(blocks);
1138 while (!(bp = xlog_get_bp(log, bufblks))) {
1139 bufblks >>= 1;
1140 if (bufblks <= log->l_sectbb_log)
1141 return ENOMEM;
1142 }
1143
1144 /* We may need to do a read at the start to fill in part of
1145 * the buffer in the starting sector not covered by the first
1146 * write below.
1147 */
1148 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1149 if (balign != start_block) {
1150 if ((error = xlog_bread(log, start_block, 1, bp))) {
1151 xlog_put_bp(bp);
1152 return error;
1153 }
1154 j = start_block - balign;
1155 }
1156
1157 for (i = start_block; i < end_block; i += bufblks) {
1158 int bcount, endcount;
1159
1160 bcount = min(bufblks, end_block - start_block);
1161 endcount = bcount - j;
1162
1163 /* We may need to do a read at the end to fill in part of
1164 * the buffer in the final sector not covered by the write.
1165 * If this is the same sector as the above read, skip it.
1166 */
1167 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1168 if (j == 0 && (start_block + endcount > ealign)) {
1169 offset = XFS_BUF_PTR(bp);
1170 balign = BBTOB(ealign - start_block);
1171 XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb));
1172 if ((error = xlog_bread(log, ealign, sectbb, bp)))
1173 break;
1174 XFS_BUF_SET_PTR(bp, offset, bufblks);
1175 }
1176
1177 offset = xlog_align(log, start_block, endcount, bp);
1178 for (; j < endcount; j++) {
1179 xlog_add_record(log, offset, cycle, i+j,
1180 tail_cycle, tail_block);
1181 offset += BBSIZE;
1182 }
1183 error = xlog_bwrite(log, start_block, endcount, bp);
1184 if (error)
1185 break;
1186 start_block += endcount;
1187 j = 0;
1188 }
1189 xlog_put_bp(bp);
1190 return error;
1191}
1192
1193/*
1194 * This routine is called to blow away any incomplete log writes out
1195 * in front of the log head. We do this so that we won't become confused
1196 * if we come up, write only a little bit more, and then crash again.
1197 * If we leave the partial log records out there, this situation could
1198 * cause us to think those partial writes are valid blocks since they
1199 * have the current cycle number. We get rid of them by overwriting them
1200 * with empty log records with the old cycle number rather than the
1201 * current one.
1202 *
1203 * The tail lsn is passed in rather than taken from
1204 * the log so that we will not write over the unmount record after a
1205 * clean unmount in a 512 block log. Doing so would leave the log without
1206 * any valid log records in it until a new one was written. If we crashed
1207 * during that time we would not be able to recover.
1208 */
1209STATIC int
1210xlog_clear_stale_blocks(
1211 xlog_t *log,
1212 xfs_lsn_t tail_lsn)
1213{
1214 int tail_cycle, head_cycle;
1215 int tail_block, head_block;
1216 int tail_distance, max_distance;
1217 int distance;
1218 int error;
1219
1220 tail_cycle = CYCLE_LSN(tail_lsn);
1221 tail_block = BLOCK_LSN(tail_lsn);
1222 head_cycle = log->l_curr_cycle;
1223 head_block = log->l_curr_block;
1224
1225 /*
1226 * Figure out the distance between the new head of the log
1227 * and the tail. We want to write over any blocks beyond the
1228 * head that we may have written just before the crash, but
1229 * we don't want to overwrite the tail of the log.
1230 */
1231 if (head_cycle == tail_cycle) {
1232 /*
1233 * The tail is behind the head in the physical log,
1234 * so the distance from the head to the tail is the
1235 * distance from the head to the end of the log plus
1236 * the distance from the beginning of the log to the
1237 * tail.
1238 */
1239 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1240 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1241 XFS_ERRLEVEL_LOW, log->l_mp);
1242 return XFS_ERROR(EFSCORRUPTED);
1243 }
1244 tail_distance = tail_block + (log->l_logBBsize - head_block);
1245 } else {
1246 /*
1247 * The head is behind the tail in the physical log,
1248 * so the distance from the head to the tail is just
1249 * the tail block minus the head block.
1250 */
1251 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1252 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1253 XFS_ERRLEVEL_LOW, log->l_mp);
1254 return XFS_ERROR(EFSCORRUPTED);
1255 }
1256 tail_distance = tail_block - head_block;
1257 }
1258
1259 /*
1260 * If the head is right up against the tail, we can't clear
1261 * anything.
1262 */
1263 if (tail_distance <= 0) {
1264 ASSERT(tail_distance == 0);
1265 return 0;
1266 }
1267
1268 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1269 /*
1270 * Take the smaller of the maximum amount of outstanding I/O
1271 * we could have and the distance to the tail to clear out.
1272 * We take the smaller so that we don't overwrite the tail and
1273 * we don't waste all day writing from the head to the tail
1274 * for no reason.
1275 */
1276 max_distance = MIN(max_distance, tail_distance);
1277
1278 if ((head_block + max_distance) <= log->l_logBBsize) {
1279 /*
1280 * We can stomp all the blocks we need to without
1281 * wrapping around the end of the log. Just do it
1282 * in a single write. Use the cycle number of the
1283 * current cycle minus one so that the log will look like:
1284 * n ... | n - 1 ...
1285 */
1286 error = xlog_write_log_records(log, (head_cycle - 1),
1287 head_block, max_distance, tail_cycle,
1288 tail_block);
1289 if (error)
1290 return error;
1291 } else {
1292 /*
1293 * We need to wrap around the end of the physical log in
1294 * order to clear all the blocks. Do it in two separate
1295 * I/Os. The first write should be from the head to the
1296 * end of the physical log, and it should use the current
1297 * cycle number minus one just like above.
1298 */
1299 distance = log->l_logBBsize - head_block;
1300 error = xlog_write_log_records(log, (head_cycle - 1),
1301 head_block, distance, tail_cycle,
1302 tail_block);
1303
1304 if (error)
1305 return error;
1306
1307 /*
1308 * Now write the blocks at the start of the physical log.
1309 * This writes the remainder of the blocks we want to clear.
1310 * It uses the current cycle number since we're now on the
1311 * same cycle as the head so that we get:
1312 * n ... n ... | n - 1 ...
1313 * ^^^^^ blocks we're writing
1314 */
1315 distance = max_distance - (log->l_logBBsize - head_block);
1316 error = xlog_write_log_records(log, head_cycle, 0, distance,
1317 tail_cycle, tail_block);
1318 if (error)
1319 return error;
1320 }
1321
1322 return 0;
1323}
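/*
 * Worked example (illustrative only): with l_logBBsize = 1000,
 * head = (cycle 7, block 900) and tail = (cycle 7, block 100),
 * tail_distance = 100 + (1000 - 900) = 200. Assuming max_distance
 * is capped at that 200, the first write stamps blocks 900..999
 * with cycle 6 dummy records and the second stamps blocks 0..99
 * with cycle 7, matching the pictures above.
 */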
1324
1325/******************************************************************************
1326 *
1327 * Log recover routines
1328 *
1329 ******************************************************************************
1330 */
1331
1332STATIC xlog_recover_t *
1333xlog_recover_find_tid(
1334 xlog_recover_t *q,
1335 xlog_tid_t tid)
1336{
1337 xlog_recover_t *p = q;
1338
1339 while (p != NULL) {
1340 if (p->r_log_tid == tid)
1341 break;
1342 p = p->r_next;
1343 }
1344 return p;
1345}
1346
1347STATIC void
1348xlog_recover_put_hashq(
1349 xlog_recover_t **q,
1350 xlog_recover_t *trans)
1351{
1352 trans->r_next = *q;
1353 *q = trans;
1354}
1355
1356STATIC void
1357xlog_recover_add_item(
1358 xlog_recover_item_t **itemq)
1359{
1360 xlog_recover_item_t *item;
1361
1362 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1363 xlog_recover_insert_item_backq(itemq, item);
1364}
1365
1366STATIC int
1367xlog_recover_add_to_cont_trans(
1368 xlog_recover_t *trans,
1369 xfs_caddr_t dp,
1370 int len)
1371{
1372 xlog_recover_item_t *item;
1373 xfs_caddr_t ptr, old_ptr;
1374 int old_len;
1375
1376 item = trans->r_itemq;
1377 if (item == NULL) {
1378 /* finish copying rest of trans header */
1379 xlog_recover_add_item(&trans->r_itemq);
1380 ptr = (xfs_caddr_t) &trans->r_theader +
1381 sizeof(xfs_trans_header_t) - len;
1382 memcpy(ptr, dp, len); /* dest, src, len */
1383 return 0;
1384 }
1385 item = item->ri_prev;
1386
1387 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1388 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1389
1390 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
1391 memcpy(&ptr[old_len], dp, len); /* dest, src, len */
1392 item->ri_buf[item->ri_cnt-1].i_len += len;
1393 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1394 return 0;
1395}
1396
1397/*
1398 * The next region to add is the start of a new region. It could be
1399 * a whole region or just the first part of one. Because
1400 * of this, the assumption here is that the type and size fields of all
1401 * format structures fit into the first 32 bits of the structure.
1402 *
1403 * This works because all regions must be 32 bit aligned. Therefore, we
1404 * either have both fields or we have neither field. In the case we have
1405 * neither field, the data part of the region is zero length. We only have
1406 * a log_op_header and can throw away the header since a new one will appear
1407 * later. If we have at least 4 bytes, then we can determine how many regions
1408 * will appear in the current log item.
1409 */
1410STATIC int
1411xlog_recover_add_to_trans(
1412 xlog_recover_t *trans,
1413 xfs_caddr_t dp,
1414 int len)
1415{
1416 xfs_inode_log_format_t *in_f; /* any will do */
1417 xlog_recover_item_t *item;
1418 xfs_caddr_t ptr;
1419
1420 if (!len)
1421 return 0;
1422 item = trans->r_itemq;
1423 if (item == NULL) {
1424 ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
1425 if (len == sizeof(xfs_trans_header_t))
1426 xlog_recover_add_item(&trans->r_itemq);
1427 memcpy(&trans->r_theader, dp, len); /* dest, src, len */
1428 return 0;
1429 }
1430
1431 ptr = kmem_alloc(len, KM_SLEEP);
1432 memcpy(ptr, dp, len);
1433 in_f = (xfs_inode_log_format_t *)ptr;
1434
1435 if (item->ri_prev->ri_total != 0 &&
1436 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1437 xlog_recover_add_item(&trans->r_itemq);
1438 }
1439 item = trans->r_itemq;
1440 item = item->ri_prev;
1441
1442 if (item->ri_total == 0) { /* first region to be added */
1443 item->ri_total = in_f->ilf_size;
1444 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1445 item->ri_buf = kmem_zalloc((item->ri_total *
1446 sizeof(xfs_log_iovec_t)), KM_SLEEP);
1447 }
1448 ASSERT(item->ri_total > item->ri_cnt);
1449 /* Description region is ri_buf[0] */
1450 item->ri_buf[item->ri_cnt].i_addr = ptr;
1451 item->ri_buf[item->ri_cnt].i_len = len;
1452 item->ri_cnt++;
1453 return 0;
1454}
1455
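/*
 * Illustrative sketch (hypothetical, not part of the XFS source): every log
 * item format structure starts with 16-bit type and size fields, which is
 * why xlog_recover_add_to_trans() can overlay xfs_inode_log_format_t on any
 * item just to read ilf_size. A minimal standalone model of that assumption:
 */
#include <stdint.h>

struct any_log_format {
	uint16_t type;	/* log item type, e.g. XFS_LI_INODE */
	uint16_t size;	/* number of regions in this item */
};

/* Peek at the region count carried in the first 32 bits. */
static uint16_t
region_count(const void *dp)
{
	return ((const struct any_log_format *)dp)->size;
}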
1456STATIC void
1457xlog_recover_new_tid(
1458 xlog_recover_t **q,
1459 xlog_tid_t tid,
1460 xfs_lsn_t lsn)
1461{
1462 xlog_recover_t *trans;
1463
1464 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1465 trans->r_log_tid = tid;
1466 trans->r_lsn = lsn;
1467 xlog_recover_put_hashq(q, trans);
1468}
1469
1470STATIC int
1471xlog_recover_unlink_tid(
1472 xlog_recover_t **q,
1473 xlog_recover_t *trans)
1474{
1475 xlog_recover_t *tp;
1476 int found = 0;
1477
1478 ASSERT(trans != NULL);
1479 if (trans == *q) {
1480 *q = (*q)->r_next;
1481 } else {
1482 tp = *q;
1483 while (tp != NULL) {
1484 if (tp->r_next == trans) {
1485 found = 1;
1486 break;
1487 }
1488 tp = tp->r_next;
1489 }
1490 if (!found) {
1491 xlog_warn(
1492 "XFS: xlog_recover_unlink_tid: trans not found");
1493 ASSERT(0);
1494 return XFS_ERROR(EIO);
1495 }
1496 tp->r_next = tp->r_next->r_next;
1497 }
1498 return 0;
1499}
1500
1501STATIC void
1502xlog_recover_insert_item_backq(
1503 xlog_recover_item_t **q,
1504 xlog_recover_item_t *item)
1505{
1506 if (*q == NULL) {
1507 item->ri_prev = item->ri_next = item;
1508 *q = item;
1509 } else {
1510 item->ri_next = *q;
1511 item->ri_prev = (*q)->ri_prev;
1512 (*q)->ri_prev = item;
1513 item->ri_prev->ri_next = item;
1514 }
1515}
1516
1517STATIC void
1518xlog_recover_insert_item_frontq(
1519 xlog_recover_item_t **q,
1520 xlog_recover_item_t *item)
1521{
1522 xlog_recover_insert_item_backq(q, item);
1523 *q = item;
1524}
1525
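/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the item
 * queue used above is a circular doubly linked list whose head is *q.
 * Back-insertion links the new node just before the head, and
 * front-insertion, as the code above shows, is the same splice followed by
 * repointing *q at the new node.
 */
struct node {
	struct node *prev;
	struct node *next;
};

static void
insert_back(struct node **q, struct node *n)
{
	if (*q == NULL) {
		n->prev = n->next = n;	/* singleton ring */
		*q = n;
	} else {
		n->next = *q;		/* new node becomes the tail */
		n->prev = (*q)->prev;
		(*q)->prev->next = n;	/* old tail now points at n */
		(*q)->prev = n;
	}
}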
1526STATIC int
1527xlog_recover_reorder_trans(
1528 xlog_t *log,
1529 xlog_recover_t *trans)
1530{
1531 xlog_recover_item_t *first_item, *itemq, *itemq_next;
1532 xfs_buf_log_format_t *buf_f;
1533 xfs_buf_log_format_v1_t *obuf_f;
1534 ushort flags = 0;
1535
1536 first_item = itemq = trans->r_itemq;
1537 trans->r_itemq = NULL;
1538 do {
1539 itemq_next = itemq->ri_next;
1540 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1541 switch (ITEM_TYPE(itemq)) {
1542 case XFS_LI_BUF:
1543 flags = buf_f->blf_flags;
1544 break;
1545 case XFS_LI_6_1_BUF:
1546 case XFS_LI_5_3_BUF:
1547 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1548 flags = obuf_f->blf_flags;
1549 break;
1550 }
1551
1552 switch (ITEM_TYPE(itemq)) {
1553 case XFS_LI_BUF:
1554 case XFS_LI_6_1_BUF:
1555 case XFS_LI_5_3_BUF:
1556 if (!(flags & XFS_BLI_CANCEL)) {
1557 xlog_recover_insert_item_frontq(&trans->r_itemq,
1558 itemq);
1559 break;
1560 }
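/* buffer was cancelled: fall through and queue it at the back */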
1561 case XFS_LI_INODE:
1562 case XFS_LI_6_1_INODE:
1563 case XFS_LI_5_3_INODE:
1564 case XFS_LI_DQUOT:
1565 case XFS_LI_QUOTAOFF:
1566 case XFS_LI_EFD:
1567 case XFS_LI_EFI:
1568 xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1569 break;
1570 default:
1571 xlog_warn(
1572 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1573 ASSERT(0);
1574 return XFS_ERROR(EIO);
1575 }
1576 itemq = itemq_next;
1577 } while (first_item != itemq);
1578 return 0;
1579}
1580
1581/*
1582 * Build up the table of buf cancel records so that we don't replay
1583 * cancelled data in the second pass. For buffer records that are
1584 * not cancel records, there is nothing to do here so we just return.
1585 *
1586 * If we get a cancel record which is already in the table, this indicates
1587 * that the buffer was cancelled multiple times. In order to ensure
1588 * that during pass 2 we keep the record in the table until we reach its
1589 * last occurrence in the log, we keep a reference count in the cancel
1590 * record in the table to tell us how many times we expect to see this
1591 * record during the second pass.
1592 */
1593STATIC void
1594xlog_recover_do_buffer_pass1(
1595 xlog_t *log,
1596 xfs_buf_log_format_t *buf_f)
1597{
1598 xfs_buf_cancel_t *bcp;
1599 xfs_buf_cancel_t *nextp;
1600 xfs_buf_cancel_t *prevp;
1601 xfs_buf_cancel_t **bucket;
1602 xfs_buf_log_format_v1_t *obuf_f;
1603 xfs_daddr_t blkno = 0;
1604 uint len = 0;
1605 ushort flags = 0;
1606
1607 switch (buf_f->blf_type) {
1608 case XFS_LI_BUF:
1609 blkno = buf_f->blf_blkno;
1610 len = buf_f->blf_len;
1611 flags = buf_f->blf_flags;
1612 break;
1613 case XFS_LI_6_1_BUF:
1614 case XFS_LI_5_3_BUF:
1615 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1616 blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1617 len = obuf_f->blf_len;
1618 flags = obuf_f->blf_flags;
1619 break;
1620 }
1621
1622 /*
1623 * If this isn't a cancel buffer item, then just return.
1624 */
1625 if (!(flags & XFS_BLI_CANCEL))
1626 return;
1627
1628 /*
1629 * Insert an xfs_buf_cancel record into the hash table of
1630 * them. If there is already an identical record, bump
1631 * its reference count.
1632 */
1633 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1634 XLOG_BC_TABLE_SIZE];
1635 /*
1636 * If the hash bucket is empty then just insert a new record into
1637 * the bucket.
1638 */
1639 if (*bucket == NULL) {
1640 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1641 KM_SLEEP);
1642 bcp->bc_blkno = blkno;
1643 bcp->bc_len = len;
1644 bcp->bc_refcount = 1;
1645 bcp->bc_next = NULL;
1646 *bucket = bcp;
1647 return;
1648 }
1649
1650 /*
1651 * The hash bucket is not empty, so search for duplicates of our
1652 * record. If we find one, just bump its refcount. If not,
1653 * then add a new record at the end of the list.
1654 */
1655 prevp = NULL;
1656 nextp = *bucket;
1657 while (nextp != NULL) {
1658 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1659 nextp->bc_refcount++;
1660 return;
1661 }
1662 prevp = nextp;
1663 nextp = nextp->bc_next;
1664 }
1665 ASSERT(prevp != NULL);
1666 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1667 KM_SLEEP);
1668 bcp->bc_blkno = blkno;
1669 bcp->bc_len = len;
1670 bcp->bc_refcount = 1;
1671 bcp->bc_next = NULL;
1672 prevp->bc_next = bcp;
1673}
1674
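/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the cancel
 * table built above is a fixed-size array of singly linked buckets keyed by
 * block number. A standalone lookup helper mirroring the duplicate search in
 * xlog_recover_do_buffer_pass1():
 */
#include <stddef.h>

#define BC_TABLE_SIZE 64	/* stand-in for XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	long long blkno;
	unsigned int len;
	int refcount;
	struct buf_cancel *next;
};

static struct buf_cancel *
find_cancel(struct buf_cancel *table[], long long blkno, unsigned int len)
{
	struct buf_cancel *bcp;

	bcp = table[(unsigned long long)blkno % BC_TABLE_SIZE];
	for (; bcp != NULL; bcp = bcp->next)
		if (bcp->blkno == blkno && bcp->len == len)
			return bcp;	/* caller bumps the refcount */
	return NULL;
}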
1675/*
1676 * Check to see whether the buffer being recovered has a corresponding
1677 * entry in the buffer cancel record table. If it does then return 1
1678 * so that it will be cancelled, otherwise return 0. If the buffer is
1679 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1680 * the refcount on the entry in the table and remove it from the table
1681 * if this is the last reference.
1682 *
1683 * We remove the cancel record from the table when we encounter its
1684 * last occurrence in the log so that if the same buffer is re-used
1685 * again after its last cancellation we actually replay the changes
1686 * made at that point.
1687 */
1688STATIC int
1689xlog_check_buffer_cancelled(
1690 xlog_t *log,
1691 xfs_daddr_t blkno,
1692 uint len,
1693 ushort flags)
1694{
1695 xfs_buf_cancel_t *bcp;
1696 xfs_buf_cancel_t *prevp;
1697 xfs_buf_cancel_t **bucket;
1698
1699 if (log->l_buf_cancel_table == NULL) {
1700 /*
1701 * There is nothing in the table built in pass one,
1702 * so this buffer must not be cancelled.
1703 */
1704 ASSERT(!(flags & XFS_BLI_CANCEL));
1705 return 0;
1706 }
1707
1708 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1709 XLOG_BC_TABLE_SIZE];
1710 bcp = *bucket;
1711 if (bcp == NULL) {
1712 /*
1713 * There is no corresponding entry in the table built
1714 * in pass one, so this buffer has not been cancelled.
1715 */
1716 ASSERT(!(flags & XFS_BLI_CANCEL));
1717 return 0;
1718 }
1719
1720 /*
1721 * Search for an entry in the buffer cancel table that
1722 * matches our buffer.
1723 */
1724 prevp = NULL;
1725 while (bcp != NULL) {
1726 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1727 /*
1728 * We've got a match, so return 1 so that the
1729 * recovery of this buffer is cancelled.
1730 * If this buffer is actually a buffer cancel
1731 * log item, then decrement the refcount on the
1732 * one in the table and remove it if this is the
1733 * last reference.
1734 */
1735 if (flags & XFS_BLI_CANCEL) {
1736 bcp->bc_refcount--;
1737 if (bcp->bc_refcount == 0) {
1738 if (prevp == NULL) {
1739 *bucket = bcp->bc_next;
1740 } else {
1741 prevp->bc_next = bcp->bc_next;
1742 }
1743 kmem_free(bcp,
1744 sizeof(xfs_buf_cancel_t));
1745 }
1746 }
1747 return 1;
1748 }
1749 prevp = bcp;
1750 bcp = bcp->bc_next;
1751 }
1752 /*
1753 * We didn't find a corresponding entry in the table, so
1754 * return 0 so that the buffer is NOT cancelled.
1755 */
1756 ASSERT(!(flags & XFS_BLI_CANCEL));
1757 return 0;
1758}
1759
1760STATIC int
1761xlog_recover_do_buffer_pass2(
1762 xlog_t *log,
1763 xfs_buf_log_format_t *buf_f)
1764{
1765 xfs_buf_log_format_v1_t *obuf_f;
1766 xfs_daddr_t blkno = 0;
1767 ushort flags = 0;
1768 uint len = 0;
1769
1770 switch (buf_f->blf_type) {
1771 case XFS_LI_BUF:
1772 blkno = buf_f->blf_blkno;
1773 flags = buf_f->blf_flags;
1774 len = buf_f->blf_len;
1775 break;
1776 case XFS_LI_6_1_BUF:
1777 case XFS_LI_5_3_BUF:
1778 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1779 blkno = (xfs_daddr_t) obuf_f->blf_blkno;
1780 flags = obuf_f->blf_flags;
1781 len = (xfs_daddr_t) obuf_f->blf_len;
1782 break;
1783 }
1784
1785 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1786}
1787
1788/*
1789 * Perform recovery for a buffer full of inodes. In these buffers,
1790 * the only data which should be recovered is that which corresponds
1791 * to the di_next_unlinked pointers in the on disk inode structures.
1792 * The rest of the data for the inodes is always logged through the
1793 * inodes themselves rather than the inode buffer and is recovered
1794 * in xlog_recover_do_inode_trans().
1795 *
1796 * The only time when buffers full of inodes are fully recovered is
1797 * when the buffer is full of newly allocated inodes. In this case
1798 * the buffer will not be marked as an inode buffer and so will be
1799 * sent to xlog_recover_do_reg_buffer() below during recovery.
1800 */
1801STATIC int
1802xlog_recover_do_inode_buffer(
1803 xfs_mount_t *mp,
1804 xlog_recover_item_t *item,
1805 xfs_buf_t *bp,
1806 xfs_buf_log_format_t *buf_f)
1807{
1808 int i;
1809 int item_index;
1810 int bit;
1811 int nbits;
1812 int reg_buf_offset;
1813 int reg_buf_bytes;
1814 int next_unlinked_offset;
1815 int inodes_per_buf;
1816 xfs_agino_t *logged_nextp;
1817 xfs_agino_t *buffer_nextp;
1818 xfs_buf_log_format_v1_t *obuf_f;
1819 unsigned int *data_map = NULL;
1820 unsigned int map_size = 0;
1821
1822 switch (buf_f->blf_type) {
1823 case XFS_LI_BUF:
1824 data_map = buf_f->blf_data_map;
1825 map_size = buf_f->blf_map_size;
1826 break;
1827 case XFS_LI_6_1_BUF:
1828 case XFS_LI_5_3_BUF:
1829 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1830 data_map = obuf_f->blf_data_map;
1831 map_size = obuf_f->blf_map_size;
1832 break;
1833 }
1834 /*
1835 * Set the variables corresponding to the current region to
1836 * 0 so that we'll initialize them on the first pass through
1837 * the loop.
1838 */
1839 reg_buf_offset = 0;
1840 reg_buf_bytes = 0;
1841 bit = 0;
1842 nbits = 0;
1843 item_index = 0;
1844 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1845 for (i = 0; i < inodes_per_buf; i++) {
1846 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1847 offsetof(xfs_dinode_t, di_next_unlinked);
1848
1849 while (next_unlinked_offset >=
1850 (reg_buf_offset + reg_buf_bytes)) {
1851 /*
1852 * The next di_next_unlinked field is beyond
1853 * the current logged region. Find the next
1854 * logged region that contains or is beyond
1855 * the current di_next_unlinked field.
1856 */
1857 bit += nbits;
1858 bit = xfs_next_bit(data_map, map_size, bit);
1859
1860 /*
1861 * If there are no more logged regions in the
1862 * buffer, then we're done.
1863 */
1864 if (bit == -1) {
1865 return 0;
1866 }
1867
1868 nbits = xfs_contig_bits(data_map, map_size,
1869 bit);
1870 ASSERT(nbits > 0);
1871 reg_buf_offset = bit << XFS_BLI_SHIFT;
1872 reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1873 item_index++;
1874 }
1875
1876 /*
1877 * If the current logged region starts after the current
1878 * di_next_unlinked field, then move on to the next
1879 * di_next_unlinked field.
1880 */
1881 if (next_unlinked_offset < reg_buf_offset) {
1882 continue;
1883 }
1884
1885 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1886 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1887 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1888
1889 /*
1890 * The current logged region contains a copy of the
1891 * current di_next_unlinked field. Extract its value
1892 * and copy it to the buffer copy.
1893 */
1894 logged_nextp = (xfs_agino_t *)
1895 ((char *)(item->ri_buf[item_index].i_addr) +
1896 (next_unlinked_offset - reg_buf_offset));
1897 if (unlikely(*logged_nextp == 0)) {
1898 xfs_fs_cmn_err(CE_ALERT, mp,
1899 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
1900 item, bp);
1901 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1902 XFS_ERRLEVEL_LOW, mp);
1903 return XFS_ERROR(EFSCORRUPTED);
1904 }
1905
1906 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1907 next_unlinked_offset);
1908 INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
1909 }
1910
1911 return 0;
1912}
1913
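/*
 * Illustrative sketch (hypothetical, not part of the XFS source): logged
 * regions are tracked in XFS_BLI_CHUNK-sized units, so a run of set bits
 * converts to a byte range with XFS_BLI_SHIFT, exactly as the loop above
 * computes reg_buf_offset and reg_buf_bytes. Assuming the historical values
 * of a 128-byte chunk and a shift of 7:
 */
#define BLI_SHIFT 7			/* stand-in for XFS_BLI_SHIFT */
#define BLI_CHUNK (1 << BLI_SHIFT)	/* stand-in for XFS_BLI_CHUNK */

static void
region_to_bytes(int bit, int nbits, int *offset, int *bytes)
{
	*offset = bit << BLI_SHIFT;	/* byte offset of the region */
	*bytes = nbits << BLI_SHIFT;	/* byte length of the region */
}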
1914/*
1915 * Perform a 'normal' buffer recovery. Each logged region of the
1916 * buffer should be copied over the corresponding region in the
1917 * given buffer. The bitmap in the buf log format structure indicates
1918 * where to place the logged data.
1919 */
1920/*ARGSUSED*/
1921STATIC void
1922xlog_recover_do_reg_buffer(
1923 xfs_mount_t *mp,
1924 xlog_recover_item_t *item,
1925 xfs_buf_t *bp,
1926 xfs_buf_log_format_t *buf_f)
1927{
1928 int i;
1929 int bit;
1930 int nbits;
1931 xfs_buf_log_format_v1_t *obuf_f;
1932 unsigned int *data_map = NULL;
1933 unsigned int map_size = 0;
1934 int error;
1935
1936 switch (buf_f->blf_type) {
1937 case XFS_LI_BUF:
1938 data_map = buf_f->blf_data_map;
1939 map_size = buf_f->blf_map_size;
1940 break;
1941 case XFS_LI_6_1_BUF:
1942 case XFS_LI_5_3_BUF:
1943 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
1944 data_map = obuf_f->blf_data_map;
1945 map_size = obuf_f->blf_map_size;
1946 break;
1947 }
1948 bit = 0;
1949 i = 1; /* 0 is the buf format structure */
1950 while (1) {
1951 bit = xfs_next_bit(data_map, map_size, bit);
1952 if (bit == -1)
1953 break;
1954 nbits = xfs_contig_bits(data_map, map_size, bit);
1955 ASSERT(nbits > 0);
1956 ASSERT(item->ri_buf[i].i_addr != NULL);
1957 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1958 ASSERT(XFS_BUF_COUNT(bp) >=
1959 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1960
1961 /*
1962 * Do a sanity check if this is a dquot buffer. Just checking
1963 * the first dquot in the buffer should do. XXXThis is
1964 * probably a good thing to do for other buf types also.
1965 */
1966 error = 0;
1967 if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1968 error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1969 item->ri_buf[i].i_addr,
1970 -1, 0, XFS_QMOPT_DOWARN,
1971 "dquot_buf_recover");
1972 }
1973 if (!error)
1974 memcpy(xfs_buf_offset(bp,
1975 (uint)bit << XFS_BLI_SHIFT), /* dest */
1976 item->ri_buf[i].i_addr, /* source */
1977 nbits<<XFS_BLI_SHIFT); /* length */
1978 i++;
1979 bit += nbits;
1980 }
1981
1982 /* Shouldn't be any more regions */
1983 ASSERT(i == item->ri_total);
1984}
1985
1986/*
1987 * Do some primitive error checking on ondisk dquot data structures.
1988 */
1989int
1990xfs_qm_dqcheck(
1991 xfs_disk_dquot_t *ddq,
1992 xfs_dqid_t id,
1993 uint type, /* used only when IO_dorepair is true */
1994 uint flags,
1995 char *str)
1996{
1997 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1998 int errs = 0;
1999
2000 /*
2001 * We can encounter an uninitialized dquot buffer for 2 reasons:
2002 * 1. If we crash while deleting the quotainode(s), and those blks got
2003 * used for user data. This is because we take the path of regular
2004 * file deletion; however, the size field of quotainodes is never
2005 * updated, so all the tricks that we play in itruncate_finish
2006 * don't quite matter.
2007 *
2008 * 2. We don't replay the quota buffers when there's a quotaoff logitem.
2009 * But the allocation will be replayed so we'll end up with an
2010 * uninitialized quota block.
2011 *
2012 * This is all fine; things are still consistent, and we haven't lost
2013 * any quota information. Just don't complain about bad dquot blks.
2014 */
2015 if (INT_GET(ddq->d_magic, ARCH_CONVERT) != XFS_DQUOT_MAGIC) {
2016 if (flags & XFS_QMOPT_DOWARN)
2017 cmn_err(CE_ALERT,
2018 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
2019 str, id,
2020 INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_MAGIC);
2021 errs++;
2022 }
2023 if (INT_GET(ddq->d_version, ARCH_CONVERT) != XFS_DQUOT_VERSION) {
2024 if (flags & XFS_QMOPT_DOWARN)
2025 cmn_err(CE_ALERT,
2026 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
2027 str, id,
2028 INT_GET(ddq->d_version, ARCH_CONVERT), XFS_DQUOT_VERSION);
2029 errs++;
2030 }
2031
2032 if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER &&
2033 INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) {
2034 if (flags & XFS_QMOPT_DOWARN)
2035 cmn_err(CE_ALERT,
2036 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
2037 str, id, INT_GET(ddq->d_flags, ARCH_CONVERT));
2038 errs++;
2039 }
2040
2041 if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) {
2042 if (flags & XFS_QMOPT_DOWARN)
2043 cmn_err(CE_ALERT,
2044 "%s : ondisk-dquot 0x%p, ID mismatch: "
2045 "0x%x expected, found id 0x%x",
2046 str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT));
2047 errs++;
2048 }
2049
2050 if (!errs && ddq->d_id) {
2051 if (INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT) &&
2052 INT_GET(ddq->d_bcount, ARCH_CONVERT) >=
2053 INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT)) {
2054 if (!ddq->d_btimer) {
2055 if (flags & XFS_QMOPT_DOWARN)
2056 cmn_err(CE_ALERT,
2057 "%s : Dquot ID 0x%x (0x%p) "
2058 "BLK TIMER NOT STARTED",
2059 str, (int)
2060 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2061 errs++;
2062 }
2063 }
2064 if (INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT) &&
2065 INT_GET(ddq->d_icount, ARCH_CONVERT) >=
2066 INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT)) {
2067 if (!ddq->d_itimer) {
2068 if (flags & XFS_QMOPT_DOWARN)
2069 cmn_err(CE_ALERT,
2070 "%s : Dquot ID 0x%x (0x%p) "
2071 "INODE TIMER NOT STARTED",
2072 str, (int)
2073 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2074 errs++;
2075 }
2076 }
2077 if (INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT) &&
2078 INT_GET(ddq->d_rtbcount, ARCH_CONVERT) >=
2079 INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT)) {
2080 if (!ddq->d_rtbtimer) {
2081 if (flags & XFS_QMOPT_DOWARN)
2082 cmn_err(CE_ALERT,
2083 "%s : Dquot ID 0x%x (0x%p) "
2084 "RTBLK TIMER NOT STARTED",
2085 str, (int)
2086 INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
2087 errs++;
2088 }
2089 }
2090 }
2091
2092 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2093 return errs;
2094
2095 if (flags & XFS_QMOPT_DOWARN)
2096 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2097
2098 /*
2099 * Typically, a repair is only requested by quotacheck.
2100 */
2101 ASSERT(id != -1);
2102 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2103 memset(d, 0, sizeof(xfs_dqblk_t));
2104 INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
2105 INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
2106 INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
2107 INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
2108
2109 return errs;
2110}
2111
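/*
 * Illustrative usage sketch (hypothetical call site, not from the XFS
 * source): log recovery passes XFS_QMOPT_DOWARN alone, so a bad dquot is
 * only reported. Quotacheck is the caller expected to add
 * XFS_QMOPT_DQREPAIR, which makes xfs_qm_dqcheck() re-initialize the
 * block in place:
 *
 *	errs = xfs_qm_dqcheck(ddq, id, XFS_DQ_USER,
 *			XFS_QMOPT_DOWARN | XFS_QMOPT_DQREPAIR,
 *			"quotacheck");
 *
 * A nonzero return means errors were found; with DQREPAIR set, the dquot
 * has by then been reset to the given id and type.
 */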
2112/*
2113 * Perform a dquot buffer recovery.
2114 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2115 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2116 * Else, treat it as a regular buffer and do recovery.
2117 */
2118STATIC void
2119xlog_recover_do_dquot_buffer(
2120 xfs_mount_t *mp,
2121 xlog_t *log,
2122 xlog_recover_item_t *item,
2123 xfs_buf_t *bp,
2124 xfs_buf_log_format_t *buf_f)
2125{
2126 uint type;
2127
2128 /*
2129 * Filesystems are required to send in quota flags at mount time.
2130 */
2131 if (mp->m_qflags == 0) {
2132 return;
2133 }
2134
2135 type = 0;
2136 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2137 type |= XFS_DQ_USER;
2138 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2139 type |= XFS_DQ_GROUP;
2140 /*
2141 * This type of quota was turned off, so ignore this buffer.
2142 */
2143 if (log->l_quotaoffs_flag & type)
2144 return;
2145
2146 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2147}
2148
2149/*
2150 * This routine replays a modification made to a buffer at runtime.
2151 * There are actually two types of buffer, regular and inode, which
2152 * are handled differently. Inode buffers are handled differently
2153 * in that we only recover a specific set of data from them, namely
2154 * the inode di_next_unlinked fields. This is because all other inode
2155 * data is actually logged via inode records and any data we replay
2156 * here which overlaps that may be stale.
2157 *
2158 * When meta-data buffers are freed at run time we log a buffer item
2159 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2160 * of the buffer in the log should not be replayed at recovery time.
2161 * This is so that if the blocks covered by the buffer are reused for
2162 * file data before we crash, we don't end up replaying old, freed
2163 * meta-data into a user's file.
2164 *
2165 * To handle the cancellation of buffer log items, we make two passes
2166 * over the log during recovery. During the first we build a table of
2167 * those buffers which have been cancelled, and during the second we
2168 * only replay those buffers which do not have corresponding cancel
2169 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2170 * for more details on the implementation of the table of cancel records.
2171 */
2172STATIC int
2173xlog_recover_do_buffer_trans(
2174 xlog_t *log,
2175 xlog_recover_item_t *item,
2176 int pass)
2177{
2178 xfs_buf_log_format_t *buf_f;
2179 xfs_buf_log_format_v1_t *obuf_f;
2180 xfs_mount_t *mp;
2181 xfs_buf_t *bp;
2182 int error;
2183 int cancel;
2184 xfs_daddr_t blkno;
2185 int len;
2186 ushort flags;
2187
2188 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2189
2190 if (pass == XLOG_RECOVER_PASS1) {
2191 /*
2192 * In this pass we're only looking for buf items
2193 * with the XFS_BLI_CANCEL bit set.
2194 */
2195 xlog_recover_do_buffer_pass1(log, buf_f);
2196 return 0;
2197 } else {
2198 /*
2199 * In this pass we want to recover all the buffers
2200 * which have not been cancelled and are not
2201 * cancellation buffers themselves. The routine
2202 * we call here will tell us whether or not to
2203 * continue with the replay of this buffer.
2204 */
2205 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2206 if (cancel) {
2207 return 0;
2208 }
2209 }
2210 switch (buf_f->blf_type) {
2211 case XFS_LI_BUF:
2212 blkno = buf_f->blf_blkno;
2213 len = buf_f->blf_len;
2214 flags = buf_f->blf_flags;
2215 break;
2216 case XFS_LI_6_1_BUF:
2217 case XFS_LI_5_3_BUF:
2218 obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
2219 blkno = obuf_f->blf_blkno;
2220 len = obuf_f->blf_len;
2221 flags = obuf_f->blf_flags;
2222 break;
2223 default:
2224 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2225 "xfs_log_recover: unknown buffer type 0x%x, dev %s",
2226 buf_f->blf_type, XFS_BUFTARG_NAME(log->l_targ));
2227 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2228 XFS_ERRLEVEL_LOW, log->l_mp);
2229 return XFS_ERROR(EFSCORRUPTED);
2230 }
2231
2232 mp = log->l_mp;
2233 if (flags & XFS_BLI_INODE_BUF) {
2234 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2235 XFS_BUF_LOCK);
2236 } else {
2237 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2238 }
2239 if (XFS_BUF_ISERROR(bp)) {
2240 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2241 bp, blkno);
2242 error = XFS_BUF_GETERROR(bp);
2243 xfs_buf_relse(bp);
2244 return error;
2245 }
2246
2247 error = 0;
2248 if (flags & XFS_BLI_INODE_BUF) {
2249 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2250 } else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) {
2251 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2252 } else {
2253 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2254 }
2255 if (error) {
xfs_buf_relse(bp);
2256 return XFS_ERROR(error);
}
2257
2258 /*
2259 * Perform delayed write on the buffer. Asynchronous writes will be
2260 * slower when taking into account all the buffers to be flushed.
2261 *
2262 * Also make sure that only inode buffers with good sizes stay in
2263 * the buffer cache. The kernel moves inodes in buffers of 1 block
2264 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2265 * buffers in the log can be a different size if the log was generated
2266 * by an older kernel using unclustered inode buffers or a newer kernel
2267 * running with a different inode cluster size. Regardless, if the
2268 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2269 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2270 * the buffer out of the buffer cache so that the buffer won't
2271 * overlap with future reads of those inodes.
2272 */
2273 if (XFS_DINODE_MAGIC ==
2274 INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) &&
2275 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2276 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2277 XFS_BUF_STALE(bp);
2278 error = xfs_bwrite(mp, bp);
2279 } else {
2280 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2281 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2282 XFS_BUF_SET_FSPRIVATE(bp, mp);
2283 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2284 xfs_bdwrite(mp, bp);
2285 }
2286
2287 return (error);
2288}
2289
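/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the
 * staleness test above, reduced to a predicate. An inode buffer is only
 * kept in the cache when its size matches the larger of the filesystem
 * block size and our inode cluster size; otherwise it is marked stale and
 * written out synchronously.
 */
static int
keep_inode_buffer(unsigned int buf_bytes, unsigned int blocksize,
		  unsigned int cluster_bytes)
{
	unsigned int want;

	want = blocksize > cluster_bytes ? blocksize : cluster_bytes;
	return buf_bytes == want;	/* 0 => XFS_BUF_STALE + xfs_bwrite */
}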
2290STATIC int
2291xlog_recover_do_inode_trans(
2292 xlog_t *log,
2293 xlog_recover_item_t *item,
2294 int pass)
2295{
2296 xfs_inode_log_format_t *in_f;
2297 xfs_mount_t *mp;
2298 xfs_buf_t *bp;
2299 xfs_imap_t imap;
2300 xfs_dinode_t *dip;
2301 xfs_ino_t ino;
2302 int len;
2303 xfs_caddr_t src;
2304 xfs_caddr_t dest;
2305 int error;
2306 int attr_index;
2307 uint fields;
2308 xfs_dinode_core_t *dicp;
2309
2310 if (pass == XLOG_RECOVER_PASS1) {
2311 return 0;
2312 }
2313
2314 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2315 ino = in_f->ilf_ino;
2316 mp = log->l_mp;
2317 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2318 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2319 imap.im_len = in_f->ilf_len;
2320 imap.im_boffset = in_f->ilf_boffset;
2321 } else {
2322 /*
2323 * It's an old inode format record. We don't know where
2324 * its cluster is located on disk, and we can't allow
2325 * xfs_imap() to figure it out because the inode btrees
2326 * are not ready to be used. Therefore do not pass the
2327 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2328 * us only the single block in which the inode lives
2329 * rather than its cluster, so we must make sure to
2330 * invalidate the buffer when we write it out below.
2331 */
2332 imap.im_blkno = 0;
2333 xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2334 }
2335
2336 /*
2337 * Inode buffers can be freed, look out for it,
2338 * and do not replay the inode.
2339 */
2340 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0))
2341 return 0;
2342
2343 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
2344 XFS_BUF_LOCK);
2345 if (XFS_BUF_ISERROR(bp)) {
2346 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2347 bp, imap.im_blkno);
2348 error = XFS_BUF_GETERROR(bp);
2349 xfs_buf_relse(bp);
2350 return error;
2351 }
2352 error = 0;
2353 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2354 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
2355
2356 /*
2357 * Make sure the place we're flushing out to really looks
2358 * like an inode!
2359 */
2360 if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) {
2361 xfs_buf_relse(bp);
2362 xfs_fs_cmn_err(CE_ALERT, mp,
2363 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2364 dip, bp, ino);
2365 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2366 XFS_ERRLEVEL_LOW, mp);
2367 return XFS_ERROR(EFSCORRUPTED);
2368 }
2369 dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
2370 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2371 xfs_buf_relse(bp);
2372 xfs_fs_cmn_err(CE_ALERT, mp,
2373 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2374 item, ino);
2375 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2376 XFS_ERRLEVEL_LOW, mp);
2377 return XFS_ERROR(EFSCORRUPTED);
2378 }
2379
2380 /* Skip replay when the on disk inode is newer than the log one */
2381 if (dicp->di_flushiter <
2382 INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) {
2383 /*
2384 * Deal with the wrap case, DI_MAX_FLUSH is less
2385 * than smaller numbers
2386 */
2387 if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)
2388 == DI_MAX_FLUSH) &&
2389 (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) {
2390 /* do nothing */
2391 } else {
2392 xfs_buf_relse(bp);
2393 return 0;
2394 }
2395 }
2396 /* Take the opportunity to reset the flush iteration count */
2397 dicp->di_flushiter = 0;
2398
2399 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2400 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2401 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2402 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2403 XFS_ERRLEVEL_LOW, mp, dicp);
2404 xfs_buf_relse(bp);
2405 xfs_fs_cmn_err(CE_ALERT, mp,
2406 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2407 item, dip, bp, ino);
2408 return XFS_ERROR(EFSCORRUPTED);
2409 }
2410 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2411 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2412 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2413 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2415 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp);
2417 xfs_fs_cmn_err(CE_ALERT, mp,
2418 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2419 item, dip, bp, ino);
2420 return XFS_ERROR(EFSCORRUPTED);
2421 }
2422 }
2423 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)) {
2424 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2425 XFS_ERRLEVEL_LOW, mp, dicp);
2426 xfs_buf_relse(bp);
2427 xfs_fs_cmn_err(CE_ALERT, mp,
2428 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2429 item, dip, bp, ino,
2430 dicp->di_nextents + dicp->di_anextents,
2431 dicp->di_nblocks);
2432 return XFS_ERROR(EFSCORRUPTED);
2433 }
2434 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2435 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2436 XFS_ERRLEVEL_LOW, mp, dicp);
2437 xfs_buf_relse(bp);
2438 xfs_fs_cmn_err(CE_ALERT, mp,
2439 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2440 item, dip, bp, ino, dicp->di_forkoff);
2441 return XFS_ERROR(EFSCORRUPTED);
2442 }
2443 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
2444 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2445 XFS_ERRLEVEL_LOW, mp, dicp);
2446 xfs_buf_relse(bp);
2447 xfs_fs_cmn_err(CE_ALERT, mp,
2448 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2449 item->ri_buf[1].i_len, item);
2450 return XFS_ERROR(EFSCORRUPTED);
2451 }
2452
2453 /* The core is in in-core format */
2454 xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
2455 (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1);
2456
2457 /* the rest is in on-disk format */
2458 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
2459 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
2460 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
2461 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t));
2462 }
2463
2464 fields = in_f->ilf_fields;
2465 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2466 case XFS_ILOG_DEV:
2467 INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
2468
2469 break;
2470 case XFS_ILOG_UUID:
2471 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
2472 break;
2473 }
2474
2475 if (in_f->ilf_size == 2)
2476 goto write_inode_buffer;
2477 len = item->ri_buf[2].i_len;
2478 src = item->ri_buf[2].i_addr;
2479 ASSERT(in_f->ilf_size <= 4);
2480 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2481 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2482 (len == in_f->ilf_dsize));
2483
2484 switch (fields & XFS_ILOG_DFORK) {
2485 case XFS_ILOG_DDATA:
2486 case XFS_ILOG_DEXT:
2487 memcpy(&dip->di_u, src, len);
2488 break;
2489
2490 case XFS_ILOG_DBROOT:
2491 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2492 &(dip->di_u.di_bmbt),
2493 XFS_DFORK_DSIZE(dip, mp));
2494 break;
2495
2496 default:
2497 /*
2498 * There are no data fork flags set.
2499 */
2500 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2501 break;
2502 }
2503
2504 /*
2505 * If we logged any attribute data, recover it. There may or
2506 * may not have been any other non-core data logged in this
2507 * transaction.
2508 */
2509 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2510 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2511 attr_index = 3;
2512 } else {
2513 attr_index = 2;
2514 }
2515 len = item->ri_buf[attr_index].i_len;
2516 src = item->ri_buf[attr_index].i_addr;
2517 ASSERT(len == in_f->ilf_asize);
2518
2519 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2520 case XFS_ILOG_ADATA:
2521 case XFS_ILOG_AEXT:
2522 dest = XFS_DFORK_APTR(dip);
2523 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2524 memcpy(dest, src, len);
2525 break;
2526
2527 case XFS_ILOG_ABROOT:
2528 dest = XFS_DFORK_APTR(dip);
2529 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
2530 (xfs_bmdr_block_t*)dest,
2531 XFS_DFORK_ASIZE(dip, mp));
2532 break;
2533
2534 default:
2535 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2536 ASSERT(0);
2537 xfs_buf_relse(bp);
2538 return XFS_ERROR(EIO);
2539 }
2540 }
2541
2542write_inode_buffer:
2543 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2544 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2545 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2546 XFS_BUF_SET_FSPRIVATE(bp, mp);
2547 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2548 xfs_bdwrite(mp, bp);
2549 } else {
2550 XFS_BUF_STALE(bp);
2551 error = xfs_bwrite(mp, bp);
2552 }
2553
2554 return (error);
2555}
2556
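/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the
 * flush-iteration comparison above as a standalone predicate, assuming
 * DI_MAX_FLUSH is the 16-bit ceiling 0xffff.
 */
#define MAX_FLUSH 0xffff	/* stand-in for DI_MAX_FLUSH */

static int
log_copy_is_stale(unsigned int disk_iter, unsigned int log_iter)
{
	if (log_iter >= disk_iter)
		return 0;	/* log copy at least as new: replay it */
	if (disk_iter == MAX_FLUSH && log_iter < (MAX_FLUSH >> 1))
		return 0;	/* counter wrapped: log copy is newer */
	return 1;		/* on-disk inode is newer: skip replay */
}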
2557/*
2558 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2559 * structure, so that we know not to do any dquot item or dquot buffer
2560 * recovery of that type.
2561 */
2562STATIC int
2563xlog_recover_do_quotaoff_trans(
2564 xlog_t *log,
2565 xlog_recover_item_t *item,
2566 int pass)
2567{
2568 xfs_qoff_logformat_t *qoff_f;
2569
2570 if (pass == XLOG_RECOVER_PASS2) {
2571 return (0);
2572 }
2573
2574 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2575 ASSERT(qoff_f);
2576
2577 /*
2578 * The logitem format's flag tells us if this was user quotaoff,
2579 * group quotaoff or both.
2580 */
2581 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2582 log->l_quotaoffs_flag |= XFS_DQ_USER;
2583 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2584 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2585
2586 return (0);
2587}
2588
2589/*
2590 * Recover a dquot record
2591 */
2592STATIC int
2593xlog_recover_do_dquot_trans(
2594 xlog_t *log,
2595 xlog_recover_item_t *item,
2596 int pass)
2597{
2598 xfs_mount_t *mp;
2599 xfs_buf_t *bp;
2600 struct xfs_disk_dquot *ddq, *recddq;
2601 int error;
2602 xfs_dq_logformat_t *dq_f;
2603 uint type;
2604
2605 if (pass == XLOG_RECOVER_PASS1) {
2606 return 0;
2607 }
2608 mp = log->l_mp;
2609
2610 /*
2611 * Filesystems are required to send in quota flags at mount time.
2612 */
2613 if (mp->m_qflags == 0)
2614 return (0);
2615
2616 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2617 ASSERT(recddq);
2618 /*
2619 * This type of quota was turned off, so ignore this record.
2620 */
2621 type = INT_GET(recddq->d_flags, ARCH_CONVERT) &
2622 (XFS_DQ_USER | XFS_DQ_GROUP);
2623 ASSERT(type);
2624 if (log->l_quotaoffs_flag & type)
2625 return (0);
2626
2627 /*
2628 * At this point we know that quota was _not_ turned off.
2629 * Since the mount flags do not indicate otherwise, this
2630 * must mean that quota is on, and the dquot needs to be replayed.
2631 * Remember that we may not have fully recovered the superblock yet,
2632 * so we can't do the usual trick of looking at the SB quota bits.
2633 *
2634 * The other possibility, of course, is that the quota subsystem was
2635 * removed since the last mount - ENOSYS.
2636 */
2637 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2638 ASSERT(dq_f);
2639 if ((error = xfs_qm_dqcheck(recddq,
2640 dq_f->qlf_id,
2641 0, XFS_QMOPT_DOWARN,
2642 "xlog_recover_do_dquot_trans (log copy)"))) {
2643 return XFS_ERROR(EIO);
2644 }
2645 ASSERT(dq_f->qlf_len == 1);
2646
2647 error = xfs_read_buf(mp, mp->m_ddev_targp,
2648 dq_f->qlf_blkno,
2649 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2650 0, &bp);
2651 if (error) {
2652 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2653 bp, dq_f->qlf_blkno);
2654 return error;
2655 }
2656 ASSERT(bp);
2657 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2658
2659 /*
2660 * At least the magic num portion should be on disk because this
2661 * was among a chunk of dquots created earlier, and we did some
2662 * minimal initialization then.
2663 */
2664 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2665 "xlog_recover_do_dquot_trans")) {
2666 xfs_buf_relse(bp);
2667 return XFS_ERROR(EIO);
2668 }
2669
2670 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2671
2672 ASSERT(dq_f->qlf_size == 2);
2673 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
2674 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
2675 XFS_BUF_SET_FSPRIVATE(bp, mp);
2676 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2677 xfs_bdwrite(mp, bp);
2678
2679 return (0);
2680}
2681
2682/*
2683 * This routine is called to create an in-core extent free intent
2684 * item from the efi format structure which was logged on disk.
2685 * It allocates an in-core efi, copies the extents from the format
2686 * structure into it, and adds the efi to the AIL with the given
2687 * LSN.
2688 */
2689STATIC void
2690xlog_recover_do_efi_trans(
2691 xlog_t *log,
2692 xlog_recover_item_t *item,
2693 xfs_lsn_t lsn,
2694 int pass)
2695{
2696 xfs_mount_t *mp;
2697 xfs_efi_log_item_t *efip;
2698 xfs_efi_log_format_t *efi_formatp;
2699 SPLDECL(s);
2700
2701 if (pass == XLOG_RECOVER_PASS1) {
2702 return;
2703 }
2704
2705 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2706 ASSERT(item->ri_buf[0].i_len ==
2707 (sizeof(xfs_efi_log_format_t) +
2708 ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
2709
2710 mp = log->l_mp;
2711 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2712 memcpy((char *)&(efip->efi_format), (char *)efi_formatp,
2713 sizeof(xfs_efi_log_format_t) +
2714 ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
2715 efip->efi_next_extent = efi_formatp->efi_nextents;
2716 efip->efi_flags |= XFS_EFI_COMMITTED;
2717
2718 AIL_LOCK(mp,s);
2719 /*
2720 * xfs_trans_update_ail() drops the AIL lock.
2721 */
2722 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
2723}
2724
2725
2726/*
2727 * This routine is called when an efd format structure is found in
2728 * a committed transaction in the log. Its purpose is to cancel
2729 * the corresponding efi if it was still in the log. To do this
2730 * it searches the AIL for the efi with an id equal to that in the
2731 * efd format structure. If we find it, we remove the efi from the
2732 * AIL and free it.
2733 */
2734STATIC void
2735xlog_recover_do_efd_trans(
2736 xlog_t *log,
2737 xlog_recover_item_t *item,
2738 int pass)
2739{
2740 xfs_mount_t *mp;
2741 xfs_efd_log_format_t *efd_formatp;
2742 xfs_efi_log_item_t *efip = NULL;
2743 xfs_log_item_t *lip;
2744 int gen;
2745 int nexts;
2746 __uint64_t efi_id;
2747 SPLDECL(s);
2748
2749 if (pass == XLOG_RECOVER_PASS1) {
2750 return;
2751 }
2752
2753 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2754 ASSERT(item->ri_buf[0].i_len ==
2755 (sizeof(xfs_efd_log_format_t) +
2756 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
2757 efi_id = efd_formatp->efd_efi_id;
2758
2759 /*
2760 * Search for the efi with the id in the efd format structure
2761 * in the AIL.
2762 */
2763 mp = log->l_mp;
2764 AIL_LOCK(mp,s);
2765 lip = xfs_trans_first_ail(mp, &gen);
2766 while (lip != NULL) {
2767 if (lip->li_type == XFS_LI_EFI) {
2768 efip = (xfs_efi_log_item_t *)lip;
2769 if (efip->efi_format.efi_id == efi_id) {
2770 /*
2771 * xfs_trans_delete_ail() drops the
2772 * AIL lock.
2773 */
2774 xfs_trans_delete_ail(mp, lip, s);
2775 break;
2776 }
2777 }
2778 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
2779 }
2780 if (lip == NULL) {
2781 AIL_UNLOCK(mp, s);
2782 }
2783
2784 /*
2785 * If we found it, then free it up. If it wasn't there, it
2786 * must have been overwritten in the log. Oh well.
2787 */
2788 if (lip != NULL) {
2789 nexts = efip->efi_format.efi_nextents;
2790 if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
2791 kmem_free(lip, sizeof(xfs_efi_log_item_t) +
2792 ((nexts - 1) * sizeof(xfs_extent_t)));
2793 } else {
2794 kmem_zone_free(xfs_efi_zone, efip);
2795 }
2796 }
2797}
2798
2799/*
2800 * Perform the transaction
2801 *
2802 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2803 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2804 */
2805STATIC int
2806xlog_recover_do_trans(
2807 xlog_t *log,
2808 xlog_recover_t *trans,
2809 int pass)
2810{
2811 int error = 0;
2812 xlog_recover_item_t *item, *first_item;
2813
2814 if ((error = xlog_recover_reorder_trans(log, trans)))
2815 return error;
2816 first_item = item = trans->r_itemq;
2817 do {
2818 /*
2819 * we don't need to worry about the block number being
2820 * truncated in > 1 TB buffers because in user-land,
2821 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2822 * the blknos will get through the user-mode buffer
2823 * cache properly. The only bad case is o32 kernels
2824 * where xfs_daddr_t is 32-bits but mount will warn us
2825 * off a > 1 TB filesystem before we get here.
2826 */
2827 if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
2828 (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
2829 (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
2830 if ((error = xlog_recover_do_buffer_trans(log, item,
2831 pass)))
2832 break;
2833 } else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
2834 (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
2835 (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
2836 if ((error = xlog_recover_do_inode_trans(log, item,
2837 pass)))
2838 break;
2839 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2840 xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2841 pass);
2842 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2843 xlog_recover_do_efd_trans(log, item, pass);
2844 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2845 if ((error = xlog_recover_do_dquot_trans(log, item,
2846 pass)))
2847 break;
2848 } else if (ITEM_TYPE(item) == XFS_LI_QUOTAOFF) {
2849 if ((error = xlog_recover_do_quotaoff_trans(log, item,
2850 pass)))
2851 break;
2852 } else {
2853 xlog_warn("XFS: xlog_recover_do_trans: unknown item type");
2854 ASSERT(0);
2855 error = XFS_ERROR(EIO);
2856 break;
2857 }
2858 item = item->ri_next;
2859 } while (first_item != item);
2860
2861 return error;
2862}
2863
2864/*
2865 * Free up any resources allocated by the transaction
2866 *
2867 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2868 */
2869STATIC void
2870xlog_recover_free_trans(
2871 xlog_recover_t *trans)
2872{
2873 xlog_recover_item_t *first_item, *item, *free_item;
2874 int i;
2875
2876 item = first_item = trans->r_itemq;
2877 do {
2878 free_item = item;
2879 item = item->ri_next;
2880 /* Free the regions in the item. */
2881 for (i = 0; i < free_item->ri_cnt; i++) {
2882 kmem_free(free_item->ri_buf[i].i_addr,
2883 free_item->ri_buf[i].i_len);
2884 }
2885 /* Free the item itself */
2886 kmem_free(free_item->ri_buf,
2887 (free_item->ri_total * sizeof(xfs_log_iovec_t)));
2888 kmem_free(free_item, sizeof(xlog_recover_item_t));
2889 } while (first_item != item);
2890 /* Free the transaction recover structure */
2891 kmem_free(trans, sizeof(xlog_recover_t));
2892}
2893
2894STATIC int
2895xlog_recover_commit_trans(
2896 xlog_t *log,
2897 xlog_recover_t **q,
2898 xlog_recover_t *trans,
2899 int pass)
2900{
2901 int error;
2902
2903 if ((error = xlog_recover_unlink_tid(q, trans)))
2904 return error;
2905 if ((error = xlog_recover_do_trans(log, trans, pass)))
2906 return error;
2907 xlog_recover_free_trans(trans); /* no error */
2908 return 0;
2909}
2910
2911STATIC int
2912xlog_recover_unmount_trans(
2913 xlog_recover_t *trans)
2914{
2915 /* Do nothing now */
2916 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2917 return 0;
2918}
2919
2920/*
2921 * There are two valid states of the r_state field. 0 indicates that the
2922 * transaction structure is in a normal state. We have either seen the
2923 * start of the transaction or the last operation we added was not a partial
2924 * operation. If the last operation we added to the transaction was a
2925 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2926 *
2927 * NOTE: skip LRs with 0 data length.
2928 */
2929STATIC int
2930xlog_recover_process_data(
2931 xlog_t *log,
2932 xlog_recover_t *rhash[],
2933 xlog_rec_header_t *rhead,
2934 xfs_caddr_t dp,
2935 int pass)
2936{
2937 xfs_caddr_t lp;
2938 int num_logops;
2939 xlog_op_header_t *ohead;
2940 xlog_recover_t *trans;
2941 xlog_tid_t tid;
2942 int error;
2943 unsigned long hash;
2944 uint flags;
2945
2946 lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT);
2947 num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
2948
2949 /* check the log format matches our own - else we can't recover */
2950 if (xlog_header_check_recover(log->l_mp, rhead))
2951 return (XFS_ERROR(EIO));
2952
2953 while ((dp < lp) && num_logops) {
2954 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2955 ohead = (xlog_op_header_t *)dp;
2956 dp += sizeof(xlog_op_header_t);
2957 if (ohead->oh_clientid != XFS_TRANSACTION &&
2958 ohead->oh_clientid != XFS_LOG) {
2959 xlog_warn(
2960 "XFS: xlog_recover_process_data: bad clientid");
2961 ASSERT(0);
2962 return (XFS_ERROR(EIO));
2963 }
2964 tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
2965 hash = XLOG_RHASH(tid);
2966 trans = xlog_recover_find_tid(rhash[hash], tid);
2967 if (trans == NULL) { /* not found; add new tid */
2968 if (ohead->oh_flags & XLOG_START_TRANS)
2969 xlog_recover_new_tid(&rhash[hash], tid,
2970 INT_GET(rhead->h_lsn, ARCH_CONVERT));
2971 } else {
2972 ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
2973 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2974 if (flags & XLOG_WAS_CONT_TRANS)
2975 flags &= ~XLOG_CONTINUE_TRANS;
2976 switch (flags) {
2977 case XLOG_COMMIT_TRANS:
2978 error = xlog_recover_commit_trans(log,
2979 &rhash[hash], trans, pass);
2980 break;
2981 case XLOG_UNMOUNT_TRANS:
2982 error = xlog_recover_unmount_trans(trans);
2983 break;
2984 case XLOG_WAS_CONT_TRANS:
2985 error = xlog_recover_add_to_cont_trans(trans,
2986 dp, INT_GET(ohead->oh_len,
2987 ARCH_CONVERT));
2988 break;
2989 case XLOG_START_TRANS:
2990 xlog_warn(
2991 "XFS: xlog_recover_process_data: bad transaction");
2992 ASSERT(0);
2993 error = XFS_ERROR(EIO);
2994 break;
2995 case 0:
2996 case XLOG_CONTINUE_TRANS:
2997 error = xlog_recover_add_to_trans(trans,
2998 dp, INT_GET(ohead->oh_len,
2999 ARCH_CONVERT));
3000 break;
3001 default:
3002 xlog_warn(
3003 "XFS: xlog_recover_process_data: bad flag");
3004 ASSERT(0);
3005 error = XFS_ERROR(EIO);
3006 break;
3007 }
3008 if (error)
3009 return error;
3010 }
3011 dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
3012 num_logops--;
3013 }
3014 return 0;
3015}
3016
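/*
 * Illustrative sketch (hypothetical helper, not from the XFS source): the
 * flag juggling above isolated into one place. XLOG_END_TRANS is always
 * masked off, and a WAS_CONT marker suppresses CONTINUE so a continuation
 * body is dispatched to the add_to_cont path exactly once.
 */
static unsigned int
normalize_op_flags(unsigned int oh_flags)
{
	unsigned int flags = oh_flags & ~XLOG_END_TRANS;

	if (flags & XLOG_WAS_CONT_TRANS)
		flags &= ~XLOG_CONTINUE_TRANS;
	return flags;
}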
3017/*
3018 * Process an extent free intent item that was recovered from
3019 * the log. We need to free the extents that it describes.
3020 */
3021STATIC void
3022xlog_recover_process_efi(
3023 xfs_mount_t *mp,
3024 xfs_efi_log_item_t *efip)
3025{
3026 xfs_efd_log_item_t *efdp;
3027 xfs_trans_t *tp;
3028 int i;
3029 xfs_extent_t *extp;
3030 xfs_fsblock_t startblock_fsb;
3031
3032 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
3033
3034 /*
3035 * First check the validity of the extents described by the
3036 * EFI. If any are bad, then assume that all are bad and
3037 * just toss the EFI.
3038 */
3039 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3040 extp = &(efip->efi_format.efi_extents[i]);
3041 startblock_fsb = XFS_BB_TO_FSB(mp,
3042 XFS_FSB_TO_DADDR(mp, extp->ext_start));
3043 if ((startblock_fsb == 0) ||
3044 (extp->ext_len == 0) ||
3045 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
3046 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
3047 /*
3048 * This will pull the EFI from the AIL and
3049 * free the memory associated with it.
3050 */
3051 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3052 return;
3053 }
3054 }
3055
3056 tp = xfs_trans_alloc(mp, 0);
3057 xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3058 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3059
3060 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3061 extp = &(efip->efi_format.efi_extents[i]);
3062 xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3063 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3064 extp->ext_len);
3065 }
3066
3067 efip->efi_flags |= XFS_EFI_RECOVERED;
3068 xfs_trans_commit(tp, 0, NULL);
3069}
3070
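/*
 * Illustrative sketch (hypothetical, not part of the XFS source): the EFI
 * extent sanity test above as a predicate. An extent is rejected when it
 * starts at block zero, has zero length, starts past the end of the data
 * device, or is at least as long as an allocation group. The fsb start is
 * taken as already computed by the round-trip conversion above.
 */
static int
efi_extent_is_bad(unsigned long long start_fsb, unsigned int len,
		  unsigned long long dblocks, unsigned int agblocks)
{
	return start_fsb == 0 || len == 0 ||
	       start_fsb >= dblocks || len >= agblocks;
}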
3071/*
3072 * Verify that once we've encountered something other than an EFI
3073 * in the AIL, there are no more EFIs in the AIL.
3074 */
3075#if defined(DEBUG)
3076STATIC void
3077xlog_recover_check_ail(
3078 xfs_mount_t *mp,
3079 xfs_log_item_t *lip,
3080 int gen)
3081{
3082 int orig_gen = gen;
3083
3084 do {
3085 ASSERT(lip->li_type != XFS_LI_EFI);
3086 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3087 /*
3088 * The check will be bogus if we restart from the
3089 * beginning of the AIL, so ASSERT that we don't.
3090 * We never should since we're holding the AIL lock
3091 * the entire time.
3092 */
3093 ASSERT(gen == orig_gen);
3094 } while (lip != NULL);
3095}
3096#endif /* DEBUG */
3097
3098/*
3099 * When this is called, all of the EFIs which did not have
3100 * corresponding EFDs should be in the AIL. What we do now
3101 * is free the extents associated with each one.
3102 *
3103 * Since we process the EFIs in normal transactions, they
3104 * will be removed at some point after the commit. This prevents
3105 * us from just walking down the list processing each one.
3106 * We'll use a flag in the EFI to skip those that we've already
3107 * processed and use the AIL iteration mechanism's generation
3108 * count to try to speed this up at least a bit.
3109 *
3110 * When we start, we know that the EFIs are the only things in
3111 * the AIL. As we process them, however, other items are added
3112 * to the AIL. Since everything added to the AIL must come after
3113 * everything already in the AIL, we stop processing as soon as
3114 * we see something other than an EFI in the AIL.
3115 */
3116STATIC void
3117xlog_recover_process_efis(
3118 xlog_t *log)
3119{
3120 xfs_log_item_t *lip;
3121 xfs_efi_log_item_t *efip;
3122 int gen;
3123 xfs_mount_t *mp;
3124 SPLDECL(s);
3125
3126 mp = log->l_mp;
3127 AIL_LOCK(mp,s);
3128
3129 lip = xfs_trans_first_ail(mp, &gen);
3130 while (lip != NULL) {
3131 /*
3132 * We're done when we see something other than an EFI.
3133 */
3134 if (lip->li_type != XFS_LI_EFI) {
3135 xlog_recover_check_ail(mp, lip, gen);
3136 break;
3137 }
3138
3139 /*
3140 * Skip EFIs that we've already processed.
3141 */
3142 efip = (xfs_efi_log_item_t *)lip;
3143 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3144 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3145 continue;
3146 }
3147
3148 AIL_UNLOCK(mp, s);
3149 xlog_recover_process_efi(mp, efip);
3150 AIL_LOCK(mp,s);
3151 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3152 }
3153 AIL_UNLOCK(mp, s);
3154}
3155
3156/*
3157 * This routine performs a transaction to null out a bad inode pointer
3158 * in an agi unlinked inode hash bucket.
3159 */
3160STATIC void
3161xlog_recover_clear_agi_bucket(
3162 xfs_mount_t *mp,
3163 xfs_agnumber_t agno,
3164 int bucket)
3165{
3166 xfs_trans_t *tp;
3167 xfs_agi_t *agi;
3168 xfs_buf_t *agibp;
3169 int offset;
3170 int error;
3171
3172 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3173 xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3174
3175 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3176 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3177 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3178 if (error) {
3179 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3180 return;
3181 }
3182
3183 agi = XFS_BUF_TO_AGI(agibp);
3184 if (INT_GET(agi->agi_magicnum, ARCH_CONVERT) != XFS_AGI_MAGIC) {
3185 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3186 return;
3187 }
3188 ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
3189
3190 INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
3191 offset = offsetof(xfs_agi_t, agi_unlinked) +
3192 (sizeof(xfs_agino_t) * bucket);
3193 xfs_trans_log_buf(tp, agibp, offset,
3194 (offset + sizeof(xfs_agino_t) - 1));
3195
3196 (void) xfs_trans_commit(tp, 0, NULL);
3197}
3198
3199/*
3200 * xlog_recover_process_iunlinks
3201 *
3202 * This is called during recovery to process any inodes which
3203 * we unlinked but had not freed when the system crashed. These
3204 * inodes will be on the lists in the AGI blocks. What we do
3205 * here is scan all the AGIs and fully truncate and free any
3206 * inodes found on the lists. Each inode is removed from the
3207 * lists when it has been fully truncated and is freed. The
3208 * freeing of the inode and its removal from the list must be
3209 * atomic.
3210 */
3211void
3212xlog_recover_process_iunlinks(
3213 xlog_t *log)
3214{
3215 xfs_mount_t *mp;
3216 xfs_agnumber_t agno;
3217 xfs_agi_t *agi;
3218 xfs_buf_t *agibp;
3219 xfs_buf_t *ibp;
3220 xfs_dinode_t *dip;
3221 xfs_inode_t *ip;
3222 xfs_agino_t agino;
3223 xfs_ino_t ino;
3224 int bucket;
3225 int error;
3226 uint mp_dmevmask;
3227
3228 mp = log->l_mp;
3229
3230 /*
3231 * Prevent any DMAPI event from being sent while in this function.
3232 */
3233 mp_dmevmask = mp->m_dmevmask;
3234 mp->m_dmevmask = 0;
3235
3236 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3237 /*
3238 * Find the agi for this ag.
3239 */
3240 agibp = xfs_buf_read(mp->m_ddev_targp,
3241 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3242 XFS_FSS_TO_BB(mp, 1), 0);
3243 if (XFS_BUF_ISERROR(agibp)) {
3244 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
3245 log->l_mp, agibp,
3246 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
3247 }
3248 agi = XFS_BUF_TO_AGI(agibp);
3249 ASSERT(XFS_AGI_MAGIC ==
3250 INT_GET(agi->agi_magicnum, ARCH_CONVERT));
3251
3252 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3253
3254 agino = INT_GET(agi->agi_unlinked[bucket], ARCH_CONVERT);
3255 while (agino != NULLAGINO) {
3256
3257 /*
3258 * Release the agi buffer so that it can
3259 * be acquired in the normal course of the
3260 * transaction to truncate and free the inode.
3261 */
3262 xfs_buf_relse(agibp);
3263
3264 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3265 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3266 ASSERT(error || (ip != NULL));
3267
3268 if (!error) {
3269 /*
3270 * Get the on disk inode to find the
3271 * next inode in the bucket.
3272 */
3273 error = xfs_itobp(mp, NULL, ip, &dip,
3274 &ibp, 0);
3275 ASSERT(error || (dip != NULL));
3276 }
3277
3278 if (!error) {
3279 ASSERT(ip->i_d.di_nlink == 0);
3280
3281 /* setup for the next pass */
3282 agino = INT_GET(dip->di_next_unlinked,
3283 ARCH_CONVERT);
3284 xfs_buf_relse(ibp);
3285 /*
3286 * Prevent any DMAPI event from
3287 * being sent when the
3288 * reference on the inode is
3289 * dropped.
3290 */
3291 ip->i_d.di_dmevmask = 0;
3292
3293 /*
3294 * If this is a new inode, handle
3295 * it specially. Otherwise,
3296 * just drop our reference to the
3297 * inode. If there are no
3298 * other references, this will
3299 * send the inode to
3300 * xfs_inactive() which will
3301 * truncate the file and free
3302 * the inode.
3303 */
3304 if (ip->i_d.di_mode == 0)
3305 xfs_iput_new(ip, 0);
3306 else
3307 VN_RELE(XFS_ITOV(ip));
3308 } else {
3309 /*
3310 * We can't read in the inode
3311 * this bucket points to, or
3312 * this inode is messed up. Just
3313 * ditch this bucket of inodes. We
3314 * will lose some inodes and space,
3315 * but at least we won't hang. Call
3316 * xlog_recover_clear_agi_bucket()
3317 * to perform a transaction to clear
3318 * the inode pointer in the bucket.
3319 */
3320 xlog_recover_clear_agi_bucket(mp, agno,
3321 bucket);
3322
3323 agino = NULLAGINO;
3324 }
3325
3326 /*
 3327 * Reacquire the AGI buffer and continue around
3328 * the loop.
3329 */
3330 agibp = xfs_buf_read(mp->m_ddev_targp,
3331 XFS_AG_DADDR(mp, agno,
3332 XFS_AGI_DADDR(mp)),
3333 XFS_FSS_TO_BB(mp, 1), 0);
3334 if (XFS_BUF_ISERROR(agibp)) {
3335 xfs_ioerror_alert(
3336 "xlog_recover_process_iunlinks(#2)",
3337 log->l_mp, agibp,
3338 XFS_AG_DADDR(mp, agno,
3339 XFS_AGI_DADDR(mp)));
3340 }
3341 agi = XFS_BUF_TO_AGI(agibp);
3342 ASSERT(XFS_AGI_MAGIC == INT_GET(
3343 agi->agi_magicnum, ARCH_CONVERT));
3344 }
3345 }
3346
3347 /*
3348 * Release the buffer for the current agi so we can
3349 * go on to the next one.
3350 */
3351 xfs_buf_relse(agibp);
3352 }
3353
3354 mp->m_dmevmask = mp_dmevmask;
3355}
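/*
 * Editor's sketch: stripped of the buffer juggling and DMAPI masking,
 * each bucket is a singly linked list threaded through the on-disk
 * di_next_unlinked fields.  A minimal model (hypothetical names; the
 * lookup callback stands in for the xfs_iget() plus xfs_itobp() steps
 * above):
 */
#include <stdint.h>

#define EXAMPLE_NULLAGINO	((uint32_t)-1)

struct example_dinode {
	uint32_t	next_unlinked;	/* agino of next chain entry */
};

static void
example_walk_bucket(uint32_t head,
		    struct example_dinode *(*lookup)(uint32_t agino))
{
	uint32_t	agino = head;

	while (agino != EXAMPLE_NULLAGINO) {
		struct example_dinode	*dip = lookup(agino);

		if (dip == NULL)
			break;	/* bad chain: real code clears the bucket */
		/* the truncate/free of the inode would happen here */
		agino = dip->next_unlinked;	/* set up for the next pass */
	}
}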
3356
3357
3358#ifdef DEBUG
3359STATIC void
3360xlog_pack_data_checksum(
3361 xlog_t *log,
3362 xlog_in_core_t *iclog,
3363 int size)
3364{
3365 int i;
3366 uint *up;
3367 uint chksum = 0;
3368
3369 up = (uint *)iclog->ic_datap;
3370 /* divide length by 4 to get # words */
3371 for (i = 0; i < (size >> 2); i++) {
3372 chksum ^= INT_GET(*up, ARCH_CONVERT);
3373 up++;
3374 }
3375 INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
3376}
3377#else
3378#define xlog_pack_data_checksum(log, iclog, size)
3379#endif
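/*
 * Editor's sketch: the checksum above is a plain XOR over 32-bit words.
 * A self-contained version for illustration (the INT_GET endian
 * conversion is omitted here, so this matches the kernel code only on a
 * host whose byte order matches the log):
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t
example_xor_checksum(const uint32_t *words, size_t len_bytes)
{
	uint32_t	chksum = 0;
	size_t		i;

	/* divide length by 4 to get # words, exactly as above */
	for (i = 0; i < (len_bytes >> 2); i++)
		chksum ^= words[i];
	return chksum;
}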
3380
3381/*
3382 * Stamp cycle number in every block
3383 */
3384void
3385xlog_pack_data(
3386 xlog_t *log,
3387 xlog_in_core_t *iclog,
3388 int roundoff)
3389{
3390 int i, j, k;
3391 int size = iclog->ic_offset + roundoff;
3392 uint cycle_lsn;
3393 xfs_caddr_t dp;
3394 xlog_in_core_2_t *xhdr;
3395
3396 xlog_pack_data_checksum(log, iclog, size);
3397
3398 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3399
3400 dp = iclog->ic_datap;
3401 for (i = 0; i < BTOBB(size) &&
3402 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3403 iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
3404 *(uint *)dp = cycle_lsn;
3405 dp += BBSIZE;
3406 }
3407
3408 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3409 xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
3410 for ( ; i < BTOBB(size); i++) {
3411 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3412 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3413 xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
3414 *(uint *)dp = cycle_lsn;
3415 dp += BBSIZE;
3416 }
3417
3418 for (i = 1; i < log->l_iclog_heads; i++) {
3419 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3420 }
3421 }
3422}
3423
3424#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3425STATIC void
3426xlog_unpack_data_checksum(
3427 xlog_rec_header_t *rhead,
3428 xfs_caddr_t dp,
3429 xlog_t *log)
3430{
3431 uint *up = (uint *)dp;
3432 uint chksum = 0;
3433 int i;
3434
3435 /* divide length by 4 to get # words */
3436 for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
3437 chksum ^= INT_GET(*up, ARCH_CONVERT);
3438 up++;
3439 }
3440 if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
3441 if (rhead->h_chksum ||
3442 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3443 cmn_err(CE_DEBUG,
3444 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
3445 INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
3446 cmn_err(CE_DEBUG,
3447"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3448 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3449 cmn_err(CE_DEBUG,
3450 "XFS: LogR this is a LogV2 filesystem");
3451 }
3452 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3453 }
3454 }
3455}
3456#else
3457#define xlog_unpack_data_checksum(rhead, dp, log)
3458#endif
3459
3460STATIC void
3461xlog_unpack_data(
3462 xlog_rec_header_t *rhead,
3463 xfs_caddr_t dp,
3464 xlog_t *log)
3465{
3466 int i, j, k;
3467 xlog_in_core_2_t *xhdr;
3468
3469 for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
3470 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3471 *(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
3472 dp += BBSIZE;
3473 }
3474
3475 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3476 xhdr = (xlog_in_core_2_t *)rhead;
3477 for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
3478 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3479 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3480 *(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3481 dp += BBSIZE;
3482 }
3483 }
3484
3485 xlog_unpack_data_checksum(rhead, dp, log);
3486}
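/*
 * Editor's sketch: pack and unpack are exact inverses.  Stamping the
 * cycle number into the first word of every 512-byte basic block lets
 * recovery detect a torn write: a block whose first word carries the
 * wrong cycle was never completely written.  A hedged round-trip model
 * that drops the LogV2 multi-header split and endian handling
 * (hypothetical names):
 */
#include <stdint.h>
#include <string.h>

#define EXAMPLE_BBSIZE	512

static void
example_pack(char *buf, int nblocks, uint32_t cycle, uint32_t *saved)
{
	int	i;

	for (i = 0; i < nblocks; i++) {
		memcpy(&saved[i], buf + i * EXAMPLE_BBSIZE, sizeof(saved[i]));
		memcpy(buf + i * EXAMPLE_BBSIZE, &cycle, sizeof(cycle));
	}
}

static void
example_unpack(char *buf, int nblocks, const uint32_t *saved)
{
	int	i;

	for (i = 0; i < nblocks; i++)
		memcpy(buf + i * EXAMPLE_BBSIZE, &saved[i], sizeof(saved[i]));
}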
3487
3488STATIC int
3489xlog_valid_rec_header(
3490 xlog_t *log,
3491 xlog_rec_header_t *rhead,
3492 xfs_daddr_t blkno)
3493{
3494 int hlen;
3495
3496 if (unlikely(
3497 (INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
3498 XLOG_HEADER_MAGIC_NUM))) {
3499 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3500 XFS_ERRLEVEL_LOW, log->l_mp);
3501 return XFS_ERROR(EFSCORRUPTED);
3502 }
3503 if (unlikely(
3504 (!rhead->h_version ||
3505 (INT_GET(rhead->h_version, ARCH_CONVERT) &
3506 (~XLOG_VERSION_OKBITS)) != 0))) {
3507 xlog_warn("XFS: %s: unrecognised log version (%d).",
3508 __FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT));
3509 return XFS_ERROR(EIO);
3510 }
3511
3512 /* LR body must have data or it wouldn't have been written */
3513 hlen = INT_GET(rhead->h_len, ARCH_CONVERT);
3514 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3515 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3516 XFS_ERRLEVEL_LOW, log->l_mp);
3517 return XFS_ERROR(EFSCORRUPTED);
3518 }
3519 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3520 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3521 XFS_ERRLEVEL_LOW, log->l_mp);
3522 return XFS_ERROR(EFSCORRUPTED);
3523 }
3524 return 0;
3525}
3526
3527/*
3528 * Read the log from tail to head and process the log records found.
3529 * Handle the two cases where the tail and head are in the same cycle
3530 * and where the active portion of the log wraps around the end of
3531 * the physical log separately. The pass parameter is passed through
3532 * to the routines called to process the data and is not looked at
3533 * here.
3534 */
3535STATIC int
3536xlog_do_recovery_pass(
3537 xlog_t *log,
3538 xfs_daddr_t head_blk,
3539 xfs_daddr_t tail_blk,
3540 int pass)
3541{
3542 xlog_rec_header_t *rhead;
3543 xfs_daddr_t blk_no;
3544 xfs_caddr_t bufaddr, offset;
3545 xfs_buf_t *hbp, *dbp;
3546 int error = 0, h_size;
3547 int bblks, split_bblks;
3548 int hblks, split_hblks, wrapped_hblks;
3549 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
3550
3551 ASSERT(head_blk != tail_blk);
3552
3553 /*
3554 * Read the header of the tail block and get the iclog buffer size from
3555 * h_size. Use this to tell how many sectors make up the log header.
3556 */
3557 if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
3558 /*
3559 * When using variable length iclogs, read first sector of
3560 * iclog header and extract the header size from it. Get a
3561 * new hbp that is the correct size.
3562 */
3563 hbp = xlog_get_bp(log, 1);
3564 if (!hbp)
3565 return ENOMEM;
3566 if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3567 goto bread_err1;
3568 offset = xlog_align(log, tail_blk, 1, hbp);
3569 rhead = (xlog_rec_header_t *)offset;
3570 error = xlog_valid_rec_header(log, rhead, tail_blk);
3571 if (error)
3572 goto bread_err1;
3573 h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
3574 if ((INT_GET(rhead->h_version, ARCH_CONVERT)
3575 & XLOG_VERSION_2) &&
3576 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3577 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3578 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3579 hblks++;
3580 xlog_put_bp(hbp);
3581 hbp = xlog_get_bp(log, hblks);
3582 } else {
3583 hblks = 1;
3584 }
3585 } else {
3586 ASSERT(log->l_sectbb_log == 0);
3587 hblks = 1;
3588 hbp = xlog_get_bp(log, 1);
3589 h_size = XLOG_BIG_RECORD_BSIZE;
3590 }
3591
3592 if (!hbp)
3593 return ENOMEM;
3594 dbp = xlog_get_bp(log, BTOBB(h_size));
3595 if (!dbp) {
3596 xlog_put_bp(hbp);
3597 return ENOMEM;
3598 }
3599
3600 memset(rhash, 0, sizeof(rhash));
3601 if (tail_blk <= head_blk) {
3602 for (blk_no = tail_blk; blk_no < head_blk; ) {
3603 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3604 goto bread_err2;
3605 offset = xlog_align(log, blk_no, hblks, hbp);
3606 rhead = (xlog_rec_header_t *)offset;
3607 error = xlog_valid_rec_header(log, rhead, blk_no);
3608 if (error)
3609 goto bread_err2;
3610
3611 /* blocks in data section */
3612 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3613 error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3614 if (error)
3615 goto bread_err2;
3616 offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3617 xlog_unpack_data(rhead, offset, log);
3618 if ((error = xlog_recover_process_data(log,
3619 rhash, rhead, offset, pass)))
3620 goto bread_err2;
3621 blk_no += bblks + hblks;
3622 }
3623 } else {
3624 /*
3625 * Perform recovery around the end of the physical log.
3626 * When the head is not on the same cycle number as the tail,
3627 * we can't do a sequential recovery as above.
3628 */
3629 blk_no = tail_blk;
3630 while (blk_no < log->l_logBBsize) {
3631 /*
3632 * Check for header wrapping around physical end-of-log
3633 */
3634 offset = NULL;
3635 split_hblks = 0;
3636 wrapped_hblks = 0;
3637 if (blk_no + hblks <= log->l_logBBsize) {
3638 /* Read header in one read */
3639 error = xlog_bread(log, blk_no, hblks, hbp);
3640 if (error)
3641 goto bread_err2;
3642 offset = xlog_align(log, blk_no, hblks, hbp);
3643 } else {
3644 /* This LR is split across physical log end */
3645 if (blk_no != log->l_logBBsize) {
3646 /* some data before physical log end */
3647 ASSERT(blk_no <= INT_MAX);
3648 split_hblks = log->l_logBBsize - (int)blk_no;
3649 ASSERT(split_hblks > 0);
3650 if ((error = xlog_bread(log, blk_no,
3651 split_hblks, hbp)))
3652 goto bread_err2;
3653 offset = xlog_align(log, blk_no,
3654 split_hblks, hbp);
3655 }
3656 /*
3657 * Note: this black magic still works with
3658 * large sector sizes (non-512) only because:
3659 * - we increased the buffer size originally
3660 * by 1 sector giving us enough extra space
3661 * for the second read;
3662 * - the log start is guaranteed to be sector
3663 * aligned;
3664 * - we read the log end (LR header start)
3665 * _first_, then the log start (LR header end)
3666 * - order is important.
3667 */
3668 bufaddr = XFS_BUF_PTR(hbp);
3669 XFS_BUF_SET_PTR(hbp,
3670 bufaddr + BBTOB(split_hblks),
3671 BBTOB(hblks - split_hblks));
3672 wrapped_hblks = hblks - split_hblks;
3673 error = xlog_bread(log, 0, wrapped_hblks, hbp);
3674 if (error)
3675 goto bread_err2;
3676 XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
3677 if (!offset)
3678 offset = xlog_align(log, 0,
3679 wrapped_hblks, hbp);
3680 }
3681 rhead = (xlog_rec_header_t *)offset;
3682 error = xlog_valid_rec_header(log, rhead,
3683 split_hblks ? blk_no : 0);
3684 if (error)
3685 goto bread_err2;
3686
3687 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3688 blk_no += hblks;
3689
3690 /* Read in data for log record */
3691 if (blk_no + bblks <= log->l_logBBsize) {
3692 error = xlog_bread(log, blk_no, bblks, dbp);
3693 if (error)
3694 goto bread_err2;
3695 offset = xlog_align(log, blk_no, bblks, dbp);
3696 } else {
3697 /* This log record is split across the
3698 * physical end of log */
3699 offset = NULL;
3700 split_bblks = 0;
3701 if (blk_no != log->l_logBBsize) {
3702 /* some data is before the physical
3703 * end of log */
3704 ASSERT(!wrapped_hblks);
3705 ASSERT(blk_no <= INT_MAX);
3706 split_bblks =
3707 log->l_logBBsize - (int)blk_no;
3708 ASSERT(split_bblks > 0);
3709 if ((error = xlog_bread(log, blk_no,
3710 split_bblks, dbp)))
3711 goto bread_err2;
3712 offset = xlog_align(log, blk_no,
3713 split_bblks, dbp);
3714 }
3715 /*
3716 * Note: this black magic still works with
3717 * large sector sizes (non-512) only because:
3718 * - we increased the buffer size originally
3719 * by 1 sector giving us enough extra space
3720 * for the second read;
3721 * - the log start is guaranteed to be sector
3722 * aligned;
3723 * - we read the log end (LR header start)
3724 * _first_, then the log start (LR header end)
3725 * - order is important.
3726 */
3727 bufaddr = XFS_BUF_PTR(dbp);
3728 XFS_BUF_SET_PTR(dbp,
3729 bufaddr + BBTOB(split_bblks),
3730 BBTOB(bblks - split_bblks));
3731 if ((error = xlog_bread(log, wrapped_hblks,
3732 bblks - split_bblks, dbp)))
3733 goto bread_err2;
3734 XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3735 if (!offset)
3736 offset = xlog_align(log, wrapped_hblks,
3737 bblks - split_bblks, dbp);
3738 }
3739 xlog_unpack_data(rhead, offset, log);
3740 if ((error = xlog_recover_process_data(log, rhash,
3741 rhead, offset, pass)))
3742 goto bread_err2;
3743 blk_no += bblks;
3744 }
3745
3746 ASSERT(blk_no >= log->l_logBBsize);
3747 blk_no -= log->l_logBBsize;
3748
3749 /* read first part of physical log */
3750 while (blk_no < head_blk) {
3751 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3752 goto bread_err2;
3753 offset = xlog_align(log, blk_no, hblks, hbp);
3754 rhead = (xlog_rec_header_t *)offset;
3755 error = xlog_valid_rec_header(log, rhead, blk_no);
3756 if (error)
3757 goto bread_err2;
3758 bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
3759 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3760 goto bread_err2;
3761 offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3762 xlog_unpack_data(rhead, offset, log);
3763 if ((error = xlog_recover_process_data(log, rhash,
3764 rhead, offset, pass)))
3765 goto bread_err2;
3766 blk_no += bblks + hblks;
3767 }
3768 }
3769
3770 bread_err2:
3771 xlog_put_bp(dbp);
3772 bread_err1:
3773 xlog_put_bp(hbp);
3774 return error;
3775}
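/*
 * Editor's sketch: the "black magic" wrap-around reads above reduce to
 * one rule: if a run of blocks crosses the physical end of the log,
 * read the tail-end piece first, then the piece that wrapped to block
 * zero.  A hedged standalone model (read_blocks() is a hypothetical
 * stand-in for the xlog_bread() plus xlog_align() pair):
 */
static int
example_circular_read(char *dst, int blkno, int nblocks, int logsize,
		      int (*read_blocks)(char *dst, int blkno, int nblocks))
{
	int	split, error;

	if (blkno + nblocks <= logsize)		/* no wrap: one read */
		return read_blocks(dst, blkno, nblocks);

	split = logsize - blkno;		/* blocks before the wrap */
	if (split > 0 && (error = read_blocks(dst, blkno, split)))
		return error;
	return read_blocks(dst + split * 512 /* BBSIZE */, 0,
			   nblocks - split);
}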
3776
3777/*
3778 * Do the recovery of the log. We actually do this in two phases.
 3779 * The two passes are necessary to implement the cancellation of
 3780 * records written into the log. The first pass
3781 * determines those things which have been cancelled, and the
3782 * second pass replays log items normally except for those which
3783 * have been cancelled. The handling of the replay and cancellations
3784 * takes place in the log item type specific routines.
3785 *
3786 * The table of items which have cancel records in the log is allocated
3787 * and freed at this level, since only here do we know when all of
3788 * the log recovery has been completed.
3789 */
3790STATIC int
3791xlog_do_log_recovery(
3792 xlog_t *log,
3793 xfs_daddr_t head_blk,
3794 xfs_daddr_t tail_blk)
3795{
3796 int error;
3797
3798 ASSERT(head_blk != tail_blk);
3799
3800 /*
3801 * First do a pass to find all of the cancelled buf log items.
3802 * Store them in the buf_cancel_table for use in the second pass.
3803 */
3804 log->l_buf_cancel_table =
3805 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3806 sizeof(xfs_buf_cancel_t*),
3807 KM_SLEEP);
3808 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3809 XLOG_RECOVER_PASS1);
3810 if (error != 0) {
3811 kmem_free(log->l_buf_cancel_table,
3812 XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3813 log->l_buf_cancel_table = NULL;
3814 return error;
3815 }
3816 /*
3817 * Then do a second pass to actually recover the items in the log.
3818 * When it is complete free the table of buf cancel items.
3819 */
3820 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3821 XLOG_RECOVER_PASS2);
3822#ifdef DEBUG
3823 {
3824 int i;
3825
3826 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3827 ASSERT(log->l_buf_cancel_table[i] == NULL);
3828 }
3829#endif /* DEBUG */
3830
3831 kmem_free(log->l_buf_cancel_table,
3832 XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
3833 log->l_buf_cancel_table = NULL;
3834
3835 return error;
3836}
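/*
 * Editor's sketch: the cancel table is a small chained hash keyed by
 * buffer location.  Pass 1 populates it; pass 2 consults it before
 * replaying a buffer.  A minimal standalone model (hypothetical names;
 * the real entries also carry a length and a reference count):
 */
#include <stdint.h>
#include <stdlib.h>

#define EXAMPLE_BC_TABLE_SIZE	64

struct example_cancel {
	uint64_t		blkno;	/* buffer start block */
	struct example_cancel	*next;
};

/* pass 1: remember every buffer that has a cancel record */
static void
example_note_cancel(struct example_cancel **table, uint64_t blkno)
{
	int			h = (int)(blkno % EXAMPLE_BC_TABLE_SIZE);
	struct example_cancel	*bcp = malloc(sizeof(*bcp));

	if (bcp == NULL)
		return;		/* sketch only; the kernel allocs KM_SLEEP */
	bcp->blkno = blkno;
	bcp->next = table[h];
	table[h] = bcp;
}

/* pass 2: skip replay of anything recorded in pass 1 */
static int
example_is_cancelled(struct example_cancel **table, uint64_t blkno)
{
	struct example_cancel	*bcp;

	for (bcp = table[blkno % EXAMPLE_BC_TABLE_SIZE]; bcp; bcp = bcp->next)
		if (bcp->blkno == blkno)
			return 1;
	return 0;
}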
3837
3838/*
3839 * Do the actual recovery
3840 */
3841STATIC int
3842xlog_do_recover(
3843 xlog_t *log,
3844 xfs_daddr_t head_blk,
3845 xfs_daddr_t tail_blk)
3846{
3847 int error;
3848 xfs_buf_t *bp;
3849 xfs_sb_t *sbp;
3850
3851 /*
3852 * First replay the images in the log.
3853 */
3854 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3855 if (error) {
3856 return error;
3857 }
3858
3859 XFS_bflush(log->l_mp->m_ddev_targp);
3860
3861 /*
3862 * If IO errors happened during recovery, bail out.
3863 */
3864 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3865 return (EIO);
3866 }
3867
3868 /*
3869 * We now update the tail_lsn since much of the recovery has completed
 3870 * and there may be space available to use. If there were no extent
 3871 * frees or iunlinks, we can free up the entire log and set the tail_lsn to
3872 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3873 * lsn of the last known good LR on disk. If there are extent frees
3874 * or iunlinks they will have some entries in the AIL; so we look at
3875 * the AIL to determine how to set the tail_lsn.
3876 */
3877 xlog_assign_tail_lsn(log->l_mp);
3878
3879 /*
3880 * Now that we've finished replaying all buffer and inode
3881 * updates, re-read in the superblock.
3882 */
3883 bp = xfs_getsb(log->l_mp, 0);
3884 XFS_BUF_UNDONE(bp);
3885 XFS_BUF_READ(bp);
3886 xfsbdstrat(log->l_mp, bp);
3887 if ((error = xfs_iowait(bp))) {
3888 xfs_ioerror_alert("xlog_do_recover",
3889 log->l_mp, bp, XFS_BUF_ADDR(bp));
3890 ASSERT(0);
3891 xfs_buf_relse(bp);
3892 return error;
3893 }
3894
3895 /* Convert superblock from on-disk format */
3896 sbp = &log->l_mp->m_sb;
3897 xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS);
3898 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3899 ASSERT(XFS_SB_GOOD_VERSION(sbp));
3900 xfs_buf_relse(bp);
3901
3902 xlog_recover_check_summary(log);
3903
3904 /* Normal transactions can now occur */
3905 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3906 return 0;
3907}
3908
3909/*
3910 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3911 *
3912 * Return error or zero.
3913 */
3914int
3915xlog_recover(
3916 xlog_t *log,
3917 int readonly)
3918{
3919 xfs_daddr_t head_blk, tail_blk;
3920 int error;
3921
3922 /* find the tail of the log */
3923 if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly)))
3924 return error;
3925
3926 if (tail_blk != head_blk) {
3927 /* There used to be a comment here:
3928 *
3929 * disallow recovery on read-only mounts. note -- mount
3930 * checks for ENOSPC and turns it into an intelligent
3931 * error message.
3932 * ...but this is no longer true. Now, unless you specify
3933 * NORECOVERY (in which case this function would never be
3934 * called), we just go ahead and recover. We do this all
3935 * under the vfs layer, so we can get away with it unless
3936 * the device itself is read-only, in which case we fail.
3937 */
3938 if ((error = xfs_dev_is_read_only(log->l_mp,
3939 "recovery required"))) {
3940 return error;
3941 }
3942
3943 cmn_err(CE_NOTE,
3944 "Starting XFS recovery on filesystem: %s (dev: %s)",
3945 log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ));
3946
3947 error = xlog_do_recover(log, head_blk, tail_blk);
3948 log->l_flags |= XLOG_RECOVERY_NEEDED;
3949 }
3950 return error;
3951}
3952
3953/*
3954 * In the first part of recovery we replay inodes and buffers and build
3955 * up the list of extent free items which need to be processed. Here
3956 * we process the extent free items and clean up the on disk unlinked
3957 * inode lists. This is separated from the first part of recovery so
3958 * that the root and real-time bitmap inodes can be read in from disk in
3959 * between the two stages. This is necessary so that we can free space
3960 * in the real-time portion of the file system.
3961 */
3962int
3963xlog_recover_finish(
3964 xlog_t *log,
3965 int mfsi_flags)
3966{
3967 /*
3968 * Now we're ready to do the transactions needed for the
3969 * rest of recovery. Start with completing all the extent
3970 * free intent records and then process the unlinked inode
3971 * lists. At this point, we essentially run in normal mode
3972 * except that we're still performing recovery actions
3973 * rather than accepting new requests.
3974 */
3975 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3976 xlog_recover_process_efis(log);
3977 /*
3978 * Sync the log to get all the EFIs out of the AIL.
3979 * This isn't absolutely necessary, but it helps in
3980 * case the unlink transactions would have problems
3981 * pushing the EFIs out of the way.
3982 */
3983 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3984 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3985
 3986		if ((mfsi_flags & XFS_MFSI_NOUNLINK) == 0) {
3987 xlog_recover_process_iunlinks(log);
3988 }
3989
3990 xlog_recover_check_summary(log);
3991
3992 cmn_err(CE_NOTE,
3993 "Ending XFS recovery on filesystem: %s (dev: %s)",
3994 log->l_mp->m_fsname, XFS_BUFTARG_NAME(log->l_targ));
3995 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3996 } else {
3997 cmn_err(CE_DEBUG,
3998 "!Ending clean XFS mount for filesystem: %s",
3999 log->l_mp->m_fsname);
4000 }
4001 return 0;
4002}
4003
4004
4005#if defined(DEBUG)
4006/*
4007 * Read all of the agf and agi counters and check that they
4008 * are consistent with the superblock counters.
4009 */
4010void
4011xlog_recover_check_summary(
4012 xlog_t *log)
4013{
4014 xfs_mount_t *mp;
4015 xfs_agf_t *agfp;
4016 xfs_agi_t *agip;
4017 xfs_buf_t *agfbp;
4018 xfs_buf_t *agibp;
4019 xfs_daddr_t agfdaddr;
4020 xfs_daddr_t agidaddr;
4021 xfs_buf_t *sbbp;
4022#ifdef XFS_LOUD_RECOVERY
4023 xfs_sb_t *sbp;
4024#endif
4025 xfs_agnumber_t agno;
4026 __uint64_t freeblks;
4027 __uint64_t itotal;
4028 __uint64_t ifree;
4029
4030 mp = log->l_mp;
4031
4032 freeblks = 0LL;
4033 itotal = 0LL;
4034 ifree = 0LL;
4035 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4036 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
4037 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
4038 XFS_FSS_TO_BB(mp, 1), 0);
4039 if (XFS_BUF_ISERROR(agfbp)) {
4040 xfs_ioerror_alert("xlog_recover_check_summary(agf)",
4041 mp, agfbp, agfdaddr);
4042 }
4043 agfp = XFS_BUF_TO_AGF(agfbp);
4044 ASSERT(XFS_AGF_MAGIC ==
4045 INT_GET(agfp->agf_magicnum, ARCH_CONVERT));
4046 ASSERT(XFS_AGF_GOOD_VERSION(
4047 INT_GET(agfp->agf_versionnum, ARCH_CONVERT)));
4048 ASSERT(INT_GET(agfp->agf_seqno, ARCH_CONVERT) == agno);
4049
4050 freeblks += INT_GET(agfp->agf_freeblks, ARCH_CONVERT) +
4051 INT_GET(agfp->agf_flcount, ARCH_CONVERT);
4052 xfs_buf_relse(agfbp);
4053
4054 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4055 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4056 XFS_FSS_TO_BB(mp, 1), 0);
4057 if (XFS_BUF_ISERROR(agibp)) {
4058 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4059 mp, agibp, agidaddr);
4060 }
4061 agip = XFS_BUF_TO_AGI(agibp);
4062 ASSERT(XFS_AGI_MAGIC ==
4063 INT_GET(agip->agi_magicnum, ARCH_CONVERT));
4064 ASSERT(XFS_AGI_GOOD_VERSION(
4065 INT_GET(agip->agi_versionnum, ARCH_CONVERT)));
4066 ASSERT(INT_GET(agip->agi_seqno, ARCH_CONVERT) == agno);
4067
4068 itotal += INT_GET(agip->agi_count, ARCH_CONVERT);
4069 ifree += INT_GET(agip->agi_freecount, ARCH_CONVERT);
4070 xfs_buf_relse(agibp);
4071 }
4072
4073 sbbp = xfs_getsb(mp, 0);
4074#ifdef XFS_LOUD_RECOVERY
4075 sbp = &mp->m_sb;
4076 xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS);
4077 cmn_err(CE_NOTE,
4078 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
4079 sbp->sb_icount, itotal);
4080 cmn_err(CE_NOTE,
4081 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
4082 sbp->sb_ifree, ifree);
4083 cmn_err(CE_NOTE,
4084 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
4085 sbp->sb_fdblocks, freeblks);
4086#if 0
4087 /*
4088 * This is turned off until I account for the allocation
4089 * btree blocks which live in free space.
4090 */
4091 ASSERT(sbp->sb_icount == itotal);
4092 ASSERT(sbp->sb_ifree == ifree);
4093 ASSERT(sbp->sb_fdblocks == freeblks);
4094#endif
4095#endif
4096 xfs_buf_relse(sbbp);
4097}
4098#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
new file mode 100644
index 000000000000..42158b442b55
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.h
@@ -0,0 +1,81 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_LOG_RECOVER_H__
33#define __XFS_LOG_RECOVER_H__
34
35/*
36 * Macros, structures, prototypes for internal log manager use.
37 */
38
39#define XLOG_RHASH_BITS 4
40#define XLOG_RHASH_SIZE 16
41#define XLOG_RHASH_SHIFT 2
42#define XLOG_RHASH(tid) \
43 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
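/*
 * Editor's note (worked example): with XLOG_RHASH_SHIFT == 2 and
 * XLOG_RHASH_SIZE == 16, a tid of 0x1234 hashes to
 * (0x1234 >> 2) & 0xf == 0x48d & 0xf == 13; the low two bits are
 * discarded before masking down to one of the 16 buckets.
 */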
44
45#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
46
47
48/*
49 * item headers are in ri_buf[0]. Additional buffers follow.
50 */
51typedef struct xlog_recover_item {
52 struct xlog_recover_item *ri_next;
53 struct xlog_recover_item *ri_prev;
54 int ri_type;
55 int ri_cnt; /* count of regions found */
56 int ri_total; /* total regions */
57 xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */
58} xlog_recover_item_t;
59
60struct xlog_tid;
61typedef struct xlog_recover {
62 struct xlog_recover *r_next;
63 xlog_tid_t r_log_tid; /* log's transaction id */
64 xfs_trans_header_t r_theader; /* trans header for partial */
65 int r_state; /* not needed */
66 xfs_lsn_t r_lsn; /* xact lsn */
67 xlog_recover_item_t *r_itemq; /* q for items */
68} xlog_recover_t;
69
70#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr)
71
72/*
73 * This is the number of entries in the l_buf_cancel_table used during
74 * recovery.
75 */
76#define XLOG_BC_TABLE_SIZE 64
77
78#define XLOG_RECOVER_PASS1 1
79#define XLOG_RECOVER_PASS2 2
80
81#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h
new file mode 100644
index 000000000000..8d59aaffeb8e
--- /dev/null
+++ b/fs/xfs/xfs_mac.h
@@ -0,0 +1,120 @@
1/*
2 * Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MAC_H__
33#define __XFS_MAC_H__
34
35/*
36 * Mandatory Access Control
37 *
38 * Layout of a composite MAC label:
39 * ml_list contains the list of categories (MSEN) followed by the list of
 40 * divisions (MINT). The struct below acts as a header; in use, ml_list
 41 * carries ml_catcount categories followed by ml_divcount divisions.
42 *
43 * -------------------------------
44 * | ml_msen_type | ml_mint_type |
45 * -------------------------------
46 * | ml_level | ml_grade |
47 * -------------------------------
48 * | ml_catcount |
49 * -------------------------------
50 * | ml_divcount |
51 * -------------------------------
52 * | category 1 |
53 * | . . . |
54 * | category N | (where N = ml_catcount)
55 * -------------------------------
56 * | division 1 |
57 * | . . . |
58 * | division M | (where M = ml_divcount)
59 * -------------------------------
60 */
61#define XFS_MAC_MAX_SETS 250
62typedef struct xfs_mac_label {
63 __uint8_t ml_msen_type; /* MSEN label type */
64 __uint8_t ml_mint_type; /* MINT label type */
65 __uint8_t ml_level; /* Hierarchical level */
66 __uint8_t ml_grade; /* Hierarchical grade */
67 __uint16_t ml_catcount; /* Category count */
68 __uint16_t ml_divcount; /* Division count */
69 /* Category set, then Division set */
70 __uint16_t ml_list[XFS_MAC_MAX_SETS];
71} xfs_mac_label_t;
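/*
 * Editor's sketch: categories and divisions share ml_list, categories
 * first, matching the layout diagram above.  A hedged illustration of
 * the indexing a consumer would use (hypothetical helpers, not part of
 * the MAC code):
 */
static __uint16_t
example_mac_category(xfs_mac_label_t *ml, int i)	/* i < ml_catcount */
{
	return ml->ml_list[i];
}

static __uint16_t
example_mac_division(xfs_mac_label_t *ml, int j)	/* j < ml_divcount */
{
	return ml->ml_list[ml->ml_catcount + j];
}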
72
73/* MSEN label type names. Choose an upper case ASCII character. */
74#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: low<admin != tcsec<high */
75#define XFS_MSEN_EQUAL_LABEL 'E' /* Wildcard - always equal */
76#define XFS_MSEN_HIGH_LABEL 'H' /* System High - always dominates */
77#define XFS_MSEN_MLD_HIGH_LABEL 'I' /* System High, multi-level dir */
78#define XFS_MSEN_LOW_LABEL 'L' /* System Low - always dominated */
79#define XFS_MSEN_MLD_LABEL 'M' /* TCSEC label on a multi-level dir */
80#define XFS_MSEN_MLD_LOW_LABEL 'N' /* System Low, multi-level dir */
81#define XFS_MSEN_TCSEC_LABEL 'T' /* TCSEC label */
82#define XFS_MSEN_UNKNOWN_LABEL 'U' /* unknown label */
83
84/* MINT label type names. Choose a lower case ASCII character. */
85#define XFS_MINT_BIBA_LABEL 'b' /* Dual of a TCSEC label */
86#define XFS_MINT_EQUAL_LABEL 'e' /* Wildcard - always equal */
87#define XFS_MINT_HIGH_LABEL 'h' /* High Grade - always dominates */
88#define XFS_MINT_LOW_LABEL 'l' /* Low Grade - always dominated */
89
90/* On-disk XFS extended attribute names */
91#define SGI_MAC_FILE "SGI_MAC_FILE"
92#define SGI_MAC_FILE_SIZE (sizeof(SGI_MAC_FILE)-1)
93
94
95#ifdef __KERNEL__
96
97#ifdef CONFIG_FS_POSIX_MAC
98
99/* NOT YET IMPLEMENTED */
100
101#define MACEXEC 00100
102#define MACWRITE 00200
103#define MACREAD 00400
104
105struct xfs_inode;
106extern int xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *);
107
108#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c))
109#define _MAC_VACCESS(v,c,m) (xfs_mac_vaccess(v,c,m))
110#define _MAC_EXISTS xfs_mac_vhaslabel
111
112#else
113#define _MAC_XFS_IACCESS(i,m,c) (0)
114#define _MAC_VACCESS(v,c,m) (0)
115#define _MAC_EXISTS (NULL)
116#endif
117
118#endif /* __KERNEL__ */
119
120#endif /* __XFS_MAC_H__ */
diff --git a/fs/xfs/xfs_macros.c b/fs/xfs/xfs_macros.c
new file mode 100644
index 000000000000..ce4f46c6b3ab
--- /dev/null
+++ b/fs/xfs/xfs_macros.c
@@ -0,0 +1,2136 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#define XFS_MACRO_C
34
35#include "xfs.h"
36#include "xfs_macros.h"
37#include "xfs_types.h"
38#include "xfs_inum.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_sb.h"
42#include "xfs_ag.h"
43#include "xfs_dir.h"
44#include "xfs_dir2.h"
45#include "xfs_dmapi.h"
46#include "xfs_mount.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_bmap_btree.h"
50#include "xfs_btree.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_ialloc.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_rw.h"
60#include "xfs_log_priv.h"
61#include "xfs_da_btree.h"
62#include "xfs_attr_leaf.h"
63#include "xfs_dir_leaf.h"
64#include "xfs_dir2_data.h"
65#include "xfs_dir2_leaf.h"
66#include "xfs_dir2_block.h"
67#include "xfs_dir2_node.h"
68#include "xfs_bit.h"
69
70#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLDSTARTBLOCK)
71int
72isnulldstartblock(xfs_dfsbno_t x)
73{
74 return ISNULLDSTARTBLOCK(x);
75}
76#endif
77
78#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLSTARTBLOCK)
79int
80isnullstartblock(xfs_fsblock_t x)
81{
82 return ISNULLSTARTBLOCK(x);
83}
84#endif
85
86#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_NULLSTARTBLOCK)
87xfs_fsblock_t
88nullstartblock(int k)
89{
90 return NULLSTARTBLOCK(k);
91}
92#endif
93
94#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_STARTBLOCKVAL)
95xfs_filblks_t
96startblockval(xfs_fsblock_t x)
97{
98 return STARTBLOCKVAL(x);
99}
100#endif
101
102#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_CHECK_DADDR)
103void
104xfs_ag_check_daddr(xfs_mount_t *mp, xfs_daddr_t d, xfs_extlen_t len)
105{
106 XFS_AG_CHECK_DADDR(mp, d, len);
107}
108#endif
109
110#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_DADDR)
111xfs_daddr_t
112xfs_ag_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_daddr_t d)
113{
114 return XFS_AG_DADDR(mp, agno, d);
115}
116#endif
117
118#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAXLEVELS)
119int
120xfs_ag_maxlevels(xfs_mount_t *mp)
121{
122 return XFS_AG_MAXLEVELS(mp);
123}
124#endif
125
126#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_DADDR)
127xfs_daddr_t
128xfs_agb_to_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
129{
130 return XFS_AGB_TO_DADDR(mp, agno, agbno);
131}
132#endif
133
134#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_FSB)
135xfs_fsblock_t
136xfs_agb_to_fsb(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
137{
138 return XFS_AGB_TO_FSB(mp, agno, agbno);
139}
140#endif
141
142#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MAX)
143xfs_agblock_t
144xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b)
145{
146 return XFS_AGBLOCK_MAX(a, b);
147}
148#endif
149
150#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MIN)
151xfs_agblock_t
152xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b)
153{
154 return XFS_AGBLOCK_MIN(a, b);
155}
156#endif
157
158#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_BLOCK)
159xfs_agblock_t
160xfs_agf_block(xfs_mount_t *mp)
161{
162 return XFS_AGF_BLOCK(mp);
163}
164#endif
165
166#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_GOOD_VERSION)
167int
168xfs_agf_good_version(unsigned v)
169{
170 return XFS_AGF_GOOD_VERSION(v);
171}
172#endif
173
174#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGFL_BLOCK)
175xfs_agblock_t
176xfs_agfl_block(xfs_mount_t *mp)
177{
178 return XFS_AGFL_BLOCK(mp);
179}
180#endif
181
182#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_BLOCK)
183xfs_agblock_t
184xfs_agi_block(xfs_mount_t *mp)
185{
186 return XFS_AGI_BLOCK(mp);
187}
188#endif
189
190#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_GOOD_VERSION)
191int
192xfs_agi_good_version(unsigned v)
193{
194 return XFS_AGI_GOOD_VERSION(v);
195}
196#endif
197
198#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_AGBNO)
199xfs_agblock_t
200xfs_agino_to_agbno(xfs_mount_t *mp, xfs_agino_t i)
201{
202 return XFS_AGINO_TO_AGBNO(mp, i);
203}
204#endif
205
206#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_INO)
207xfs_ino_t
208xfs_agino_to_ino(xfs_mount_t *mp, xfs_agnumber_t a, xfs_agino_t i)
209{
210 return XFS_AGINO_TO_INO(mp, a, i);
211}
212#endif
213
214#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_OFFSET)
215int
216xfs_agino_to_offset(xfs_mount_t *mp, xfs_agino_t i)
217{
218 return XFS_AGINO_TO_OFFSET(mp, i);
219}
220#endif
221
222#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
223int
224xfs_alloc_block_maxrecs(int lev, xfs_btree_cur_t *cur)
225{
226 return XFS_ALLOC_BLOCK_MAXRECS(lev, cur);
227}
228#endif
229
230#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
231int
232xfs_alloc_block_minrecs(int lev, xfs_btree_cur_t *cur)
233{
234 return XFS_ALLOC_BLOCK_MINRECS(lev, cur);
235}
236#endif
237
238#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_SIZE)
239/*ARGSUSED1*/
240int
241xfs_alloc_block_size(int lev, xfs_btree_cur_t *cur)
242{
243 return XFS_ALLOC_BLOCK_SIZE(lev, cur);
244}
245#endif
246
247#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_KEY_ADDR)
248/*ARGSUSED3*/
249xfs_alloc_key_t *
250xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
251{
252 return XFS_ALLOC_KEY_ADDR(bb, i, cur);
253}
254#endif
255
256#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_PTR_ADDR)
257xfs_alloc_ptr_t *
258xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
259{
260 return XFS_ALLOC_PTR_ADDR(bb, i, cur);
261}
262#endif
263
264#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_REC_ADDR)
265/*ARGSUSED3*/
266xfs_alloc_rec_t *
267xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
268{
269 return XFS_ALLOC_REC_ADDR(bb, i, cur);
270}
271#endif
272
273#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
274int
275xfs_attr_leaf_entsize_local(int nlen, int vlen)
276{
277 return XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen, vlen);
278}
279#endif
280
281#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
282int
283xfs_attr_leaf_entsize_local_max(int bsize)
284{
285 return XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize);
286}
287#endif
288
289#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
290int
291xfs_attr_leaf_entsize_remote(int nlen)
292{
293 return XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen);
294}
295#endif
296
297#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME)
298char *
299xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
300{
301 return XFS_ATTR_LEAF_NAME(leafp, idx);
302}
303#endif
304
305#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
306xfs_attr_leaf_name_local_t *
307xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
308{
309 return XFS_ATTR_LEAF_NAME_LOCAL(leafp, idx);
310}
311#endif
312
313#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
314xfs_attr_leaf_name_remote_t *
315xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
316{
317 return XFS_ATTR_LEAF_NAME_REMOTE(leafp, idx);
318}
319#endif
320
321#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE)
322int
323xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep)
324{
325 return XFS_ATTR_SF_ENTSIZE(sfep);
326}
327#endif
328
329#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
330int
331xfs_attr_sf_entsize_byname(int nlen, int vlen)
332{
333 return XFS_ATTR_SF_ENTSIZE_BYNAME(nlen, vlen);
334}
335#endif
336
337#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_NEXTENTRY)
338xfs_attr_sf_entry_t *
339xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep)
340{
341 return XFS_ATTR_SF_NEXTENTRY(sfep);
342}
343#endif
344
345#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_TOTSIZE)
346int
347xfs_attr_sf_totsize(xfs_inode_t *dp)
348{
349 return XFS_ATTR_SF_TOTSIZE(dp);
350}
351#endif
352
353#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOI)
354xfs_inode_t *
355xfs_bhvtoi(bhv_desc_t *bhvp)
356{
357 return XFS_BHVTOI(bhvp);
358}
359#endif
360
361#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOM)
362xfs_mount_t *
363xfs_bhvtom(bhv_desc_t *bdp)
364{
365 return XFS_BHVTOM(bdp);
366}
367#endif
368
369#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_VFSTOM)
370xfs_mount_t *
371xfs_vfstom(vfs_t *vfs)
372{
373 return XFS_VFSTOM(vfs);
374}
375#endif
376
377#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BM_MAXLEVELS)
378int
379xfs_bm_maxlevels(xfs_mount_t *mp, int w)
380{
381 return XFS_BM_MAXLEVELS(mp, w);
382}
383#endif
384
385#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
386int
387xfs_bmap_block_dmaxrecs(int lev, xfs_btree_cur_t *cur)
388{
389 return XFS_BMAP_BLOCK_DMAXRECS(lev, cur);
390}
391#endif
392
393#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
394int
395xfs_bmap_block_dminrecs(int lev, xfs_btree_cur_t *cur)
396{
397 return XFS_BMAP_BLOCK_DMINRECS(lev, cur);
398}
399#endif
400
401#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DSIZE)
402int
403xfs_bmap_block_dsize(int lev, xfs_btree_cur_t *cur)
404{
405 return XFS_BMAP_BLOCK_DSIZE(lev, cur);
406}
407#endif
408
409#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
410int
411xfs_bmap_block_imaxrecs(int lev, xfs_btree_cur_t *cur)
412{
413 return XFS_BMAP_BLOCK_IMAXRECS(lev, cur);
414}
415#endif
416
417#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
418int
419xfs_bmap_block_iminrecs(int lev, xfs_btree_cur_t *cur)
420{
421 return XFS_BMAP_BLOCK_IMINRECS(lev, cur);
422}
423#endif
424
425#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_ISIZE)
426int
427xfs_bmap_block_isize(int lev, xfs_btree_cur_t *cur)
428{
429 return XFS_BMAP_BLOCK_ISIZE(lev, cur);
430}
431#endif
432
433#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
434/*ARGSUSED3*/
435xfs_bmbt_key_t *
436xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz)
437{
438 return XFS_BMAP_BROOT_KEY_ADDR(bb, i, sz);
439}
440#endif
441
442#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_MAXRECS)
443int
444xfs_bmap_broot_maxrecs(int sz)
445{
446 return XFS_BMAP_BROOT_MAXRECS(sz);
447}
448#endif
449
450#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_NUMRECS)
451int
452xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb)
453{
454 return XFS_BMAP_BROOT_NUMRECS(bb);
455}
456#endif
457
458#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
459xfs_bmbt_ptr_t *
460xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz)
461{
462 return XFS_BMAP_BROOT_PTR_ADDR(bb, i, sz);
463}
464#endif
465
466#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
467/*ARGSUSED3*/
468xfs_bmbt_rec_t *
469xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz)
470{
471 return XFS_BMAP_BROOT_REC_ADDR(bb, i, sz);
472}
473#endif
474
475#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE)
476int
477xfs_bmap_broot_space(xfs_bmdr_block_t *bb)
478{
479 return XFS_BMAP_BROOT_SPACE(bb);
480}
481#endif
482
483#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
484int
485xfs_bmap_broot_space_calc(int nrecs)
486{
487 return XFS_BMAP_BROOT_SPACE_CALC(nrecs);
488}
489#endif
490
491#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_IBLOCK_SIZE)
492/*ARGSUSED1*/
493int
494xfs_bmap_iblock_size(int lev, xfs_btree_cur_t *cur)
495{
496 return XFS_BMAP_IBLOCK_SIZE(lev, cur);
497}
498#endif
499
500#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_INIT)
501void
502xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
503{
504 XFS_BMAP_INIT(flp, fbp);
505}
506#endif
507
508#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_DADDR)
509/*ARGSUSED3*/
510xfs_bmbt_key_t *
511xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
512{
513 return XFS_BMAP_KEY_DADDR(bb, i, cur);
514}
515#endif
516
517#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_IADDR)
518/*ARGSUSED3*/
519xfs_bmbt_key_t *
520xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
521{
522 return XFS_BMAP_KEY_IADDR(bb, i, cur);
523}
524#endif
525
526#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_DADDR)
527xfs_bmbt_ptr_t *
528xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
529{
530 return XFS_BMAP_PTR_DADDR(bb, i, cur);
531}
532#endif
533
534#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_IADDR)
535xfs_bmbt_ptr_t *
536xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
537{
538 return XFS_BMAP_PTR_IADDR(bb, i, cur);
539}
540#endif
541
542#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
543/*ARGSUSED1*/
544int
545xfs_bmap_rblock_dsize(int lev, xfs_btree_cur_t *cur)
546{
547 return XFS_BMAP_RBLOCK_DSIZE(lev, cur);
548}
549#endif
550
551#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
552/*ARGSUSED1*/
553int
554xfs_bmap_rblock_isize(int lev, xfs_btree_cur_t *cur)
555{
556 return XFS_BMAP_RBLOCK_ISIZE(lev, cur);
557}
558#endif
559
560#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_DADDR)
561/*ARGSUSED3*/
562xfs_bmbt_rec_t *
563xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
564{
565 return XFS_BMAP_REC_DADDR(bb, i, cur);
566}
567#endif
568
569#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_IADDR)
570/*ARGSUSED3*/
571xfs_bmbt_rec_t *
572xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
573{
574 return XFS_BMAP_REC_IADDR(bb, i, cur);
575}
576#endif
577
578#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_SANITY_CHECK)
579int
580xfs_bmap_sanity_check(xfs_mount_t *mp, xfs_bmbt_block_t *bb, int level)
581{
582 return XFS_BMAP_SANITY_CHECK(mp, bb, level);
583}
584#endif
585
586#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAPI_AFLAG)
587int
588xfs_bmapi_aflag(int w)
589{
590 return XFS_BMAPI_AFLAG(w);
591}
592#endif
593
594#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMDR_SPACE_CALC)
595int
596xfs_bmdr_space_calc(int nrecs)
597{
598 return XFS_BMDR_SPACE_CALC(nrecs);
599}
600#endif
601
602#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BNO_BLOCK)
603xfs_agblock_t
604xfs_bno_block(xfs_mount_t *mp)
605{
606 return XFS_BNO_BLOCK(mp);
607}
608#endif
609
610#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BTREE_LONG_PTRS)
611int
612xfs_btree_long_ptrs(xfs_btnum_t btnum)
613{
614 return XFS_BTREE_LONG_PTRS(btnum);
615}
616#endif
617
618#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGF)
619xfs_agf_t *
620xfs_buf_to_agf(xfs_buf_t *bp)
621{
622 return XFS_BUF_TO_AGF(bp);
623}
624#endif
625
626#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGFL)
627xfs_agfl_t *
628xfs_buf_to_agfl(xfs_buf_t *bp)
629{
630 return XFS_BUF_TO_AGFL(bp);
631}
632#endif
633
634#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGI)
635xfs_agi_t *
636xfs_buf_to_agi(xfs_buf_t *bp)
637{
638 return XFS_BUF_TO_AGI(bp);
639}
640#endif
641
642#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
643xfs_alloc_block_t *
644xfs_buf_to_alloc_block(xfs_buf_t *bp)
645{
646 return XFS_BUF_TO_ALLOC_BLOCK(bp);
647}
648#endif
649
650#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BLOCK)
651xfs_btree_block_t *
652xfs_buf_to_block(xfs_buf_t *bp)
653{
654 return XFS_BUF_TO_BLOCK(bp);
655}
656#endif
657
658#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
659xfs_bmbt_block_t *
660xfs_buf_to_bmbt_block(xfs_buf_t *bp)
661{
662 return XFS_BUF_TO_BMBT_BLOCK(bp);
663}
664#endif
665
666#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_DINODE)
667xfs_dinode_t *
668xfs_buf_to_dinode(xfs_buf_t *bp)
669{
670 return XFS_BUF_TO_DINODE(bp);
671}
672#endif
673
674#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
675xfs_inobt_block_t *
676xfs_buf_to_inobt_block(xfs_buf_t *bp)
677{
678 return XFS_BUF_TO_INOBT_BLOCK(bp);
679}
680#endif
681
682#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_LBLOCK)
683xfs_btree_lblock_t *
684xfs_buf_to_lblock(xfs_buf_t *bp)
685{
686 return XFS_BUF_TO_LBLOCK(bp);
687}
688#endif
689
690#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBLOCK)
691xfs_btree_sblock_t *
692xfs_buf_to_sblock(xfs_buf_t *bp)
693{
694 return XFS_BUF_TO_SBLOCK(bp);
695}
696#endif
697
698#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBP)
699xfs_sb_t *
700xfs_buf_to_sbp(xfs_buf_t *bp)
701{
702 return XFS_BUF_TO_SBP(bp);
703}
704#endif
705
706#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_ASIZE)
707int
708xfs_cfork_asize_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
709{
710 return XFS_CFORK_ASIZE_DISK(dcp, mp);
711}
712int
713xfs_cfork_asize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
714{
715 return XFS_CFORK_ASIZE(dcp, mp);
716}
717#endif
718
719#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_BOFF)
720int
721xfs_cfork_boff_disk(xfs_dinode_core_t *dcp)
722{
723 return XFS_CFORK_BOFF_DISK(dcp);
724}
725int
726xfs_cfork_boff(xfs_dinode_core_t *dcp)
727{
728 return XFS_CFORK_BOFF(dcp);
729}
730#endif
731
732#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_DSIZE)
733int
734xfs_cfork_dsize_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
735{
736 return XFS_CFORK_DSIZE_DISK(dcp, mp);
737}
738int
739xfs_cfork_dsize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
740{
741 return XFS_CFORK_DSIZE(dcp, mp);
742}
743#endif
744
745#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FMT_SET)
746void
747xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n)
748{
749 XFS_CFORK_FMT_SET(dcp, w, n);
750}
751#endif
752
753#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FORMAT)
754int
755xfs_cfork_format(xfs_dinode_core_t *dcp, int w)
756{
757 return XFS_CFORK_FORMAT(dcp, w);
758}
759#endif
760
761#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXT_SET)
762void
763xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n)
764{
765 XFS_CFORK_NEXT_SET(dcp, w, n);
766}
767#endif
768
769#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXTENTS)
770int
771xfs_cfork_nextents_disk(xfs_dinode_core_t *dcp, int w)
772{
773 return XFS_CFORK_NEXTENTS_DISK(dcp, w);
774}
775int
776xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w)
777{
778 return XFS_CFORK_NEXTENTS(dcp, w);
779}
780#endif
781
782#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_Q)
783int
784xfs_cfork_q_disk(xfs_dinode_core_t *dcp)
785{
786 return XFS_CFORK_Q_DISK(dcp);
787}
788int
789xfs_cfork_q(xfs_dinode_core_t *dcp)
790{
791 return XFS_CFORK_Q(dcp);
792}
793#endif
794
795#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_SIZE)
796int
797xfs_cfork_size_disk(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
798{
799 return XFS_CFORK_SIZE_DISK(dcp, mp, w);
800}
801int
802xfs_cfork_size(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
803{
804 return XFS_CFORK_SIZE(dcp, mp, w);
805}
806#endif
807
808#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CNT_BLOCK)
809xfs_agblock_t
810xfs_cnt_block(xfs_mount_t *mp)
811{
812 return XFS_CNT_BLOCK(mp);
813}
814#endif
815
816#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_BNO)
817xfs_dablk_t
818xfs_da_cookie_bno(xfs_mount_t *mp, xfs_off_t cookie)
819{
820 return XFS_DA_COOKIE_BNO(mp, cookie);
821}
822#endif
823
824#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_ENTRY)
825int
826xfs_da_cookie_entry(xfs_mount_t *mp, xfs_off_t cookie)
827{
828 return XFS_DA_COOKIE_ENTRY(mp, cookie);
829}
830#endif
831
832#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_HASH)
833/*ARGSUSED1*/
834xfs_dahash_t
835xfs_da_cookie_hash(xfs_mount_t *mp, xfs_off_t cookie)
836{
837 return XFS_DA_COOKIE_HASH(mp, cookie);
838}
839#endif
840
841#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_BNOENTRY)
842__uint32_t
843xfs_da_make_bnoentry(xfs_mount_t *mp, xfs_dablk_t bno, int entry)
844{
845 return XFS_DA_MAKE_BNOENTRY(mp, bno, entry);
846}
847#endif
848
849#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_COOKIE)
850xfs_off_t
851xfs_da_make_cookie(xfs_mount_t *mp, xfs_dablk_t bno, int entry,
852 xfs_dahash_t hash)
853{
854 return XFS_DA_MAKE_COOKIE(mp, bno, entry, hash);
855}
856#endif
857
858#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGBNO)
859xfs_agblock_t
860xfs_daddr_to_agbno(xfs_mount_t *mp, xfs_daddr_t d)
861{
862 return XFS_DADDR_TO_AGBNO(mp, d);
863}
864#endif
865
866#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGNO)
867xfs_agnumber_t
868xfs_daddr_to_agno(xfs_mount_t *mp, xfs_daddr_t d)
869{
870 return XFS_DADDR_TO_AGNO(mp, d);
871}
872#endif
873
874#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_FSB)
875xfs_fsblock_t
876xfs_daddr_to_fsb(xfs_mount_t *mp, xfs_daddr_t d)
877{
878 return XFS_DADDR_TO_FSB(mp, d);
879}
880#endif
881
882#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_APTR)
883char *
884xfs_dfork_aptr(xfs_dinode_t *dip)
885{
886 return XFS_DFORK_APTR(dip);
887}
888#endif
889
890#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_ASIZE)
891int
892xfs_dfork_asize(xfs_dinode_t *dip, xfs_mount_t *mp)
893{
894 return XFS_DFORK_ASIZE(dip, mp);
895}
896#endif
897
898#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_BOFF)
899int
900xfs_dfork_boff(xfs_dinode_t *dip)
901{
902 return XFS_DFORK_BOFF(dip);
903}
904#endif
905
906#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DPTR)
907char *
908xfs_dfork_dptr(xfs_dinode_t *dip)
909{
910 return XFS_DFORK_DPTR(dip);
911}
912#endif
913
914#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DSIZE)
915int
916xfs_dfork_dsize(xfs_dinode_t *dip, xfs_mount_t *mp)
917{
918 return XFS_DFORK_DSIZE(dip, mp);
919}
920#endif
921
922#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXTENTS)
923int
924xfs_dfork_nextents(xfs_dinode_t *dip, int w)
925{
926 return XFS_DFORK_NEXTENTS(dip, w);
927}
928#endif
929
930#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_PTR)
931char *
932xfs_dfork_ptr(xfs_dinode_t *dip, int w)
933{
934 return XFS_DFORK_PTR(dip, w);
935}
936#endif
937
938#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_Q)
939int
940xfs_dfork_q(xfs_dinode_t *dip)
941{
942 return XFS_DFORK_Q(dip);
943}
944#endif
945
946#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_SIZE)
947int
948xfs_dfork_size(xfs_dinode_t *dip, xfs_mount_t *mp, int w)
949{
950 return XFS_DFORK_SIZE(dip, mp, w);
951}
952#endif
953
954#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DINODE_GOOD_VERSION)
955int
956xfs_dinode_good_version(int v)
957{
958 return XFS_DINODE_GOOD_VERSION(v);
959}
960#endif
961
962#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
963int
964xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
965{
966 return XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
967}
968#endif
969
970#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
971int
972xfs_dir_leaf_entsize_byname(int len)
973{
974 return XFS_DIR_LEAF_ENTSIZE_BYNAME(len);
975}
976#endif
977
978#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
979xfs_dir_leaf_name_t *
980xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
981{
982 return XFS_DIR_LEAF_NAMESTRUCT(leafp, offset);
983}
984#endif
985
986#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ALLFIT)
987int
988xfs_dir_sf_allfit(int count, int totallen)
989{
990 return XFS_DIR_SF_ALLFIT(count, totallen);
991}
992#endif
993
994#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
995int
996xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
997{
998 return XFS_DIR_SF_ENTSIZE_BYENTRY(sfep);
999}
1000#endif
1001
1002#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
1003int
1004xfs_dir_sf_entsize_byname(int len)
1005{
1006 return XFS_DIR_SF_ENTSIZE_BYNAME(len);
1007}
1008#endif
1009
1010#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_GET_DIRINO)
1011void
1012xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
1013{
1014 XFS_DIR_SF_GET_DIRINO(from, to);
1015}
1016#endif
1017
1018#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_NEXTENTRY)
1019xfs_dir_sf_entry_t *
1020xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
1021{
1022 return XFS_DIR_SF_NEXTENTRY(sfep);
1023}
1024#endif
1025
1026#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_PUT_DIRINO)
1027void
1028xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
1029{
1030 XFS_DIR_SF_PUT_DIRINO(from, to);
1031}
1032#endif
1033
1034#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
1035xfs_dir2_leaf_entry_t *
1036xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp)
1037{
1038 return XFS_DIR2_BLOCK_LEAF_P(btp);
1039}
1040#endif
1041
1042#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
1043xfs_dir2_block_tail_t *
1044xfs_dir2_block_tail_p(xfs_mount_t *mp, xfs_dir2_block_t *block)
1045{
1046 return XFS_DIR2_BLOCK_TAIL_P(mp, block);
1047}
1048#endif
1049
1050#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DA)
1051xfs_dablk_t
1052xfs_dir2_byte_to_da(xfs_mount_t *mp, xfs_dir2_off_t by)
1053{
1054 return XFS_DIR2_BYTE_TO_DA(mp, by);
1055}
1056#endif
1057
1058#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
1059/* ARGSUSED */
1060xfs_dir2_dataptr_t
1061xfs_dir2_byte_to_dataptr(xfs_mount_t *mp, xfs_dir2_off_t by)
1062{
1063 return XFS_DIR2_BYTE_TO_DATAPTR(mp, by);
1064}
1065#endif
1066
1067#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DB)
1068xfs_dir2_db_t
1069xfs_dir2_byte_to_db(xfs_mount_t *mp, xfs_dir2_off_t by)
1070{
1071 return XFS_DIR2_BYTE_TO_DB(mp, by);
1072}
1073#endif
1074
1075#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_OFF)
1076xfs_dir2_data_aoff_t
1077xfs_dir2_byte_to_off(xfs_mount_t *mp, xfs_dir2_off_t by)
1078{
1079 return XFS_DIR2_BYTE_TO_OFF(mp, by);
1080}
1081#endif
1082
1083#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_BYTE)
1084xfs_dir2_off_t
1085xfs_dir2_da_to_byte(xfs_mount_t *mp, xfs_dablk_t da)
1086{
1087 return XFS_DIR2_DA_TO_BYTE(mp, da);
1088}
1089#endif
1090
1091#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_DB)
1092xfs_dir2_db_t
1093xfs_dir2_da_to_db(xfs_mount_t *mp, xfs_dablk_t da)
1094{
1095 return XFS_DIR2_DA_TO_DB(mp, da);
1096}
1097#endif
1098
1099#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
1100xfs_dir2_data_off_t *
1101xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
1102{
1103 return XFS_DIR2_DATA_ENTRY_TAG_P(dep);
1104}
1105#endif
1106
1107#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTSIZE)
1108int
1109xfs_dir2_data_entsize(int n)
1110{
1111 return XFS_DIR2_DATA_ENTSIZE(n);
1112}
1113#endif
1114
1115#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
1116xfs_dir2_data_off_t *
1117xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup)
1118{
1119 return XFS_DIR2_DATA_UNUSED_TAG_P(dup);
1120}
1121#endif
1122
1123#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
1124/* ARGSUSED */
1125xfs_dir2_off_t
1126xfs_dir2_dataptr_to_byte(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1127{
1128 return XFS_DIR2_DATAPTR_TO_BYTE(mp, dp);
1129}
1130#endif
1131
1132#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
1133xfs_dir2_db_t
1134xfs_dir2_dataptr_to_db(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1135{
1136 return XFS_DIR2_DATAPTR_TO_DB(mp, dp);
1137}
1138#endif
1139
1140#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
1141xfs_dir2_data_aoff_t
1142xfs_dir2_dataptr_to_off(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
1143{
1144 return XFS_DIR2_DATAPTR_TO_OFF(mp, dp);
1145}
1146#endif
1147
1148#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
1149xfs_dir2_off_t
1150xfs_dir2_db_off_to_byte(xfs_mount_t *mp, xfs_dir2_db_t db,
1151 xfs_dir2_data_aoff_t o)
1152{
1153 return XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o);
1154}
1155#endif
1156
1157#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
1158xfs_dir2_dataptr_t
1159xfs_dir2_db_off_to_dataptr(xfs_mount_t *mp, xfs_dir2_db_t db,
1160 xfs_dir2_data_aoff_t o)
1161{
1162 return XFS_DIR2_DB_OFF_TO_DATAPTR(mp, db, o);
1163}
1164#endif
1165
1166#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_DA)
1167xfs_dablk_t
1168xfs_dir2_db_to_da(xfs_mount_t *mp, xfs_dir2_db_t db)
1169{
1170 return XFS_DIR2_DB_TO_DA(mp, db);
1171}
1172#endif
1173
1174#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDB)
1175xfs_dir2_db_t
1176xfs_dir2_db_to_fdb(xfs_mount_t *mp, xfs_dir2_db_t db)
1177{
1178 return XFS_DIR2_DB_TO_FDB(mp, db);
1179}
1180#endif
1181
1182#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
1183int
1184xfs_dir2_db_to_fdindex(xfs_mount_t *mp, xfs_dir2_db_t db)
1185{
1186 return XFS_DIR2_DB_TO_FDINDEX(mp, db);
1187}
1188#endif
1189
1190#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_BESTS_P)
1191xfs_dir2_data_off_t *
1192xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp)
1193{
1194 return XFS_DIR2_LEAF_BESTS_P(ltp);
1195}
1196#endif
1197
1198#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_TAIL_P)
1199xfs_dir2_leaf_tail_t *
1200xfs_dir2_leaf_tail_p(xfs_mount_t *mp, xfs_dir2_leaf_t *lp)
1201{
1202 return XFS_DIR2_LEAF_TAIL_P(mp, lp);
1203}
1204#endif
1205
1206#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
1207int
1208xfs_dir2_max_leaf_ents(xfs_mount_t *mp)
1209{
1210 return XFS_DIR2_MAX_LEAF_ENTS(mp);
1211}
1212#endif
1213
1214#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
1215int
1216xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
1217{
1218 return XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp, sfep);
1219}
1220#endif
1221
1222#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
1223xfs_dir2_sf_entry_t *
1224xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
1225{
1226 return XFS_DIR2_SF_FIRSTENTRY(sfp);
1227}
1228#endif
1229
1230#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
1231int
1232xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
1233{
1234 return XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, len);
1235}
1236#endif
1237
1238#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_INUMBER)
1239xfs_intino_t
1240xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from)
1241{
1242 return XFS_DIR2_SF_GET_INUMBER(sfp, from);
1243}
1244#endif
1245
1246#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_OFFSET)
1247xfs_dir2_data_aoff_t
1248xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
1249{
1250 return XFS_DIR2_SF_GET_OFFSET(sfep);
1251}
1252#endif
1253
1254#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_HDR_SIZE)
1255int
1256xfs_dir2_sf_hdr_size(int i8count)
1257{
1258 return XFS_DIR2_SF_HDR_SIZE(i8count);
1259}
1260#endif
1261
1262#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_INUMBERP)
1263xfs_dir2_inou_t *
1264xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
1265{
1266 return XFS_DIR2_SF_INUMBERP(sfep);
1267}
1268#endif
1269
1270#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_NEXTENTRY)
1271xfs_dir2_sf_entry_t *
1272xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
1273{
1274 return XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
1275}
1276#endif
1277
1278#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
1279void
1280xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to)
1281{
1282 XFS_DIR2_SF_PUT_INUMBER(sfp, from, to);
1283}
1284#endif
1285
1286#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
1287void
1288xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
1289{
1290 XFS_DIR2_SF_PUT_OFFSET(sfep, off);
1291}
1292#endif
1293
1294#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTFMT_INODE)
1295xfs_exntfmt_t
1296xfs_extfmt_inode(struct xfs_inode *ip)
1297{
1298 return XFS_EXTFMT_INODE(ip);
1299}
1300#endif
1301
1302#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MAX)
1303xfs_extlen_t
1304xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b)
1305{
1306 return XFS_EXTLEN_MAX(a, b);
1307}
1308#endif
1309
1310#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MIN)
1311xfs_extlen_t
1312xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b)
1313{
1314 return XFS_EXTLEN_MIN(a, b);
1315}
1316#endif
1317
1318#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MAX)
1319xfs_filblks_t
1320xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b)
1321{
1322 return XFS_FILBLKS_MAX(a, b);
1323}
1324#endif
1325
1326#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MIN)
1327xfs_filblks_t
1328xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b)
1329{
1330 return XFS_FILBLKS_MIN(a, b);
1331}
1332#endif
1333
1334#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MAX)
1335xfs_fileoff_t
1336xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b)
1337{
1338 return XFS_FILEOFF_MAX(a, b);
1339}
1340#endif
1341
1342#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MIN)
1343xfs_fileoff_t
1344xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b)
1345{
1346 return XFS_FILEOFF_MIN(a, b);
1347}
1348#endif
1349
1350#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_SANITY_CHECK)
1351int
1352xfs_fsb_sanity_check(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1353{
1354 return XFS_FSB_SANITY_CHECK(mp, fsbno);
1355}
1356#endif
1357
1358#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGBNO)
1359xfs_agblock_t
1360xfs_fsb_to_agbno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1361{
1362 return XFS_FSB_TO_AGBNO(mp, fsbno);
1363}
1364#endif
1365
1366#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGNO)
1367xfs_agnumber_t
1368xfs_fsb_to_agno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1369{
1370 return XFS_FSB_TO_AGNO(mp, fsbno);
1371}
1372#endif
1373
1374#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DADDR)
1375xfs_daddr_t
1376xfs_fsb_to_daddr(xfs_mount_t *mp, xfs_fsblock_t fsbno)
1377{
1378 return XFS_FSB_TO_DADDR(mp, fsbno);
1379}
1380#endif
1381
1382#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DB)
1383xfs_daddr_t
1384xfs_fsb_to_db(xfs_inode_t *ip, xfs_fsblock_t fsb)
1385{
1386 return XFS_FSB_TO_DB(ip, fsb);
1387}
1388#endif
1389
1390#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_HDR_BLOCK)
1391xfs_agblock_t
1392xfs_hdr_block(xfs_mount_t *mp, xfs_daddr_t d)
1393{
1394 return XFS_HDR_BLOCK(mp, d);
1395}
1396#endif
1397
1398#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_BLOCKS)
1399xfs_extlen_t
1400xfs_ialloc_blocks(xfs_mount_t *mp)
1401{
1402 return XFS_IALLOC_BLOCKS(mp);
1403}
1404#endif
1405
1406#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_FIND_FREE)
1407int
1408xfs_ialloc_find_free(xfs_inofree_t *fp)
1409{
1410 return XFS_IALLOC_FIND_FREE(fp);
1411}
1412#endif
1413
1414#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_INODES)
1415int
1416xfs_ialloc_inodes(xfs_mount_t *mp)
1417{
1418 return XFS_IALLOC_INODES(mp);
1419}
1420#endif
1421
1422#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IBT_BLOCK)
1423xfs_agblock_t
1424xfs_ibt_block(xfs_mount_t *mp)
1425{
1426 return XFS_IBT_BLOCK(mp);
1427}
1428#endif
1429
1430#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_ASIZE)
1431int
1432xfs_ifork_asize(xfs_inode_t *ip)
1433{
1434 return XFS_IFORK_ASIZE(ip);
1435}
1436#endif
1437
1438#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_DSIZE)
1439int
1440xfs_ifork_dsize(xfs_inode_t *ip)
1441{
1442 return XFS_IFORK_DSIZE(ip);
1443}
1444#endif
1445
1446#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FMT_SET)
1447void
1448xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n)
1449{
1450 XFS_IFORK_FMT_SET(ip, w, n);
1451}
1452#endif
1453
1454#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FORMAT)
1455int
1456xfs_ifork_format(xfs_inode_t *ip, int w)
1457{
1458 return XFS_IFORK_FORMAT(ip, w);
1459}
1460#endif
1461
1462#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXT_SET)
1463void
1464xfs_ifork_next_set(xfs_inode_t *ip, int w, int n)
1465{
1466 XFS_IFORK_NEXT_SET(ip, w, n);
1467}
1468#endif
1469
1470#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXTENTS)
1471int
1472xfs_ifork_nextents(xfs_inode_t *ip, int w)
1473{
1474 return XFS_IFORK_NEXTENTS(ip, w);
1475}
1476#endif
1477
1478#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_PTR)
1479xfs_ifork_t *
1480xfs_ifork_ptr(xfs_inode_t *ip, int w)
1481{
1482 return XFS_IFORK_PTR(ip, w);
1483}
1484#endif
1485
1486#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_Q)
1487int
1488xfs_ifork_q(xfs_inode_t *ip)
1489{
1490 return XFS_IFORK_Q(ip);
1491}
1492#endif
1493
1494#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_SIZE)
1495int
1496xfs_ifork_size(xfs_inode_t *ip, int w)
1497{
1498 return XFS_IFORK_SIZE(ip, w);
1499}
1500#endif
1501
1502#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FBROOT)
1503int
1504xfs_ilog_fbroot(int w)
1505{
1506 return XFS_ILOG_FBROOT(w);
1507}
1508#endif
1509
1510#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FDATA)
1511int
1512xfs_ilog_fdata(int w)
1513{
1514 return XFS_ILOG_FDATA(w);
1515}
1516#endif
1517
1518#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FEXT)
1519int
1520xfs_ilog_fext(int w)
1521{
1522 return XFS_ILOG_FEXT(w);
1523}
1524#endif
1525
1526#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IN_MAXLEVELS)
1527int
1528xfs_in_maxlevels(xfs_mount_t *mp)
1529{
1530 return XFS_IN_MAXLEVELS(mp);
1531}
1532#endif
1533
1534#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGBNO_BITS)
1535int
1536xfs_ino_agbno_bits(xfs_mount_t *mp)
1537{
1538 return XFS_INO_AGBNO_BITS(mp);
1539}
1540#endif
1541
1542#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGINO_BITS)
1543int
1544xfs_ino_agino_bits(xfs_mount_t *mp)
1545{
1546 return XFS_INO_AGINO_BITS(mp);
1547}
1548#endif
1549
1550#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGNO_BITS)
1551int
1552xfs_ino_agno_bits(xfs_mount_t *mp)
1553{
1554 return XFS_INO_AGNO_BITS(mp);
1555}
1556#endif
1557
1558#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_BITS)
1559int
1560xfs_ino_bits(xfs_mount_t *mp)
1561{
1562 return XFS_INO_BITS(mp);
1563}
1564#endif
1565
1566#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_MASK)
1567__uint32_t
1568xfs_ino_mask(int k)
1569{
1570 return XFS_INO_MASK(k);
1571}
1572#endif
1573
1574#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_OFFSET_BITS)
1575int
1576xfs_ino_offset_bits(xfs_mount_t *mp)
1577{
1578 return XFS_INO_OFFSET_BITS(mp);
1579}
1580#endif
1581
1582#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGBNO)
1583xfs_agblock_t
1584xfs_ino_to_agbno(xfs_mount_t *mp, xfs_ino_t i)
1585{
1586 return XFS_INO_TO_AGBNO(mp, i);
1587}
1588#endif
1589
1590#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGINO)
1591xfs_agino_t
1592xfs_ino_to_agino(xfs_mount_t *mp, xfs_ino_t i)
1593{
1594 return XFS_INO_TO_AGINO(mp, i);
1595}
1596#endif
1597
1598#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGNO)
1599xfs_agnumber_t
1600xfs_ino_to_agno(xfs_mount_t *mp, xfs_ino_t i)
1601{
1602 return XFS_INO_TO_AGNO(mp, i);
1603}
1604#endif
1605
1606#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_FSB)
1607xfs_fsblock_t
1608xfs_ino_to_fsb(xfs_mount_t *mp, xfs_ino_t i)
1609{
1610 return XFS_INO_TO_FSB(mp, i);
1611}
1612#endif
1613
1614#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_OFFSET)
1615int
1616xfs_ino_to_offset(xfs_mount_t *mp, xfs_ino_t i)
1617{
1618 return XFS_INO_TO_OFFSET(mp, i);
1619}
1620#endif
1621
1622#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
1623int
1624xfs_inobt_block_maxrecs(int lev, xfs_btree_cur_t *cur)
1625{
1626 return XFS_INOBT_BLOCK_MAXRECS(lev, cur);
1627}
1628#endif
1629
1630#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MINRECS)
1631int
1632xfs_inobt_block_minrecs(int lev, xfs_btree_cur_t *cur)
1633{
1634 return XFS_INOBT_BLOCK_MINRECS(lev, cur);
1635}
1636#endif
1637
1638#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_SIZE)
1639/*ARGSUSED1*/
1640int
1641xfs_inobt_block_size(int lev, xfs_btree_cur_t *cur)
1642{
1643 return XFS_INOBT_BLOCK_SIZE(lev, cur);
1644}
1645#endif
1646
1647#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_CLR_FREE)
1648void
1649xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i)
1650{
1651 XFS_INOBT_CLR_FREE(rp, i);
1652}
1653#endif
1654
1655#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_FREE)
1656int
1657xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i)
1658{
1659 return XFS_INOBT_IS_FREE(rp, i);
1660}
1661#endif
1662
1663#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_LAST_REC)
1664int
1665xfs_inobt_is_last_rec(xfs_btree_cur_t *cur)
1666{
1667 return XFS_INOBT_IS_LAST_REC(cur);
1668}
1669#endif
1670
1671#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_KEY_ADDR)
1672/*ARGSUSED3*/
1673xfs_inobt_key_t *
1674xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1675{
1676 return XFS_INOBT_KEY_ADDR(bb, i, cur);
1677}
1678#endif
1679
1680#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASK)
1681xfs_inofree_t
1682xfs_inobt_mask(int i)
1683{
1684 return XFS_INOBT_MASK(i);
1685}
1686#endif
1687
1688#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASKN)
1689xfs_inofree_t
1690xfs_inobt_maskn(int i, int n)
1691{
1692 return XFS_INOBT_MASKN(i, n);
1693}
1694#endif
1695
1696#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_PTR_ADDR)
1697xfs_inobt_ptr_t *
1698xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1699{
1700 return XFS_INOBT_PTR_ADDR(bb, i, cur);
1701}
1702#endif
1703
1704#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_REC_ADDR)
1705/*ARGSUSED3*/
1706xfs_inobt_rec_t *
1707xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
1708{
1709 return XFS_INOBT_REC_ADDR(bb, i, cur);
1710}
1711#endif
1712
1713#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_SET_FREE)
1714void
1715xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i)
1716{
1717 XFS_INOBT_SET_FREE(rp, i);
1718}
1719#endif
1720
1721#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOBHV)
1722bhv_desc_t *
1723xfs_itobhv(xfs_inode_t *ip)
1724{
1725 return XFS_ITOBHV(ip);
1726}
1727#endif
1728
1729#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOV)
1730vnode_t *
1731xfs_itov(xfs_inode_t *ip)
1732{
1733 return XFS_ITOV(ip);
1734}
1735#endif
1736
1737#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBLOG)
1738int
1739xfs_lblog(xfs_mount_t *mp)
1740{
1741 return XFS_LBLOG(mp);
1742}
1743#endif
1744
1745#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBSIZE)
1746int
1747xfs_lbsize(xfs_mount_t *mp)
1748{
1749 return XFS_LBSIZE(mp);
1750}
1751#endif
1752
1753#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ALL_FREE)
1754void
1755xfs_lic_all_free(xfs_log_item_chunk_t *cp)
1756{
1757 XFS_LIC_ALL_FREE(cp);
1758}
1759#endif
1760
1761#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ARE_ALL_FREE)
1762int
1763xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
1764{
1765 return XFS_LIC_ARE_ALL_FREE(cp);
1766}
1767#endif
1768
1769#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_CLAIM)
1770void
1771xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
1772{
1773 XFS_LIC_CLAIM(cp, slot);
1774}
1775#endif
1776
1777#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_CHUNK)
1778xfs_log_item_chunk_t *
1779xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
1780{
1781 return XFS_LIC_DESC_TO_CHUNK(dp);
1782}
1783#endif
1784
1785#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_SLOT)
1786int
1787xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
1788{
1789 return XFS_LIC_DESC_TO_SLOT(dp);
1790}
1791#endif
1792
1793#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT)
1794void
1795xfs_lic_init(xfs_log_item_chunk_t *cp)
1796{
1797 XFS_LIC_INIT(cp);
1798}
1799#endif
1800
1801#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT_SLOT)
1802void
1803xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
1804{
1805 XFS_LIC_INIT_SLOT(cp, slot);
1806}
1807#endif
1808
1809#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ISFREE)
1810int
1811xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
1812{
1813 return XFS_LIC_ISFREE(cp, slot);
1814}
1815#endif
1816
1817#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_RELSE)
1818void
1819xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
1820{
1821 XFS_LIC_RELSE(cp, slot);
1822}
1823#endif
1824
1825#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_SLOT)
1826xfs_log_item_desc_t *
1827xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
1828{
1829 return XFS_LIC_SLOT(cp, slot);
1830}
1831#endif
1832
1833#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_VACANCY)
1834int
1835xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
1836{
1837 return XFS_LIC_VACANCY(cp);
1838}
1839#endif
1840
1841#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LITINO)
1842int
1843xfs_litino(xfs_mount_t *mp)
1844{
1845 return XFS_LITINO(mp);
1846}
1847#endif
1848
1849#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MAKE_IPTR)
1850xfs_dinode_t *
1851xfs_make_iptr(xfs_mount_t *mp, xfs_buf_t *b, int o)
1852{
1853 return XFS_MAKE_IPTR(mp, b, o);
1854}
1855#endif
1856
1857#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32HI)
1858__uint32_t
1859xfs_mask32hi(int n)
1860{
1861 return XFS_MASK32HI(n);
1862}
1863#endif
1864
1865#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32LO)
1866__uint32_t
1867xfs_mask32lo(int n)
1868{
1869 return XFS_MASK32LO(n);
1870}
1871#endif
1872
1873#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64HI)
1874__uint64_t
1875xfs_mask64hi(int n)
1876{
1877 return XFS_MASK64HI(n);
1878}
1879#endif
1880
1881#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64LO)
1882__uint64_t
1883xfs_mask64lo(int n)
1884{
1885 return XFS_MASK64LO(n);
1886}
1887#endif
1888
1889#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST)
1890int
1891xfs_min_freelist(xfs_agf_t *a, xfs_mount_t *mp)
1892{
1893 return XFS_MIN_FREELIST(a, mp);
1894}
1895#endif
1896
1897#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_PAG)
1898int
1899xfs_min_freelist_pag(xfs_perag_t *pag, xfs_mount_t *mp)
1900{
1901 return XFS_MIN_FREELIST_PAG(pag, mp);
1902}
1903#endif
1904
1905#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_RAW)
1906int
1907xfs_min_freelist_raw(uint bl, uint cl, xfs_mount_t *mp)
1908{
1909 return XFS_MIN_FREELIST_RAW(bl, cl, mp);
1910}
1911#endif
1912
1913#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MTOVFS)
1914vfs_t *
1915xfs_mtovfs(xfs_mount_t *mp)
1916{
1917 return XFS_MTOVFS(mp);
1918}
1919#endif
1920
1921#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_OFFBNO_TO_AGINO)
1922xfs_agino_t
1923xfs_offbno_to_agino(xfs_mount_t *mp, xfs_agblock_t b, int o)
1924{
1925 return XFS_OFFBNO_TO_AGINO(mp, b, o);
1926}
1927#endif
1928
1929#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_PREALLOC_BLOCKS)
1930xfs_agblock_t
1931xfs_prealloc_blocks(xfs_mount_t *mp)
1932{
1933 return XFS_PREALLOC_BLOCKS(mp);
1934}
1935#endif
1936
1937#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_BLOCK)
1938xfs_agblock_t
1939xfs_sb_block(xfs_mount_t *mp)
1940{
1941 return XFS_SB_BLOCK(mp);
1942}
1943#endif
1944
1945#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_GOOD_VERSION)
1946int
1947xfs_sb_good_version(xfs_sb_t *sbp)
1948{
1949 return XFS_SB_GOOD_VERSION(sbp);
1950}
1951#endif
1952
1953#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDATTR)
1954void
1955xfs_sb_version_addattr(xfs_sb_t *sbp)
1956{
1957 XFS_SB_VERSION_ADDATTR(sbp);
1958}
1959#endif
1960
1961#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDDALIGN)
1962void
1963xfs_sb_version_adddalign(xfs_sb_t *sbp)
1964{
1965 XFS_SB_VERSION_ADDDALIGN(sbp);
1966}
1967#endif
1968
1969#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDNLINK)
1970void
1971xfs_sb_version_addnlink(xfs_sb_t *sbp)
1972{
1973 XFS_SB_VERSION_ADDNLINK(sbp);
1974}
1975#endif
1976
1977#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDQUOTA)
1978void
1979xfs_sb_version_addquota(xfs_sb_t *sbp)
1980{
1981 XFS_SB_VERSION_ADDQUOTA(sbp);
1982}
1983#endif
1984
1985#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDSHARED)
1986void
1987xfs_sb_version_addshared(xfs_sb_t *sbp)
1988{
1989 XFS_SB_VERSION_ADDSHARED(sbp);
1990}
1991#endif
1992
1993#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASALIGN)
1994int
1995xfs_sb_version_hasalign(xfs_sb_t *sbp)
1996{
1997 return XFS_SB_VERSION_HASALIGN(sbp);
1998}
1999#endif
2000
2001#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASATTR)
2002int
2003xfs_sb_version_hasattr(xfs_sb_t *sbp)
2004{
2005 return XFS_SB_VERSION_HASATTR(sbp);
2006}
2007#endif
2008
2009#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDALIGN)
2010int
2011xfs_sb_version_hasdalign(xfs_sb_t *sbp)
2012{
2013 return XFS_SB_VERSION_HASDALIGN(sbp);
2014}
2015#endif
2016
2017#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDIRV2)
2018int
2019xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
2020{
2021 return XFS_SB_VERSION_HASDIRV2(sbp);
2022}
2023#endif
2024
2025#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
2026int
2027xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
2028{
2029 return XFS_SB_VERSION_HASEXTFLGBIT(sbp);
2030}
2031#endif
2032
2033#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASNLINK)
2034int
2035xfs_sb_version_hasnlink(xfs_sb_t *sbp)
2036{
2037 return XFS_SB_VERSION_HASNLINK(sbp);
2038}
2039#endif
2040
2041#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASQUOTA)
2042int
2043xfs_sb_version_hasquota(xfs_sb_t *sbp)
2044{
2045 return XFS_SB_VERSION_HASQUOTA(sbp);
2046}
2047#endif
2048
2049#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSHARED)
2050int
2051xfs_sb_version_hasshared(xfs_sb_t *sbp)
2052{
2053 return XFS_SB_VERSION_HASSHARED(sbp);
2054}
2055#endif
2056
2057#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_NUM)
2058int
2059xfs_sb_version_num(xfs_sb_t *sbp)
2060{
2061 return XFS_SB_VERSION_NUM(sbp);
2062}
2063#endif
2064
2065#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBALIGN)
2066void
2067xfs_sb_version_subalign(xfs_sb_t *sbp)
2068{
2069 XFS_SB_VERSION_SUBALIGN(sbp);
2070}
2071#endif
2072
2073#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBSHARED)
2074void
2075xfs_sb_version_subshared(xfs_sb_t *sbp)
2076{
2077 XFS_SB_VERSION_SUBSHARED(sbp);
2078}
2079#endif
2080
2081#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASLOGV2)
2082int
2083xfs_sb_version_haslogv2(xfs_sb_t *sbp)
2084{
2085 return XFS_SB_VERSION_HASLOGV2(sbp);
2086}
2087#endif
2088
2089#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSECTOR)
2090int
2091xfs_sb_version_hassector(xfs_sb_t *sbp)
2092{
2093 return XFS_SB_VERSION_HASSECTOR(sbp);
2094}
2095#endif
2096
2097#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TONEW)
2098unsigned
2099xfs_sb_version_tonew(unsigned v)
2100{
2101 return XFS_SB_VERSION_TONEW(v);
2102}
2103#endif
2104
2105#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TOOLD)
2106unsigned
2107xfs_sb_version_toold(unsigned v)
2108{
2109 return XFS_SB_VERSION_TOOLD(v);
2110}
2111#endif
2112
2113#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_ADD_SPACE)
2114void
2115xlog_grant_add_space(xlog_t *log, int bytes, int type)
2116{
2117 XLOG_GRANT_ADD_SPACE(log, bytes, type);
2118}
2119#endif
2120
2121#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_SUB_SPACE)
2122void
2123xlog_grant_sub_space(xlog_t *log, int bytes, int type)
2124{
2125 XLOG_GRANT_SUB_SPACE(log, bytes, type);
2126}
2127#endif
2128
2129#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASMOREBITS)
2130int
2131xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
2132{
2133 return XFS_SB_VERSION_HASMOREBITS(sbp);
2134}
2135#endif
2136
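/*
 * Every wrapper above follows one pattern; for a hypothetical macro
 * XFS_FOO(mp) (not an actual XFS symbol), the out-of-line version is
 * compiled either when all functions are wanted (XFS_WANT_FUNCS_C) or
 * when a space-saving build opts this macro in through its
 * XFSSO_XFS_FOO switch:
 *
 *	#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FOO)
 *	int
 *	xfs_foo(xfs_mount_t *mp)
 *	{
 *		return XFS_FOO(mp);
 *	}
 *	#endif
 *
 * The body is nothing but the macro expansion, so behaviour is
 * identical either way; only the inlining decision changes.
 */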
diff --git a/fs/xfs/xfs_macros.h b/fs/xfs/xfs_macros.h
new file mode 100644
index 000000000000..0a9307514a48
--- /dev/null
+++ b/fs/xfs/xfs_macros.h
@@ -0,0 +1,104 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MACROS_H__
33#define __XFS_MACROS_H__
34
35/*
36 * Set for debug kernels and simulation
37 * These replacements save space.
38 * Used in xfs_macros.c.
39 */
40#define XFS_WANT_SPACE_C \
41 (!defined(_STANDALONE) && defined(DEBUG))
42
43/*
44 * Set for debug simulation and kernel builds, but not for standalone.
45 * These replacements do not save space.
46 * Used in xfs_macros.c.
47 */
48#define XFS_WANT_FUNCS_C \
49 (!defined(_STANDALONE) && defined(DEBUG))
50
51/*
52 * Corresponding names used in .h files.
53 */
54#define XFS_WANT_SPACE (XFS_WANT_SPACE_C && !defined(XFS_MACRO_C))
55#define XFS_WANT_FUNCS (XFS_WANT_FUNCS_C && !defined(XFS_MACRO_C))
56
57/*
58 * These are the macros that get turned into functions to save space.
59 */
60#define XFSSO_NULLSTARTBLOCK 1
61#define XFSSO_XFS_AGB_TO_DADDR 1
62#define XFSSO_XFS_AGB_TO_FSB 1
63#define XFSSO_XFS_AGINO_TO_INO 1
64#define XFSSO_XFS_ALLOC_BLOCK_MINRECS 1
65#define XFSSO_XFS_ATTR_SF_NEXTENTRY 1
66#define XFSSO_XFS_BMAP_BLOCK_DMAXRECS 1
67#define XFSSO_XFS_BMAP_BLOCK_IMAXRECS 1
68#define XFSSO_XFS_BMAP_BLOCK_IMINRECS 1
69#define XFSSO_XFS_BMAP_INIT 1
70#define XFSSO_XFS_BMAP_PTR_IADDR 1
71#define XFSSO_XFS_BMAP_SANITY_CHECK 1
72#define XFSSO_XFS_BMAPI_AFLAG 1
73#define XFSSO_XFS_CFORK_SIZE 1
74#define XFSSO_XFS_DA_COOKIE_BNO 1
75#define XFSSO_XFS_DA_COOKIE_ENTRY 1
76#define XFSSO_XFS_DADDR_TO_AGBNO 1
77#define XFSSO_XFS_DADDR_TO_FSB 1
78#define XFSSO_XFS_DFORK_PTR 1
79#define XFSSO_XFS_DIR_SF_GET_DIRINO 1
80#define XFSSO_XFS_DIR_SF_NEXTENTRY 1
81#define XFSSO_XFS_DIR_SF_PUT_DIRINO 1
82#define XFSSO_XFS_FILBLKS_MIN 1
83#define XFSSO_XFS_FSB_SANITY_CHECK 1
84#define XFSSO_XFS_FSB_TO_DADDR 1
85#define XFSSO_XFS_FSB_TO_DB 1
86#define XFSSO_XFS_IALLOC_INODES 1
87#define XFSSO_XFS_IFORK_ASIZE 1
88#define XFSSO_XFS_IFORK_DSIZE 1
89#define XFSSO_XFS_IFORK_FORMAT 1
90#define XFSSO_XFS_IFORK_NEXT_SET 1
91#define XFSSO_XFS_IFORK_NEXTENTS 1
92#define XFSSO_XFS_IFORK_PTR 1
93#define XFSSO_XFS_ILOG_FBROOT 1
94#define XFSSO_XFS_ILOG_FEXT 1
95#define XFSSO_XFS_INO_MASK 1
96#define XFSSO_XFS_INO_TO_FSB 1
97#define XFSSO_XFS_INODE_CLEAR_READ_AHEAD 1
98#define XFSSO_XFS_MIN_FREELIST 1
99#define XFSSO_XFS_SB_GOOD_VERSION 1
100#define XFSSO_XFS_SB_VERSION_HASNLINK 1
101#define XFSSO_XLOG_GRANT_ADD_SPACE 1
102#define XFSSO_XLOG_GRANT_SUB_SPACE 1
103
104#endif /* __XFS_MACROS_H__ */
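/*
 * A sketch of how a header is expected to consume these switches
 * (XFS_FOO/xfs_foo are again hypothetical names): when XFS_WANT_FUNCS
 * or this macro's space switch is in effect, the macro resolves to the
 * out-of-line function built in xfs_macros.c; otherwise it expands in
 * place as usual:
 *
 *	#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FOO)
 *	int xfs_foo(xfs_mount_t *mp);
 *	#define XFS_FOO(mp)	xfs_foo(mp)
 *	#else
 *	#define XFS_FOO(mp)	((mp)->m_foo)
 *	#endif
 */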
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000000..b57423caef9b
--- /dev/null
+++ b/fs/xfs/xfs_mount.c
@@ -0,0 +1,1586 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_btree.h"
49#include "xfs_ialloc.h"
50#include "xfs_attr_sf.h"
51#include "xfs_dir_sf.h"
52#include "xfs_dir2_sf.h"
53#include "xfs_dinode.h"
54#include "xfs_inode.h"
55#include "xfs_alloc.h"
56#include "xfs_rtalloc.h"
57#include "xfs_bmap.h"
58#include "xfs_error.h"
59#include "xfs_bit.h"
60#include "xfs_rw.h"
61#include "xfs_quota.h"
62#include "xfs_fsops.h"
63
64STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
65STATIC int xfs_uuid_mount(xfs_mount_t *);
66STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
67
68static struct {
69 short offset;
70 short type; /* 0 = integer
71 * 1 = binary / string (no translation)
72 */
73} xfs_sb_info[] = {
74 { offsetof(xfs_sb_t, sb_magicnum), 0 },
75 { offsetof(xfs_sb_t, sb_blocksize), 0 },
76 { offsetof(xfs_sb_t, sb_dblocks), 0 },
77 { offsetof(xfs_sb_t, sb_rblocks), 0 },
78 { offsetof(xfs_sb_t, sb_rextents), 0 },
79 { offsetof(xfs_sb_t, sb_uuid), 1 },
80 { offsetof(xfs_sb_t, sb_logstart), 0 },
81 { offsetof(xfs_sb_t, sb_rootino), 0 },
82 { offsetof(xfs_sb_t, sb_rbmino), 0 },
83 { offsetof(xfs_sb_t, sb_rsumino), 0 },
84 { offsetof(xfs_sb_t, sb_rextsize), 0 },
85 { offsetof(xfs_sb_t, sb_agblocks), 0 },
86 { offsetof(xfs_sb_t, sb_agcount), 0 },
87 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
88 { offsetof(xfs_sb_t, sb_logblocks), 0 },
89 { offsetof(xfs_sb_t, sb_versionnum), 0 },
90 { offsetof(xfs_sb_t, sb_sectsize), 0 },
91 { offsetof(xfs_sb_t, sb_inodesize), 0 },
92 { offsetof(xfs_sb_t, sb_inopblock), 0 },
93 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
94 { offsetof(xfs_sb_t, sb_blocklog), 0 },
95 { offsetof(xfs_sb_t, sb_sectlog), 0 },
96 { offsetof(xfs_sb_t, sb_inodelog), 0 },
97 { offsetof(xfs_sb_t, sb_inopblog), 0 },
98 { offsetof(xfs_sb_t, sb_agblklog), 0 },
99 { offsetof(xfs_sb_t, sb_rextslog), 0 },
100 { offsetof(xfs_sb_t, sb_inprogress), 0 },
101 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
102 { offsetof(xfs_sb_t, sb_icount), 0 },
103 { offsetof(xfs_sb_t, sb_ifree), 0 },
104 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
105 { offsetof(xfs_sb_t, sb_frextents), 0 },
106 { offsetof(xfs_sb_t, sb_uquotino), 0 },
107 { offsetof(xfs_sb_t, sb_gquotino), 0 },
108 { offsetof(xfs_sb_t, sb_qflags), 0 },
109 { offsetof(xfs_sb_t, sb_flags), 0 },
110 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
111 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
112 { offsetof(xfs_sb_t, sb_unit), 0 },
113 { offsetof(xfs_sb_t, sb_width), 0 },
114 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
115 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
116 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
117 { offsetof(xfs_sb_t, sb_logsunit), 0 },
118 { offsetof(xfs_sb_t, sb_features2), 0 },
119 { sizeof(xfs_sb_t), 0 }
120};
121
122/*
123 * Return a pointer to an initialized xfs_mount structure.
124 */
125xfs_mount_t *
126xfs_mount_init(void)
127{
128 xfs_mount_t *mp;
129
130 mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
131
132 AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
133 spinlock_init(&mp->m_sb_lock, "xfs_sb");
134 mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock");
135 initnsema(&mp->m_growlock, 1, "xfs_grow");
136 /*
137 * Initialize the AIL.
138 */
139 xfs_trans_ail_init(mp);
140
141 atomic_set(&mp->m_active_trans, 0);
142
143 return mp;
144}
145
146/*
147 * Free up the resources associated with a mount structure. Assume that
148 * the structure was initially zeroed, so we can tell which fields got
149 * initialized.
150 */
151void
152xfs_mount_free(
153 xfs_mount_t *mp,
154 int remove_bhv)
155{
156 if (mp->m_ihash)
157 xfs_ihash_free(mp);
158 if (mp->m_chash)
159 xfs_chash_free(mp);
160
161 if (mp->m_perag) {
162 int agno;
163
164 for (agno = 0; agno < mp->m_maxagi; agno++)
165 if (mp->m_perag[agno].pagb_list)
166 kmem_free(mp->m_perag[agno].pagb_list,
167 sizeof(xfs_perag_busy_t) *
168 XFS_PAGB_NUM_SLOTS);
169 kmem_free(mp->m_perag,
170 sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
171 }
172
173 AIL_LOCK_DESTROY(&mp->m_ail_lock);
174 spinlock_destroy(&mp->m_sb_lock);
175 mutex_destroy(&mp->m_ilock);
176 freesema(&mp->m_growlock);
177 if (mp->m_quotainfo)
178 XFS_QM_DONE(mp);
179
180 if (mp->m_fsname != NULL)
181 kmem_free(mp->m_fsname, mp->m_fsname_len);
182
183 if (remove_bhv) {
184 struct vfs *vfsp = XFS_MTOVFS(mp);
185
186 bhv_remove_all_vfsops(vfsp, 0);
187 VFS_REMOVEBHV(vfsp, &mp->m_bhv);
188 }
189
190 kmem_free(mp, sizeof(xfs_mount_t));
191}
192
193
194/*
195 * Check the validity of the SB found.
196 */
197STATIC int
198xfs_mount_validate_sb(
199 xfs_mount_t *mp,
200 xfs_sb_t *sbp)
201{
202 /*
203 * If the log device and data device have the
204 * same device number, the log is internal.
205 * Consequently, the sb_logstart should be non-zero. If
206 * we have a zero sb_logstart in this case, we may be trying to mount
207 * a volume filesystem in a non-volume manner.
208 */
209 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
210 cmn_err(CE_WARN, "XFS: bad magic number");
211 return XFS_ERROR(EWRONGFS);
212 }
213
214 if (!XFS_SB_GOOD_VERSION(sbp)) {
215 cmn_err(CE_WARN, "XFS: bad version");
216 return XFS_ERROR(EWRONGFS);
217 }
218
219 if (unlikely(
220 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
221 cmn_err(CE_WARN,
222 "XFS: filesystem is marked as having an external log; "
223 "specify logdev on the\nmount command line.");
224 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)",
225 XFS_ERRLEVEL_HIGH, mp, sbp);
226 return XFS_ERROR(EFSCORRUPTED);
227 }
228
229 if (unlikely(
230 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
231 cmn_err(CE_WARN,
232 "XFS: filesystem is marked as having an internal log; "
233 "don't specify logdev on\nthe mount command line.");
234 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)",
235 XFS_ERRLEVEL_HIGH, mp, sbp);
236 return XFS_ERROR(EFSCORRUPTED);
237 }
238
239 /*
240 * More sanity checking. These were stolen directly from
241 * xfs_repair.
242 */
243 if (unlikely(
244 sbp->sb_agcount <= 0 ||
245 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
246 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
247 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
248 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
249 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
250 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
251 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
252 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
253 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
254 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
255 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
256 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
257 sbp->sb_imax_pct > 100)) {
258 cmn_err(CE_WARN, "XFS: SB sanity check 1 failed");
259 XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)",
260 XFS_ERRLEVEL_LOW, mp, sbp);
261 return XFS_ERROR(EFSCORRUPTED);
262 }
263
264 /*
265 * Sanity check AG count, size fields against data size field
266 */
267 if (unlikely(
268 sbp->sb_dblocks == 0 ||
269 sbp->sb_dblocks >
270 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
271 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
272 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
273 cmn_err(CE_WARN, "XFS: SB sanity check 2 failed");
274 XFS_ERROR_REPORT("xfs_mount_validate_sb(4)",
275 XFS_ERRLEVEL_LOW, mp);
276 return XFS_ERROR(EFSCORRUPTED);
277 }
278
279 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
280 ASSERT(sbp->sb_blocklog >= BBSHIFT);
281
282#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
283 if (unlikely(
284 (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
285 (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
286#else /* Limited by UINT_MAX of sectors */
287 if (unlikely(
288 (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
289 (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
290#endif
291 cmn_err(CE_WARN,
292 "XFS: File system is too large to be mounted on this system.");
293 return XFS_ERROR(E2BIG);
294 }
295
296 if (unlikely(sbp->sb_inprogress)) {
297 cmn_err(CE_WARN, "XFS: file system busy");
298 XFS_ERROR_REPORT("xfs_mount_validate_sb(5)",
299 XFS_ERRLEVEL_LOW, mp);
300 return XFS_ERROR(EFSCORRUPTED);
301 }
302
303 /*
304 * Until this is fixed, only page-sized or smaller data blocks work.
305 */
306 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
307 cmn_err(CE_WARN,
308 "XFS: Attempted to mount file system with blocksize %d bytes",
309 sbp->sb_blocksize);
310 cmn_err(CE_WARN,
311 "XFS: Only page-sized (%d) or less blocksizes currently work.",
312 PAGE_SIZE);
313 return XFS_ERROR(ENOSYS);
314 }
315
316 return 0;
317}
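/*
 * To make the size check above concrete (a worked example assuming
 * 512-byte basic blocks, i.e. BBSHIFT == 9): without XFS_BIG_BLKNOS
 * the data device is addressed as at most UINT_MAX 512-byte sectors,
 * so with 4k blocks (sb_blocklog == 12)
 *
 *	max sb_dblocks = UINT_MAX >> (12 - 9) = ~2^29 blocks = ~2 TiB
 *
 * With XFS_BIG_BLKNOS the limit comes instead from the page cache
 * index, ULONG_MAX pages, hence the shift by PAGE_SHIFT - blocklog.
 */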
318
319xfs_agnumber_t
320xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount)
321{
322 xfs_agnumber_t index, max_metadata;
323 xfs_perag_t *pag;
324 xfs_agino_t agino;
325 xfs_ino_t ino;
326 xfs_sb_t *sbp = &mp->m_sb;
327 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
328
329 /* Check to see whether inode numbers on this filesystem can overflow 32 bits */
330 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
331 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
332
333 /* Clear the mount flag if no inode can overflow 32 bits
334 * on this filesystem, or if specifically requested.
335 */
336 if ((mp->m_flags & XFS_MOUNT_32BITINOOPT) && ino > max_inum) {
337 mp->m_flags |= XFS_MOUNT_32BITINODES;
338 } else {
339 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
340 }
341
342 /* If we can overflow, then set up the ag headers accordingly */
343 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
344 /* Calculate how much should be reserved for inodes to
345 * meet the max inode percentage.
346 */
347 if (mp->m_maxicount) {
348 __uint64_t icount;
349
350 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
351 do_div(icount, 100);
352 icount += sbp->sb_agblocks - 1;
353 do_div(icount, mp->m_ialloc_blks);
354 max_metadata = icount;
355 } else {
356 max_metadata = agcount;
357 }
358 for (index = 0; index < agcount; index++) {
359 ino = XFS_AGINO_TO_INO(mp, index, agino);
360 if (ino > max_inum) {
361 index++;
362 break;
363 }
364
365 /* This ag is preferred for inodes */
366 pag = &mp->m_perag[index];
367 pag->pagi_inodeok = 1;
368 if (index < max_metadata)
369 pag->pagf_metadata = 1;
370 }
371 } else {
372 /* Set up default behavior for smaller filesystems */
373 for (index = 0; index < agcount; index++) {
374 pag = &mp->m_perag[index];
375 pag->pagi_inodeok = 1;
376 }
377 }
378 return index;
379}
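/*
 * An illustrative walk through the overflow test above (numbers chosen
 * for the example, not from any particular filesystem): an inode
 * number packs the ag number above an ag-relative inode number that is
 * sb_agblklog + sb_inopblog bits wide.  With sb_agblklog == 22 and
 * sb_inopblog == 4, agino occupies 26 bits, so
 *
 *	ino = ((xfs_ino_t)agno << 26) | agino;
 *
 * first exceeds XFS_MAXINUMBER_32 when agno reaches 2^(32 - 26) == 64,
 * and only ags below that boundary keep pagi_inodeok set.
 */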
380
381/*
382 * xfs_xlatesb
383 *
384 * data - on-disk version of sb
385 * sb - a superblock
386 * dir - conversion direction: <0 - convert sb to buf
387 * >0 - convert buf to sb
388 * fields - which fields to copy (bitmask)
389 */
390void
391xfs_xlatesb(
392 void *data,
393 xfs_sb_t *sb,
394 int dir,
395 __int64_t fields)
396{
397 xfs_caddr_t buf_ptr;
398 xfs_caddr_t mem_ptr;
399 xfs_sb_field_t f;
400 int first;
401 int size;
402
403 ASSERT(dir);
404 ASSERT(fields);
405
406 if (!fields)
407 return;
408
409 buf_ptr = (xfs_caddr_t)data;
410 mem_ptr = (xfs_caddr_t)sb;
411
412 while (fields) {
413 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
414 first = xfs_sb_info[f].offset;
415 size = xfs_sb_info[f + 1].offset - first;
416
417 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
418
419 if (size == 1 || xfs_sb_info[f].type == 1) {
420 if (dir > 0) {
421 memcpy(mem_ptr + first, buf_ptr + first, size);
422 } else {
423 memcpy(buf_ptr + first, mem_ptr + first, size);
424 }
425 } else {
426 switch (size) {
427 case 2:
428 INT_XLATE(*(__uint16_t*)(buf_ptr+first),
429 *(__uint16_t*)(mem_ptr+first),
430 dir, ARCH_CONVERT);
431 break;
432 case 4:
433 INT_XLATE(*(__uint32_t*)(buf_ptr+first),
434 *(__uint32_t*)(mem_ptr+first),
435 dir, ARCH_CONVERT);
436 break;
437 case 8:
438 INT_XLATE(*(__uint64_t*)(buf_ptr+first),
439 *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT);
440 break;
441 default:
442 ASSERT(0);
443 }
444 }
445
446 fields &= ~(1LL << f);
447 }
448}
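/*
 * Note that the field size used above is stored nowhere: it is derived
 * as the gap between consecutive offsets in xfs_sb_info, which is why
 * that table ends with a { sizeof(xfs_sb_t), 0 } sentinel.  The mount
 * path below translates every field from disk to memory:
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), &mp->m_sb, 1, XFS_SB_ALL_BITS);
 *
 * and a single-field translation the other way (in-core to buffer,
 * dir < 0) would look like, e.g. for the stripe unit:
 *
 *	xfs_xlatesb(XFS_BUF_PTR(bp), sbp, -1, XFS_SB_UNIT);
 */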
449
450/*
451 * xfs_readsb
452 *
453 * Does the initial read of the superblock.
454 */
455int
456xfs_readsb(xfs_mount_t *mp)
457{
458 unsigned int sector_size;
459 unsigned int extra_flags;
460 xfs_buf_t *bp;
461 xfs_sb_t *sbp;
462 int error;
463
464 ASSERT(mp->m_sb_bp == NULL);
465 ASSERT(mp->m_ddev_targp != NULL);
466
467 /*
468 * Allocate a (locked) buffer to hold the superblock.
469 * This will be kept around at all times to optimize
470 * access to the superblock.
471 */
472 sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
473 extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
474
475 bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
476 BTOBB(sector_size), extra_flags);
477 if (!bp || XFS_BUF_ISERROR(bp)) {
478 cmn_err(CE_WARN, "XFS: SB read failed");
479 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
480 goto fail;
481 }
482 ASSERT(XFS_BUF_ISBUSY(bp));
483 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
484
485 /*
486 * Initialize the mount structure from the superblock.
487 * But first do some basic consistency checking.
488 */
489 sbp = XFS_BUF_TO_SBP(bp);
490 xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS);
491
492 error = xfs_mount_validate_sb(mp, &(mp->m_sb));
493 if (error) {
494 cmn_err(CE_WARN, "XFS: SB validate failed");
495 goto fail;
496 }
497
498 /*
499 * We must be able to do sector-sized and sector-aligned IO.
500 */
501 if (sector_size > mp->m_sb.sb_sectsize) {
502 cmn_err(CE_WARN,
503 "XFS: device supports only %u byte sectors (not %u)",
504 sector_size, mp->m_sb.sb_sectsize);
505 error = ENOSYS;
506 goto fail;
507 }
508
509 /*
510 * If device sector size is smaller than the superblock size,
511 * re-read the superblock so the buffer is correctly sized.
512 */
513 if (sector_size < mp->m_sb.sb_sectsize) {
514 XFS_BUF_UNMANAGE(bp);
515 xfs_buf_relse(bp);
516 sector_size = mp->m_sb.sb_sectsize;
517 bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR,
518 BTOBB(sector_size), extra_flags);
519 if (!bp || XFS_BUF_ISERROR(bp)) {
520 cmn_err(CE_WARN, "XFS: SB re-read failed");
521 error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM;
522 goto fail;
523 }
524 ASSERT(XFS_BUF_ISBUSY(bp));
525 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
526 }
527
528 mp->m_sb_bp = bp;
529 xfs_buf_relse(bp);
530 ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
531 return 0;
532
533 fail:
534 if (bp) {
535 XFS_BUF_UNMANAGE(bp);
536 xfs_buf_relse(bp);
537 }
538 return error;
539}
540
541
542/*
543 * xfs_mount_common
544 *
545 * Mount initialization code establishing various mount
546 * fields from the superblock associated with the given
547 * mount structure.
548 */
549void
550xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
551{
552 int i;
553
554 mp->m_agfrotor = mp->m_agirotor = 0;
555 spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock");
556 mp->m_maxagi = mp->m_sb.sb_agcount;
557 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
558 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
559 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
560 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
561 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
562 mp->m_litino = sbp->sb_inodesize -
563 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
564 mp->m_blockmask = sbp->sb_blocksize - 1;
565 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
566 mp->m_blockwmask = mp->m_blockwsize - 1;
567 INIT_LIST_HEAD(&mp->m_del_inodes);
568
569 /*
570 * Setup for attributes, in case they get created.
571 * This value is for inodes getting attributes for the first time;
572 * the per-inode value is for old attribute values.
573 */
574 ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
575 switch (sbp->sb_inodesize) {
576 case 256:
577 mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2);
578 break;
579 case 512:
580 case 1024:
581 case 2048:
582 mp->m_attroffset = XFS_BMDR_SPACE_CALC(12);
583 break;
584 default:
585 ASSERT(0);
586 }
587 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
588
589 for (i = 0; i < 2; i++) {
590 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
591 xfs_alloc, i == 0);
592 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
593 xfs_alloc, i == 0);
594 }
595 for (i = 0; i < 2; i++) {
596 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
597 xfs_bmbt, i == 0);
598 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
599 xfs_bmbt, i == 0);
600 }
601 for (i = 0; i < 2; i++) {
602 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
603 xfs_inobt, i == 0);
604 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
605 xfs_inobt, i == 0);
606 }
607
608 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
609 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
610 sbp->sb_inopblock);
611 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
612}
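/*
 * A worked example for the inode allocation geometry just computed,
 * assuming 4k blocks, 256-byte inodes (sb_inopblock == 16,
 * sb_inopblog == 4) and the usual 64-inode chunk:
 *
 *	m_ialloc_inos = MAX(XFS_INODES_PER_CHUNK, 16) = 64
 *	m_ialloc_blks = 64 >> 4 = 4
 *
 * i.e. inodes are allocated in chunks spanning four fs blocks.
 */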
613/*
614 * xfs_mountfs
615 *
616 * This function does the following on an initial mount of a file system:
617 * - reads the superblock from disk and initializes the mount struct
618 * - if we're a 32-bit kernel, does a size check on the superblock
619 * so we don't mount terabyte filesystems
620 * - initializes the mount struct realtime fields
621 * - allocates the inode hash table for the fs
622 * - initializes the directory manager
623 * - performs recovery and initializes the log manager
624 */
625int
626xfs_mountfs(
627 vfs_t *vfsp,
628 xfs_mount_t *mp,
629 int mfsi_flags)
630{
631 xfs_buf_t *bp;
632 xfs_sb_t *sbp = &(mp->m_sb);
633 xfs_inode_t *rip;
634 vnode_t *rvp = NULL;
635 int readio_log, writeio_log;
636 xfs_daddr_t d;
637 __uint64_t ret64;
638 __int64_t update_flags;
639 uint quotamount, quotaflags;
640 int agno;
641 int uuid_mounted = 0;
642 int error = 0;
643
644 if (mp->m_sb_bp == NULL) {
645 if ((error = xfs_readsb(mp))) {
646 return (error);
647 }
648 }
649 xfs_mount_common(mp, sbp);
650
651 /*
652 * Check whether sb_agblocks is aligned at a stripe boundary.
653 * If sb_agblocks is NOT aligned, turn off m_dalign: allocator
654 * alignment is within an ag, therefore the ag itself has to
655 * be aligned at a stripe boundary.
656 */
657 update_flags = 0LL;
658 if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
659 /*
660 * If the stripe unit and stripe width are not multiples
661 * of the fs blocksize, turn off alignment.
662 */
663 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
664 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
665 if (mp->m_flags & XFS_MOUNT_RETERR) {
666 cmn_err(CE_WARN,
667 "XFS: alignment check 1 failed");
668 error = XFS_ERROR(EINVAL);
669 goto error1;
670 }
671 mp->m_dalign = mp->m_swidth = 0;
672 } else {
673 /*
674 * Convert the stripe unit and width to FSBs.
675 */
676 mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
677 if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
678 if (mp->m_flags & XFS_MOUNT_RETERR) {
679 error = XFS_ERROR(EINVAL);
680 goto error1;
681 }
682 xfs_fs_cmn_err(CE_WARN, mp,
683"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)",
684 mp->m_dalign, mp->m_swidth,
685 sbp->sb_agblocks);
686
687 mp->m_dalign = 0;
688 mp->m_swidth = 0;
689 } else if (mp->m_dalign) {
690 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
691 } else {
692 if (mp->m_flags & XFS_MOUNT_RETERR) {
693 xfs_fs_cmn_err(CE_WARN, mp,
694"stripe alignment turned off: sunit(%d) less than bsize(%d)",
695 mp->m_dalign,
696 			mp->m_blockmask + 1);
697 error = XFS_ERROR(EINVAL);
698 goto error1;
699 }
700 mp->m_swidth = 0;
701 }
702 }
703
704 /*
705 * Update superblock with new values
706 * and log changes
707 */
708 if (XFS_SB_VERSION_HASDALIGN(sbp)) {
709 if (sbp->sb_unit != mp->m_dalign) {
710 sbp->sb_unit = mp->m_dalign;
711 update_flags |= XFS_SB_UNIT;
712 }
713 if (sbp->sb_width != mp->m_swidth) {
714 sbp->sb_width = mp->m_swidth;
715 update_flags |= XFS_SB_WIDTH;
716 }
717 }
718 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
719 XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
720 mp->m_dalign = sbp->sb_unit;
721 mp->m_swidth = sbp->sb_width;
722 }
723
724 xfs_alloc_compute_maxlevels(mp);
725 xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
726 xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
727 xfs_ialloc_compute_maxlevels(mp);
728
729 if (sbp->sb_imax_pct) {
730 __uint64_t icount;
731
732 /* Make sure the maximum inode count is a multiple of the
733 * units we allocate inodes in.
734 */
735
736 icount = sbp->sb_dblocks * sbp->sb_imax_pct;
737 do_div(icount, 100);
738 do_div(icount, mp->m_ialloc_blks);
739 mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
740 sbp->sb_inopblog;
741 } else
742 mp->m_maxicount = 0;
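	/*
	 * Worked example (editor's sketch, not in the original source):
	 * assuming 4 KB blocks and 256-byte inodes, sb_inopblock = 16 and
	 * sb_inopblog = 4, so m_ialloc_inos = 64 and m_ialloc_blks = 4.
	 * With sb_dblocks = 1,000,000 and sb_imax_pct = 25, icount becomes
	 * 250,000, is rounded down to a whole number of inode allocation
	 * units (62,500 of them), and m_maxicount ends up as
	 * (62,500 * 4) << 4 = 4,000,000 inodes.
	 */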
743
744 mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog);
745
746 /*
747 * XFS uses the uuid from the superblock as the unique
748 * identifier for fsid. We can not use the uuid from the volume
749 * since a single partition filesystem is identical to a single
750 * partition volume/filesystem.
751 */
752 if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
753 (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
754 if (xfs_uuid_mount(mp)) {
755 error = XFS_ERROR(EINVAL);
756 goto error1;
757 }
758		uuid_mounted = 1;
759 ret64 = uuid_hash64(&sbp->sb_uuid);
760 memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64));
761 }
762
763 /*
764 * Set the default minimum read and write sizes unless
765 * already specified in a mount option.
766 * We use smaller I/O sizes when the file system
767 * is being used for NFS service (wsync mount option).
768 */
769 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
770 if (mp->m_flags & XFS_MOUNT_WSYNC) {
771 readio_log = XFS_WSYNC_READIO_LOG;
772 writeio_log = XFS_WSYNC_WRITEIO_LOG;
773 } else {
774 readio_log = XFS_READIO_LOG_LARGE;
775 writeio_log = XFS_WRITEIO_LOG_LARGE;
776 }
777 } else {
778 readio_log = mp->m_readio_log;
779 writeio_log = mp->m_writeio_log;
780 }
781
782 /*
783 * Set the number of readahead buffers to use based on
784 * physical memory size.
785 */
786 if (xfs_physmem <= 4096) /* <= 16MB */
787 mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
788 else if (xfs_physmem <= 8192) /* <= 32MB */
789 mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
790 else
791 mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
792 if (sbp->sb_blocklog > readio_log) {
793 mp->m_readio_log = sbp->sb_blocklog;
794 } else {
795 mp->m_readio_log = readio_log;
796 }
797 mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
798 if (sbp->sb_blocklog > writeio_log) {
799 mp->m_writeio_log = sbp->sb_blocklog;
800 } else {
801 mp->m_writeio_log = writeio_log;
802 }
803 mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
804
805 /*
806 * Set the inode cluster size based on the physical memory
807 * size. This may still be overridden by the file system
808 * block size if it is larger than the chosen cluster size.
809 */
810 if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
811 mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
812 } else {
813 mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
814 }
815 /*
816 * Set whether we're using inode alignment.
817 */
818 if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
819 mp->m_sb.sb_inoalignmt >=
820 XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
821 mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
822 else
823 mp->m_inoalign_mask = 0;
824 /*
825 * If we are using stripe alignment, check whether
826 * the stripe unit is a multiple of the inode alignment
827 */
828 if (mp->m_dalign && mp->m_inoalign_mask &&
829 !(mp->m_dalign & mp->m_inoalign_mask))
830 mp->m_sinoalign = mp->m_dalign;
831 else
832 mp->m_sinoalign = 0;
833 /*
834 * Check that the data (and log if separate) are an ok size.
835 */
836 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
837 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
838 cmn_err(CE_WARN, "XFS: size check 1 failed");
839 error = XFS_ERROR(E2BIG);
840 goto error1;
841 }
842 error = xfs_read_buf(mp, mp->m_ddev_targp,
843 d - XFS_FSS_TO_BB(mp, 1),
844 XFS_FSS_TO_BB(mp, 1), 0, &bp);
845 if (!error) {
846 xfs_buf_relse(bp);
847 } else {
848 cmn_err(CE_WARN, "XFS: size check 2 failed");
849 if (error == ENOSPC) {
850 error = XFS_ERROR(E2BIG);
851 }
852 goto error1;
853 }
854
855 if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
856 mp->m_logdev_targp != mp->m_ddev_targp) {
857 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
858 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
859 cmn_err(CE_WARN, "XFS: size check 3 failed");
860 error = XFS_ERROR(E2BIG);
861 goto error1;
862 }
863 error = xfs_read_buf(mp, mp->m_logdev_targp,
864 d - XFS_FSB_TO_BB(mp, 1),
865 XFS_FSB_TO_BB(mp, 1), 0, &bp);
866 if (!error) {
867 xfs_buf_relse(bp);
868 } else {
869			cmn_err(CE_WARN, "XFS: size check 4 failed");
870 if (error == ENOSPC) {
871 error = XFS_ERROR(E2BIG);
872 }
873 goto error1;
874 }
875 }
876
877 /*
878 * Initialize realtime fields in the mount structure
879 */
880 if ((error = xfs_rtmount_init(mp))) {
881 cmn_err(CE_WARN, "XFS: RT mount failed");
882 goto error1;
883 }
884
885 /*
886 * For client case we are done now
887 */
888 if (mfsi_flags & XFS_MFSI_CLIENT) {
889 return(0);
890 }
891
892 /*
893 * Copies the low order bits of the timestamp and the randomly
894 * set "sequence" number out of a UUID.
895 */
896 uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
897
898 /*
899 * The vfs structure needs to have a file system independent
900 * way of checking for the invariant file system ID. Since it
901 * can't look at mount structures it has a pointer to the data
902 * in the mount structure.
903 *
904 * File systems that don't support user level file handles (i.e.
905 * all of them except for XFS) will leave vfs_altfsid as NULL.
906 */
907 vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid;
908 mp->m_dmevmask = 0; /* not persistent; set after each mount */
909
910 /*
911 * Select the right directory manager.
912 */
913 mp->m_dirops =
914 XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
915 xfsv2_dirops :
916 xfsv1_dirops;
917
918 /*
919 * Initialize directory manager's entries.
920 */
921 XFS_DIR_MOUNT(mp);
922
923 /*
924 * Initialize the attribute manager's entries.
925 */
926 mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
927
928 /*
929 * Initialize the precomputed transaction reservations values.
930 */
931 xfs_trans_init(mp);
932
933 /*
934 * Allocate and initialize the inode hash table for this
935 * file system.
936 */
937 xfs_ihash_init(mp);
938 xfs_chash_init(mp);
939
940 /*
941 * Allocate and initialize the per-ag data.
942 */
943 init_rwsem(&mp->m_peraglock);
944 mp->m_perag =
945 kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
946
947 mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
948
949 /*
950	 * Log's mount-time initialization. Perform the first phase of recovery if needed.
951 */
952 if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */
953 error = xfs_log_mount(mp, mp->m_logdev_targp,
954 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
955 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
956 if (error) {
957 cmn_err(CE_WARN, "XFS: log mount failed");
958 goto error2;
959 }
960 } else { /* No log has been defined */
961 cmn_err(CE_WARN, "XFS: no log defined");
962 XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp);
963 error = XFS_ERROR(EFSCORRUPTED);
964 goto error2;
965 }
966
967 /*
968 * Get and sanity-check the root inode.
969 * Save the pointer to it in the mount structure.
970 */
971 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0);
972 if (error) {
973 cmn_err(CE_WARN, "XFS: failed to read root inode");
974 goto error3;
975 }
976
977 ASSERT(rip != NULL);
978 rvp = XFS_ITOV(rip);
979
980 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
981 cmn_err(CE_WARN, "XFS: corrupted root inode");
982 prdev("Root inode %llu is not a directory",
983 mp->m_ddev_targp, (unsigned long long)rip->i_ino);
984 xfs_iunlock(rip, XFS_ILOCK_EXCL);
985 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
986 mp);
987 error = XFS_ERROR(EFSCORRUPTED);
988 goto error4;
989 }
990 mp->m_rootip = rip; /* save it */
991
992 xfs_iunlock(rip, XFS_ILOCK_EXCL);
993
994 /*
995 * Initialize realtime inode pointers in the mount structure
996 */
997 if ((error = xfs_rtmount_inodes(mp))) {
998 /*
999 * Free up the root inode.
1000 */
1001 cmn_err(CE_WARN, "XFS: failed to read RT inodes");
1002 goto error4;
1003 }
1004
1005 /*
1006 * If fs is not mounted readonly, then update the superblock
1007 * unit and width changes.
1008 */
1009 if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
1010 xfs_mount_log_sbunit(mp, update_flags);
1011
1012 /*
1013 * Initialise the XFS quota management subsystem for this mount
1014 */
1015 if ((error = XFS_QM_INIT(mp, &quotamount, &quotaflags)))
1016 goto error4;
1017
1018 /*
1019 * Finish recovering the file system. This part needed to be
1020 * delayed until after the root and real-time bitmap inodes
1021 * were consistently read in.
1022 */
1023 error = xfs_log_mount_finish(mp, mfsi_flags);
1024 if (error) {
1025 cmn_err(CE_WARN, "XFS: log mount finish failed");
1026 goto error4;
1027 }
1028
1029 /*
1030 * Complete the quota initialisation, post-log-replay component.
1031 */
1032 if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags)))
1033 goto error4;
1034
1035 return 0;
1036
1037 error4:
1038 /*
1039 * Free up the root inode.
1040 */
1041 VN_RELE(rvp);
1042 error3:
1043 xfs_log_unmount_dealloc(mp);
1044 error2:
1045 xfs_ihash_free(mp);
1046 xfs_chash_free(mp);
1047 for (agno = 0; agno < sbp->sb_agcount; agno++)
1048 if (mp->m_perag[agno].pagb_list)
1049 kmem_free(mp->m_perag[agno].pagb_list,
1050 sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
1051 kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
1052 mp->m_perag = NULL;
1053 /* FALLTHROUGH */
1054 error1:
1055 if (uuid_mounted)
1056 xfs_uuid_unmount(mp);
1057 xfs_freesb(mp);
1058 return error;
1059}
1060
1061/*
1062 * xfs_unmountfs
1063 *
1064 * This flushes out the inodes, dquots and the superblock, unmounts the
1065 * log and makes sure that incore structures are freed.
1066 */
1067int
1068xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1069{
1070 struct vfs *vfsp = XFS_MTOVFS(mp);
1071#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
1072 int64_t fsid;
1073#endif
1074
1075 xfs_iflush_all(mp, XFS_FLUSH_ALL);
1076
1077 XFS_QM_DQPURGEALL(mp,
1078 XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING);
1079
1080 /*
1081 * Flush out the log synchronously so that we know for sure
1082 * that nothing is pinned. This is important because bflush()
1083 * will skip pinned buffers.
1084 */
1085 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1086
1087 xfs_binval(mp->m_ddev_targp);
1088 if (mp->m_rtdev_targp) {
1089 xfs_binval(mp->m_rtdev_targp);
1090 }
1091
1092 xfs_unmountfs_writesb(mp);
1093
1094 xfs_unmountfs_wait(mp); /* wait for async bufs */
1095
1096 xfs_log_unmount(mp); /* Done! No more fs ops. */
1097
1098 xfs_freesb(mp);
1099
1100 /*
1101 * All inodes from this mount point should be freed.
1102 */
1103 ASSERT(mp->m_inodes == NULL);
1104
1105 /*
1106	 * We may have bufs that are still in the process of being written.
1107	 * We must wait for the I/O completion of those. The sync flag here
1108	 * does a two-pass iteration through the bufcache.
1109 */
1110 if (XFS_FORCED_SHUTDOWN(mp)) {
1111 xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */
1112 }
1113
1114 xfs_unmountfs_close(mp, cr);
1115 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1116 xfs_uuid_unmount(mp);
1117
1118#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
1119 /*
1120 * clear all error tags on this filesystem
1121 */
1122 memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t));
1123 xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
1124#endif
1125 XFS_IODONE(vfsp);
1126 xfs_mount_free(mp, 1);
1127 return 0;
1128}
1129
1130void
1131xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
1132{
1133 if (mp->m_logdev_targp != mp->m_ddev_targp)
1134 xfs_free_buftarg(mp->m_logdev_targp, 1);
1135 if (mp->m_rtdev_targp)
1136 xfs_free_buftarg(mp->m_rtdev_targp, 1);
1137 xfs_free_buftarg(mp->m_ddev_targp, 0);
1138}
1139
1140void
1141xfs_unmountfs_wait(xfs_mount_t *mp)
1142{
1143 if (mp->m_logdev_targp != mp->m_ddev_targp)
1144 xfs_wait_buftarg(mp->m_logdev_targp);
1145 if (mp->m_rtdev_targp)
1146 xfs_wait_buftarg(mp->m_rtdev_targp);
1147 xfs_wait_buftarg(mp->m_ddev_targp);
1148}
1149
1150int
1151xfs_unmountfs_writesb(xfs_mount_t *mp)
1152{
1153 xfs_buf_t *sbp;
1154 xfs_sb_t *sb;
1155 int error = 0;
1156
1157 /*
1158 * skip superblock write if fs is read-only, or
1159 * if we are doing a forced umount.
1160 */
1161 sbp = xfs_getsb(mp, 0);
1162 if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
1163 XFS_FORCED_SHUTDOWN(mp))) {
1164 /*
1165 * mark shared-readonly if desired
1166 */
1167 sb = XFS_BUF_TO_SBP(sbp);
1168 if (mp->m_mk_sharedro) {
1169 if (!(sb->sb_flags & XFS_SBF_READONLY))
1170 sb->sb_flags |= XFS_SBF_READONLY;
1171 if (!XFS_SB_VERSION_HASSHARED(sb))
1172 XFS_SB_VERSION_ADDSHARED(sb);
1173 xfs_fs_cmn_err(CE_NOTE, mp,
1174 "Unmounting, marking shared read-only");
1175 }
1176 XFS_BUF_UNDONE(sbp);
1177 XFS_BUF_UNREAD(sbp);
1178 XFS_BUF_UNDELAYWRITE(sbp);
1179 XFS_BUF_WRITE(sbp);
1180 XFS_BUF_UNASYNC(sbp);
1181 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1182 xfsbdstrat(mp, sbp);
1183 /* Nevermind errors we might get here. */
1184 error = xfs_iowait(sbp);
1185 if (error)
1186 xfs_ioerror_alert("xfs_unmountfs_writesb",
1187 mp, sbp, XFS_BUF_ADDR(sbp));
1188 if (error && mp->m_mk_sharedro)
1189 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1190 }
1191 xfs_buf_relse(sbp);
1192 return (error);
1193}
1194
1195/*
1196 * xfs_mod_sb() can be used to copy arbitrary changes to the
1197 * in-core superblock into the superblock buffer to be logged.
1198 * It does not provide the higher level of locking that is
1199 * needed to protect the in-core superblock from concurrent
1200 * access.
1201 */
1202void
1203xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1204{
1205 xfs_buf_t *bp;
1206 int first;
1207 int last;
1208 xfs_mount_t *mp;
1209 xfs_sb_t *sbp;
1210 xfs_sb_field_t f;
1211
1212 ASSERT(fields);
1213 if (!fields)
1214 return;
1215 mp = tp->t_mountp;
1216 bp = xfs_trans_getsb(tp, mp, 0);
1217 sbp = XFS_BUF_TO_SBP(bp);
1218 first = sizeof(xfs_sb_t);
1219 last = 0;
1220
1221 /* translate/copy */
1222
1223 xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields);
1224
1225 /* find modified range */
1226
1227 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1228 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1229 first = xfs_sb_info[f].offset;
1230
1231 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1232 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1233 last = xfs_sb_info[f + 1].offset - 1;
1234
1235 xfs_trans_log_buf(tp, bp, first, last);
1236}
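/*
 * Example (editor's sketch, not part of the original source): a caller
 * that has already changed the in-core superblock inside a transaction
 * logs just the fields it touched; new_unit below is a hypothetical
 * value:
 *
 *	mp->m_sb.sb_unit = new_unit;
 *	xfs_mod_sb(tp, XFS_SB_UNIT);
 *
 * xfs_mount_log_sbunit() at the end of this file wraps this pattern in
 * its own transaction.
 */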
1237
1238/*
1239 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
1240 * a delta to a specified field in the in-core superblock. Simply
1241 * switch on the field indicated and apply the delta to that field.
1242 * Fields are not allowed to dip below zero, so if the delta would
1243 * do this, do not apply it and return EINVAL.
1244 *
1245 * The SB_LOCK must be held when this routine is called.
1246 */
1247STATIC int
1248xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
1249 int delta, int rsvd)
1250{
1251 int scounter; /* short counter for 32 bit fields */
1252 long long lcounter; /* long counter for 64 bit fields */
1253 long long res_used, rem;
1254
1255 /*
1256 * With the in-core superblock spin lock held, switch
1257 * on the indicated field. Apply the delta to the
1258	 * proper field. If the field's value would dip below
1259 * 0, then do not apply the delta and return EINVAL.
1260 */
1261 switch (field) {
1262 case XFS_SBS_ICOUNT:
1263 lcounter = (long long)mp->m_sb.sb_icount;
1264 lcounter += delta;
1265 if (lcounter < 0) {
1266 ASSERT(0);
1267 return (XFS_ERROR(EINVAL));
1268 }
1269 mp->m_sb.sb_icount = lcounter;
1270 return (0);
1271 case XFS_SBS_IFREE:
1272 lcounter = (long long)mp->m_sb.sb_ifree;
1273 lcounter += delta;
1274 if (lcounter < 0) {
1275 ASSERT(0);
1276 return (XFS_ERROR(EINVAL));
1277 }
1278 mp->m_sb.sb_ifree = lcounter;
1279 return (0);
1280 case XFS_SBS_FDBLOCKS:
1281
1282 lcounter = (long long)mp->m_sb.sb_fdblocks;
1283 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1284
1285 if (delta > 0) { /* Putting blocks back */
1286 if (res_used > delta) {
1287 mp->m_resblks_avail += delta;
1288 } else {
1289 rem = delta - res_used;
1290 mp->m_resblks_avail = mp->m_resblks;
1291 lcounter += rem;
1292 }
1293 } else { /* Taking blocks away */
1294
1295 lcounter += delta;
1296
1297 /*
1298 * If were out of blocks, use any available reserved blocks if
1299 * were allowed to.
1300 */
1301
1302 if (lcounter < 0) {
1303 if (rsvd) {
1304 lcounter = (long long)mp->m_resblks_avail + delta;
1305 if (lcounter < 0) {
1306 return (XFS_ERROR(ENOSPC));
1307 }
1308 mp->m_resblks_avail = lcounter;
1309 return (0);
1310 } else { /* not reserved */
1311 return (XFS_ERROR(ENOSPC));
1312 }
1313 }
1314 }
1315
1316 mp->m_sb.sb_fdblocks = lcounter;
1317 return (0);
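	/*
	 * Worked example (editor's sketch, not in the original source):
	 * with m_resblks = 100 and m_resblks_avail = 40, res_used is 60.
	 * Returning delta = +80 blocks refills the reserve pool to 100
	 * and adds the remaining 20 to sb_fdblocks. Taking delta = -50
	 * when only 30 free blocks remain would go negative, so with
	 * rsvd set the whole delta is taken from m_resblks_avail
	 * instead of failing with ENOSPC.
	 */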
1318 case XFS_SBS_FREXTENTS:
1319 lcounter = (long long)mp->m_sb.sb_frextents;
1320 lcounter += delta;
1321 if (lcounter < 0) {
1322 return (XFS_ERROR(ENOSPC));
1323 }
1324 mp->m_sb.sb_frextents = lcounter;
1325 return (0);
1326 case XFS_SBS_DBLOCKS:
1327 lcounter = (long long)mp->m_sb.sb_dblocks;
1328 lcounter += delta;
1329 if (lcounter < 0) {
1330 ASSERT(0);
1331 return (XFS_ERROR(EINVAL));
1332 }
1333 mp->m_sb.sb_dblocks = lcounter;
1334 return (0);
1335 case XFS_SBS_AGCOUNT:
1336 scounter = mp->m_sb.sb_agcount;
1337 scounter += delta;
1338 if (scounter < 0) {
1339 ASSERT(0);
1340 return (XFS_ERROR(EINVAL));
1341 }
1342 mp->m_sb.sb_agcount = scounter;
1343 return (0);
1344 case XFS_SBS_IMAX_PCT:
1345 scounter = mp->m_sb.sb_imax_pct;
1346 scounter += delta;
1347 if (scounter < 0) {
1348 ASSERT(0);
1349 return (XFS_ERROR(EINVAL));
1350 }
1351 mp->m_sb.sb_imax_pct = scounter;
1352 return (0);
1353 case XFS_SBS_REXTSIZE:
1354 scounter = mp->m_sb.sb_rextsize;
1355 scounter += delta;
1356 if (scounter < 0) {
1357 ASSERT(0);
1358 return (XFS_ERROR(EINVAL));
1359 }
1360 mp->m_sb.sb_rextsize = scounter;
1361 return (0);
1362 case XFS_SBS_RBMBLOCKS:
1363 scounter = mp->m_sb.sb_rbmblocks;
1364 scounter += delta;
1365 if (scounter < 0) {
1366 ASSERT(0);
1367 return (XFS_ERROR(EINVAL));
1368 }
1369 mp->m_sb.sb_rbmblocks = scounter;
1370 return (0);
1371 case XFS_SBS_RBLOCKS:
1372 lcounter = (long long)mp->m_sb.sb_rblocks;
1373 lcounter += delta;
1374 if (lcounter < 0) {
1375 ASSERT(0);
1376 return (XFS_ERROR(EINVAL));
1377 }
1378 mp->m_sb.sb_rblocks = lcounter;
1379 return (0);
1380 case XFS_SBS_REXTENTS:
1381 lcounter = (long long)mp->m_sb.sb_rextents;
1382 lcounter += delta;
1383 if (lcounter < 0) {
1384 ASSERT(0);
1385 return (XFS_ERROR(EINVAL));
1386 }
1387 mp->m_sb.sb_rextents = lcounter;
1388 return (0);
1389 case XFS_SBS_REXTSLOG:
1390 scounter = mp->m_sb.sb_rextslog;
1391 scounter += delta;
1392 if (scounter < 0) {
1393 ASSERT(0);
1394 return (XFS_ERROR(EINVAL));
1395 }
1396 mp->m_sb.sb_rextslog = scounter;
1397 return (0);
1398 default:
1399 ASSERT(0);
1400 return (XFS_ERROR(EINVAL));
1401 }
1402}
1403
1404/*
1405 * xfs_mod_incore_sb() is used to change a field in the in-core
1406 * superblock structure by the specified delta. This modification
1407 * is protected by the SB_LOCK. Just use the xfs_mod_incore_sb_unlocked()
1408 * routine to do the work.
1409 */
1410int
1411xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
1412{
1413 unsigned long s;
1414 int status;
1415
1416 s = XFS_SB_LOCK(mp);
1417 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1418 XFS_SB_UNLOCK(mp, s);
1419 return (status);
1420}
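/*
 * Example (editor's sketch, not part of the original source): the
 * transaction code reserves free blocks by passing a negative delta,
 * where nblks is a hypothetical block count:
 *
 *	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -nblks, rsvd);
 *
 * A nonzero return (ENOSPC for this field) means the counter was left
 * untouched.
 */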
1421
1422/*
1423 * xfs_mod_incore_sb_batch() is used to change more than one field
1424 * in the in-core superblock structure at a time. This modification
1425 * is protected by a lock internal to this module. The fields and
1426 * changes to those fields are specified in the array of xfs_mod_sb
1427 * structures passed in.
1428 *
1429 * Either all of the specified deltas will be applied or none of
1430 * them will. If any modified field dips below 0, then all modifications
1431 * will be backed out and EINVAL will be returned.
1432 */
1433int
1434xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
1435{
1436 unsigned long s;
1437 int status=0;
1438 xfs_mod_sb_t *msbp;
1439
1440 /*
1441 * Loop through the array of mod structures and apply each
1442 * individually. If any fail, then back out all those
1443 * which have already been applied. Do all of this within
1444 * the scope of the SB_LOCK so that all of the changes will
1445 * be atomic.
1446 */
1447 s = XFS_SB_LOCK(mp);
1448	msbp = &msb[0];
1449	for (; msbp < (msb + nmsb); msbp++) {
1450 /*
1451 * Apply the delta at index n. If it fails, break
1452 * from the loop so we'll fall into the undo loop
1453 * below.
1454 */
1455 status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
1456 msbp->msb_delta, rsvd);
1457 if (status != 0) {
1458 break;
1459 }
1460 }
1461
1462 /*
1463 * If we didn't complete the loop above, then back out
1464 * any changes made to the superblock. If you add code
1465 * between the loop above and here, make sure that you
1466 * preserve the value of status. Loop back until
1467 * we step below the beginning of the array. Make sure
1468 * we don't touch anything back there.
1469 */
1470 if (status != 0) {
1471 msbp--;
1472 while (msbp >= msb) {
1473 status = xfs_mod_incore_sb_unlocked(mp,
1474 msbp->msb_field, -(msbp->msb_delta), rsvd);
1475 ASSERT(status == 0);
1476 msbp--;
1477 }
1478 }
1479 XFS_SB_UNLOCK(mp, s);
1480 return (status);
1481}
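/*
 * Example (editor's sketch, not part of the original source): freeing
 * an inode chunk must move two counters together, so a caller could
 * batch the deltas and rely on the all-or-nothing semantics:
 *
 *	xfs_mod_sb_t	msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_ICOUNT;
 *	msb[0].msb_delta = -XFS_INODES_PER_CHUNK;
 *	msb[1].msb_field = XFS_SBS_IFREE;
 *	msb[1].msb_delta = -XFS_INODES_PER_CHUNK;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
 *
 * If either delta would take its field below zero, both changes are
 * backed out and EINVAL is returned.
 */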
1482
1483/*
1484 * xfs_getsb() is called to obtain the buffer for the superblock.
1485 * The buffer is returned locked and read in from disk.
1486 * The buffer should be released with a call to xfs_buf_relse().
1487 *
1488 * If the flags parameter is XFS_BUF_TRYLOCK, then we'll only return
1489 * the superblock buffer if it can be locked without sleeping.
1490 * If it can't then we'll return NULL.
1491 */
1492xfs_buf_t *
1493xfs_getsb(
1494 xfs_mount_t *mp,
1495 int flags)
1496{
1497 xfs_buf_t *bp;
1498
1499 ASSERT(mp->m_sb_bp != NULL);
1500 bp = mp->m_sb_bp;
1501 if (flags & XFS_BUF_TRYLOCK) {
1502 if (!XFS_BUF_CPSEMA(bp)) {
1503 return NULL;
1504 }
1505 } else {
1506 XFS_BUF_PSEMA(bp, PRIBIO);
1507 }
1508 XFS_BUF_HOLD(bp);
1509 ASSERT(XFS_BUF_ISDONE(bp));
1510 return (bp);
1511}
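/*
 * Example (editor's sketch, not part of the original source): a caller
 * that must not sleep can probe for the superblock buffer and back off
 * if it is held elsewhere:
 *
 *	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
 *	if (bp == NULL)
 *		return;
 *	... examine or modify the superblock ...
 *	xfs_buf_relse(bp);
 */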
1512
1513/*
1514 * Used to free the superblock along various error paths.
1515 */
1516void
1517xfs_freesb(
1518 xfs_mount_t *mp)
1519{
1520 xfs_buf_t *bp;
1521
1522 /*
1523 * Use xfs_getsb() so that the buffer will be locked
1524 * when we call xfs_buf_relse().
1525 */
1526 bp = xfs_getsb(mp, 0);
1527 XFS_BUF_UNMANAGE(bp);
1528 xfs_buf_relse(bp);
1529 mp->m_sb_bp = NULL;
1530}
1531
1532/*
1533 * See if the UUID is unique among mounted XFS filesystems.
1534 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
1535 */
1536STATIC int
1537xfs_uuid_mount(
1538 xfs_mount_t *mp)
1539{
1540 if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
1541 cmn_err(CE_WARN,
1542 "XFS: Filesystem %s has nil UUID - can't mount",
1543 mp->m_fsname);
1544 return -1;
1545 }
1546 if (!uuid_table_insert(&mp->m_sb.sb_uuid)) {
1547 cmn_err(CE_WARN,
1548 "XFS: Filesystem %s has duplicate UUID - can't mount",
1549 mp->m_fsname);
1550 return -1;
1551 }
1552 return 0;
1553}
1554
1555/*
1556 * Remove filesystem from the UUID table.
1557 */
1558STATIC void
1559xfs_uuid_unmount(
1560 xfs_mount_t *mp)
1561{
1562 uuid_table_remove(&mp->m_sb.sb_uuid);
1563}
1564
1565/*
1566 * Used to log changes to the superblock unit and width fields which could
1567 * be altered by the mount options. Only the first superblock is updated.
1568 */
1569STATIC void
1570xfs_mount_log_sbunit(
1571 xfs_mount_t *mp,
1572 __int64_t fields)
1573{
1574 xfs_trans_t *tp;
1575
1576 ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
1577
1578 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1579 if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1580 XFS_DEFAULT_LOG_COUNT)) {
1581 xfs_trans_cancel(tp, 0);
1582 return;
1583 }
1584 xfs_mod_sb(tp, fields);
1585 xfs_trans_commit(tp, 0, NULL);
1586}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000000..5fc6201dd8e2
--- /dev/null
+++ b/fs/xfs/xfs_mount.h
@@ -0,0 +1,573 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_MOUNT_H__
33#define __XFS_MOUNT_H__
34
35
36typedef struct xfs_trans_reservations {
37 uint tr_write; /* extent alloc trans */
38 uint tr_itruncate; /* truncate trans */
39 uint tr_rename; /* rename trans */
40 uint tr_link; /* link trans */
41 uint tr_remove; /* unlink trans */
42 uint tr_symlink; /* symlink trans */
43 uint tr_create; /* create trans */
44 uint tr_mkdir; /* mkdir trans */
45 uint tr_ifree; /* inode free trans */
46 uint tr_ichange; /* inode update trans */
47 uint tr_growdata; /* fs data section grow trans */
48 uint tr_swrite; /* sync write inode trans */
49 uint tr_addafork; /* cvt inode to attributed trans */
50 uint tr_writeid; /* write setuid/setgid file */
51 uint tr_attrinval; /* attr fork buffer invalidation */
52 uint tr_attrset; /* set/create an attribute */
53 uint tr_attrrm; /* remove an attribute */
54 uint tr_clearagi; /* clear bad agi unlinked ino bucket */
55 uint tr_growrtalloc; /* grow realtime allocations */
56 uint tr_growrtzero; /* grow realtime zeroing */
57 uint tr_growrtfree; /* grow realtime freeing */
58} xfs_trans_reservations_t;
59
60
61#ifndef __KERNEL__
62/*
63 * Moved here from xfs_ag.h to avoid reordering header files
64 */
65#define XFS_DADDR_TO_AGNO(mp,d) \
66 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
67#define XFS_DADDR_TO_AGBNO(mp,d) \
68 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
69#else
70struct cred;
71struct log;
72struct vfs;
73struct vnode;
74struct xfs_mount_args;
75struct xfs_ihash;
76struct xfs_chash;
77struct xfs_inode;
78struct xfs_perag;
79struct xfs_iocore;
80struct xfs_bmbt_irec;
81struct xfs_bmap_free;
82
83#define AIL_LOCK_T lock_t
84#define AIL_LOCKINIT(x,y) spinlock_init(x,y)
85#define AIL_LOCK_DESTROY(x) spinlock_destroy(x)
86#define AIL_LOCK(mp,s) s=mutex_spinlock(&(mp)->m_ail_lock)
87#define AIL_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_ail_lock, s)
88
89
90/*
91 * Prototypes and functions for the Data Migration subsystem.
92 */
93
94typedef int (*xfs_send_data_t)(int, struct vnode *,
95 xfs_off_t, size_t, int, vrwlock_t *);
96typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
97typedef int (*xfs_send_destroy_t)(struct vnode *, dm_right_t);
98typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct vfs *,
99 struct vnode *,
100 dm_right_t, struct vnode *, dm_right_t,
101 char *, char *, mode_t, int, int);
102typedef void (*xfs_send_unmount_t)(struct vfs *, struct vnode *,
103 dm_right_t, mode_t, int, int);
104
105typedef struct xfs_dmops {
106 xfs_send_data_t xfs_send_data;
107 xfs_send_mmap_t xfs_send_mmap;
108 xfs_send_destroy_t xfs_send_destroy;
109 xfs_send_namesp_t xfs_send_namesp;
110 xfs_send_unmount_t xfs_send_unmount;
111} xfs_dmops_t;
112
113#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \
114 (*(mp)->m_dm_ops.xfs_send_data)(ev,vp,off,len,fl,lock)
115#define XFS_SEND_MMAP(mp, vma,fl) \
116 (*(mp)->m_dm_ops.xfs_send_mmap)(vma,fl)
117#define XFS_SEND_DESTROY(mp, vp,right) \
118 (*(mp)->m_dm_ops.xfs_send_destroy)(vp,right)
119#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
120 (*(mp)->m_dm_ops.xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
121#define XFS_SEND_PREUNMOUNT(mp, vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
122 (*(mp)->m_dm_ops.xfs_send_namesp)(DM_EVENT_PREUNMOUNT,vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl)
123#define XFS_SEND_UNMOUNT(mp, vfsp,vp,right,mode,rval,fl) \
124 (*(mp)->m_dm_ops.xfs_send_unmount)(vfsp,vp,right,mode,rval,fl)
125
126
127/*
128 * Prototypes and functions for the Quota Management subsystem.
129 */
130
131struct xfs_dquot;
132struct xfs_dqtrxops;
133struct xfs_quotainfo;
134
135typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
136typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int);
137typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
138typedef void (*xfs_qmdone_t)(struct xfs_mount *);
139typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
140typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
141typedef void (*xfs_dqdetach_t)(struct xfs_inode *);
142typedef int (*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
143typedef int (*xfs_dqvopalloc_t)(struct xfs_mount *,
144 struct xfs_inode *, uid_t, gid_t, uint,
145 struct xfs_dquot **, struct xfs_dquot **);
146typedef void (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
147 struct xfs_dquot *, struct xfs_dquot *);
148typedef int (*xfs_dqvoprename_t)(struct xfs_inode **);
149typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
150 struct xfs_trans *, struct xfs_inode *,
151 struct xfs_dquot **, struct xfs_dquot *);
152typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
153 struct xfs_dquot *, struct xfs_dquot *, uint);
154
155typedef struct xfs_qmops {
156 xfs_qminit_t xfs_qminit;
157 xfs_qmdone_t xfs_qmdone;
158 xfs_qmmount_t xfs_qmmount;
159 xfs_qmunmount_t xfs_qmunmount;
160 xfs_dqrele_t xfs_dqrele;
161 xfs_dqattach_t xfs_dqattach;
162 xfs_dqdetach_t xfs_dqdetach;
163 xfs_dqpurgeall_t xfs_dqpurgeall;
164 xfs_dqvopalloc_t xfs_dqvopalloc;
165 xfs_dqvopcreate_t xfs_dqvopcreate;
166 xfs_dqvoprename_t xfs_dqvoprename;
167 xfs_dqvopchown_t xfs_dqvopchown;
168 xfs_dqvopchownresv_t xfs_dqvopchownresv;
169 struct xfs_dqtrxops *xfs_dqtrxops;
170} xfs_qmops_t;
171
172#define XFS_QM_INIT(mp, mnt, fl) \
173 (*(mp)->m_qm_ops.xfs_qminit)(mp, mnt, fl)
174#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \
175 (*(mp)->m_qm_ops.xfs_qmmount)(mp, mnt, fl, mfsi_flags)
176#define XFS_QM_UNMOUNT(mp) \
177 (*(mp)->m_qm_ops.xfs_qmunmount)(mp)
178#define XFS_QM_DONE(mp) \
179 (*(mp)->m_qm_ops.xfs_qmdone)(mp)
180#define XFS_QM_DQRELE(mp, dq) \
181 (*(mp)->m_qm_ops.xfs_dqrele)(dq)
182#define XFS_QM_DQATTACH(mp, ip, fl) \
183 (*(mp)->m_qm_ops.xfs_dqattach)(ip, fl)
184#define XFS_QM_DQDETACH(mp, ip) \
185 (*(mp)->m_qm_ops.xfs_dqdetach)(ip)
186#define XFS_QM_DQPURGEALL(mp, fl) \
187 (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl)
188#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, fl, dq1, dq2) \
189 (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, fl, dq1, dq2)
190#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
191 (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2)
192#define XFS_QM_DQVOPRENAME(mp, ip) \
193 (*(mp)->m_qm_ops.xfs_dqvoprename)(ip)
194#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
195 (*(mp)->m_qm_ops.xfs_dqvopchown)(tp, ip, dqp, dq)
196#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
197 (*(mp)->m_qm_ops.xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
198
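/*
 * Example (editor's note, not part of the original source): callers
 * reach the quota subsystem only through the macros above, e.g. the
 * mount path does
 *
 *	error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
 *
 * which becomes an indirect call through mp->m_qm_ops, so a kernel
 * built without the quota module can install xfs_qmcore_stub (see
 * fs/xfs/xfs_qmops.c) in the vector instead.
 */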
199
200/*
201 * Prototypes and functions for I/O core modularization.
202 */
203
204typedef int (*xfs_ioinit_t)(struct vfs *,
205 struct xfs_mount_args *, int);
206typedef int (*xfs_bmapi_t)(struct xfs_trans *, void *,
207 xfs_fileoff_t, xfs_filblks_t, int,
208 xfs_fsblock_t *, xfs_extlen_t,
209 struct xfs_bmbt_irec *, int *,
210 struct xfs_bmap_free *);
211typedef int (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
212typedef int (*xfs_iomap_write_direct_t)(
213 void *, loff_t, size_t, int,
214 struct xfs_bmbt_irec *, int *, int);
215typedef int (*xfs_iomap_write_delay_t)(
216 void *, loff_t, size_t, int,
217 struct xfs_bmbt_irec *, int *);
218typedef int (*xfs_iomap_write_allocate_t)(
219 void *, struct xfs_bmbt_irec *, int *);
220typedef int (*xfs_iomap_write_unwritten_t)(
221 void *, loff_t, size_t);
222typedef uint (*xfs_lck_map_shared_t)(void *);
223typedef void (*xfs_lock_t)(void *, uint);
224typedef void (*xfs_lock_demote_t)(void *, uint);
225typedef int (*xfs_lock_nowait_t)(void *, uint);
226typedef void (*xfs_unlk_t)(void *, unsigned int);
227typedef xfs_fsize_t (*xfs_size_t)(void *);
228typedef xfs_fsize_t (*xfs_iodone_t)(struct vfs *);
229
230typedef struct xfs_ioops {
231 xfs_ioinit_t xfs_ioinit;
232 xfs_bmapi_t xfs_bmapi_func;
233 xfs_bmap_eof_t xfs_bmap_eof_func;
234 xfs_iomap_write_direct_t xfs_iomap_write_direct;
235 xfs_iomap_write_delay_t xfs_iomap_write_delay;
236 xfs_iomap_write_allocate_t xfs_iomap_write_allocate;
237 xfs_iomap_write_unwritten_t xfs_iomap_write_unwritten;
238 xfs_lock_t xfs_ilock;
239 xfs_lck_map_shared_t xfs_lck_map_shared;
240 xfs_lock_demote_t xfs_ilock_demote;
241 xfs_lock_nowait_t xfs_ilock_nowait;
242 xfs_unlk_t xfs_unlock;
243 xfs_size_t xfs_size_func;
244 xfs_iodone_t xfs_iodone;
245} xfs_ioops_t;
246
247#define XFS_IOINIT(vfsp, args, flags) \
248 (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags)
249#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist) \
250 (*(mp)->m_io_ops.xfs_bmapi_func) \
251 (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
252#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
253 (*(mp)->m_io_ops.xfs_bmap_eof_func) \
254 ((io)->io_obj, endoff, whichfork, eof)
255#define XFS_IOMAP_WRITE_DIRECT(mp, io, offset, count, flags, mval, nmap, found)\
256 (*(mp)->m_io_ops.xfs_iomap_write_direct) \
257 ((io)->io_obj, offset, count, flags, mval, nmap, found)
258#define XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, flags, mval, nmap) \
259 (*(mp)->m_io_ops.xfs_iomap_write_delay) \
260 ((io)->io_obj, offset, count, flags, mval, nmap)
261#define XFS_IOMAP_WRITE_ALLOCATE(mp, io, mval, nmap) \
262 (*(mp)->m_io_ops.xfs_iomap_write_allocate) \
263 ((io)->io_obj, mval, nmap)
264#define XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count) \
265 (*(mp)->m_io_ops.xfs_iomap_write_unwritten) \
266 ((io)->io_obj, offset, count)
267#define XFS_LCK_MAP_SHARED(mp, io) \
268 (*(mp)->m_io_ops.xfs_lck_map_shared)((io)->io_obj)
269#define XFS_ILOCK(mp, io, mode) \
270 (*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode)
271#define XFS_ILOCK_NOWAIT(mp, io, mode) \
272 (*(mp)->m_io_ops.xfs_ilock_nowait)((io)->io_obj, mode)
273#define XFS_IUNLOCK(mp, io, mode) \
274 (*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode)
275#define XFS_ILOCK_DEMOTE(mp, io, mode) \
276 (*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode)
277#define XFS_SIZE(mp, io) \
278 (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
279#define XFS_IODONE(vfsp) \
280 (*(mp)->m_io_ops.xfs_iodone)(vfsp)
281
282
283typedef struct xfs_mount {
284 bhv_desc_t m_bhv; /* vfs xfs behavior */
285 xfs_tid_t m_tid; /* next unused tid for fs */
286 AIL_LOCK_T m_ail_lock; /* fs AIL mutex */
287 xfs_ail_entry_t m_ail; /* fs active log item list */
288 uint m_ail_gen; /* fs AIL generation count */
289 xfs_sb_t m_sb; /* copy of fs superblock */
290 lock_t m_sb_lock; /* sb counter mutex */
291 struct xfs_buf *m_sb_bp; /* buffer for superblock */
292 char *m_fsname; /* filesystem name */
293 int m_fsname_len; /* strlen of fs name */
294 int m_bsize; /* fs logical block size */
295 xfs_agnumber_t m_agfrotor; /* last ag where space found */
296 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
297 lock_t m_agirotor_lock;/* .. and lock protecting it */
298 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
299 uint m_ihsize; /* size of next field */
300 struct xfs_ihash *m_ihash; /* fs private inode hash table*/
301 struct xfs_inode *m_inodes; /* active inode list */
302 struct list_head m_del_inodes; /* inodes to reclaim */
303 mutex_t m_ilock; /* inode list mutex */
304 uint m_ireclaims; /* count of calls to reclaim*/
305 uint m_readio_log; /* min read size log bytes */
306 uint m_readio_blocks; /* min read size blocks */
307 uint m_writeio_log; /* min write size log bytes */
308 uint m_writeio_blocks; /* min write size blocks */
309 struct log *m_log; /* log specific stuff */
310 int m_logbufs; /* number of log buffers */
311 int m_logbsize; /* size of each log buffer */
312 uint m_rsumlevels; /* rt summary levels */
313 uint m_rsumsize; /* size of rt summary, bytes */
314 struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
315 struct xfs_inode *m_rsumip; /* pointer to summary inode */
316 struct xfs_inode *m_rootip; /* pointer to root directory */
317 struct xfs_quotainfo *m_quotainfo; /* disk quota information */
318 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
319 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
320 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
321#define m_dev m_ddev_targp->pbr_dev
322 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
323 __uint8_t m_blkbit_log; /* blocklog + NBBY */
324 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
325 __uint8_t m_agno_log; /* log #ag's */
326 __uint8_t m_agino_log; /* #bits for agino in inum */
327 __uint8_t m_nreadaheads; /* #readahead buffers */
328 __uint16_t m_inode_cluster_size;/* min inode buf size */
329 uint m_blockmask; /* sb_blocksize-1 */
330 uint m_blockwsize; /* sb_blocksize in words */
331 uint m_blockwmask; /* blockwsize-1 */
332 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
333 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
334 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
335 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
336 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
337 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
338 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
339 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
340 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
341 struct xfs_perag *m_perag; /* per-ag accounting info */
342 struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */
343 sema_t m_growlock; /* growfs mutex */
344 int m_fixedfsid[2]; /* unchanged for life of FS */
345 uint m_dmevmask; /* DMI events for this FS */
346 uint m_flags; /* global mount flags */
347 uint m_attroffset; /* inode attribute offset */
348 uint m_dir_node_ents; /* #entries in a dir danode */
349 uint m_attr_node_ents; /* #entries in attr danode */
350 int m_ialloc_inos; /* inodes in inode allocation */
351 int m_ialloc_blks; /* blocks in inode allocation */
352 int m_litino; /* size of inode union area */
353 int m_inoalign_mask;/* mask sb_inoalignmt if used */
354 uint m_qflags; /* quota status flags */
355 xfs_trans_reservations_t m_reservations;/* precomputed res values */
356 __uint64_t m_maxicount; /* maximum inode count */
357 __uint64_t m_maxioffset; /* maximum inode offset */
358 __uint64_t m_resblks; /* total reserved blocks */
359 __uint64_t m_resblks_avail;/* available reserved blocks */
360#if XFS_BIG_INUMS
361 xfs_ino_t m_inoadd; /* add value for ino64_offset */
362#endif
363 int m_dalign; /* stripe unit */
364 int m_swidth; /* stripe width */
365 int m_sinoalign; /* stripe unit inode alignmnt */
366 int m_attr_magicpct;/* 37% of the blocksize */
367 int m_dir_magicpct; /* 37% of the dir blocksize */
368 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
369 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
370 field governed by m_ilock */
371 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
372 __uint8_t m_dirversion; /* 1 or 2 */
373 xfs_dirops_t m_dirops; /* table of dir funcs */
374 int m_dirblksize; /* directory block sz--bytes */
375 int m_dirblkfsbs; /* directory block sz--fsbs */
376 xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
377 xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
378 xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
379 uint m_chsize; /* size of next field */
380 struct xfs_chash *m_chash; /* fs private inode per-cluster
381 * hash table */
382 struct xfs_dmops m_dm_ops; /* vector of DMI ops */
383 struct xfs_qmops m_qm_ops; /* vector of XQM ops */
384 struct xfs_ioops m_io_ops; /* vector of I/O ops */
385 atomic_t m_active_trans; /* number trans frozen */
386} xfs_mount_t;
387
388/*
389 * Flags for m_flags.
390 */
391#define XFS_MOUNT_WSYNC 0x00000001 /* for nfs - all metadata ops
392 must be synchronous except
393 for space allocations */
394#define XFS_MOUNT_INO64 0x00000002
395 /* 0x00000004 -- currently unused */
396 /* 0x00000008 -- currently unused */
397#define XFS_MOUNT_FS_SHUTDOWN 0x00000010 /* atomic stop of all filesystem
398 operations, typically for
399 disk errors in metadata */
400#define XFS_MOUNT_NOATIME 0x00000020 /* don't modify inode access
401 times on reads */
402#define XFS_MOUNT_RETERR 0x00000040 /* return alignment errors to
403 user */
404#define XFS_MOUNT_NOALIGN 0x00000080 /* turn off stripe alignment
405 allocations */
406 /* 0x00000100 -- currently unused */
407 /* 0x00000200 -- currently unused */
408#define XFS_MOUNT_NORECOVERY 0x00000400 /* no recovery - dirty fs */
409#define XFS_MOUNT_SHARED 0x00000800 /* shared mount */
410#define XFS_MOUNT_DFLT_IOSIZE 0x00001000 /* set default i/o size */
411#define XFS_MOUNT_OSYNCISOSYNC 0x00002000 /* o_sync is REALLY o_sync */
412 /* osyncisdsync is now default*/
413#define XFS_MOUNT_32BITINODES 0x00004000 /* do not create inodes above
414 * 32 bits in size */
415#define XFS_MOUNT_32BITINOOPT 0x00008000 /* saved mount option state */
416#define XFS_MOUNT_NOUUID 0x00010000 /* ignore uuid during mount */
417#define XFS_MOUNT_NOLOGFLUSH 0x00020000
418#define XFS_MOUNT_IDELETE 0x00040000 /* delete empty inode clusters*/
419#define XFS_MOUNT_SWALLOC 0x00080000 /* turn on stripe width
420 * allocation */
421#define XFS_MOUNT_IHASHSIZE 0x00100000 /* inode hash table size */
422#define XFS_MOUNT_DIRSYNC 0x00200000 /* synchronous directory ops */
423
424/*
425 * Default minimum read and write sizes.
426 */
427#define XFS_READIO_LOG_LARGE 16
428#define XFS_WRITEIO_LOG_LARGE 16
429
430/*
431 * Max and min values for UIO and mount-option defined I/O sizes;
432 * min value can't be less than a page. Currently unused.
433 */
434#define XFS_MAX_IO_LOG 16 /* 64K */
435#define XFS_MIN_IO_LOG PAGE_SHIFT
436
437/*
438 * Synchronous read and write sizes. This should be
439 * better for NFSv2 wsync filesystems.
440 */
441#define XFS_WSYNC_READIO_LOG 15 /* 32K */
442#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */
443
444#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset)
445
446#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
447#define xfs_force_shutdown(m,f) \
448 VFS_FORCE_SHUTDOWN((XFS_MTOVFS(m)), f, __FILE__, __LINE__)
449
450/*
451 * Flags sent to xfs_force_shutdown.
452 */
453#define XFS_METADATA_IO_ERROR 0x1
454#define XFS_LOG_IO_ERROR 0x2
455#define XFS_FORCE_UMOUNT 0x4
456#define XFS_CORRUPT_INCORE 0x8 /* Corrupt in-memory data structures */
457#define XFS_SHUTDOWN_REMOTE_REQ 0x10 /* Shutdown came from remote cell */
458
459/*
460 * xflags for xfs_syncsub
461 */
462#define XFS_XSYNC_RELOC 0x01
463
464/*
465 * Flags for xfs_mountfs
466 */
467#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */
468#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */
469#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */
470 /* log recovery */
471#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */
472
473/*
474 * Macros for getting from mount to vfs and back.
475 */
476#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MTOVFS)
477struct vfs *xfs_mtovfs(xfs_mount_t *mp);
478#define XFS_MTOVFS(mp) xfs_mtovfs(mp)
479#else
480#define XFS_MTOVFS(mp) (bhvtovfs(&(mp)->m_bhv))
481#endif
482#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOM)
483xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp);
484#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp)
485#else
486#define XFS_BHVTOM(bdp) ((xfs_mount_t *)BHV_PDATA(bdp))
487#endif
488#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_VFSTOM)
489xfs_mount_t *xfs_vfstom(vfs_t *vfs);
490#define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
491#else
492#define XFS_VFSTOM(vfs) \
493 (XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)))
494#endif
495
496
497/*
498 * Moved here from xfs_ag.h to avoid reordering header files
499 */
500
501#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGNO)
502xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d);
503#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
504#else
505
506static inline xfs_agnumber_t XFS_DADDR_TO_AGNO(xfs_mount_t *mp, xfs_daddr_t d)
507{
508 d = XFS_BB_TO_FSBT(mp, d);
509 do_div(d, mp->m_sb.sb_agblocks);
510 return (xfs_agnumber_t) d;
511}
512
513#endif
514#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGBNO)
515xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d);
516#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d)
517#else
518
519static inline xfs_agblock_t XFS_DADDR_TO_AGBNO(xfs_mount_t *mp, xfs_daddr_t d)
520{
521 d = XFS_BB_TO_FSBT(mp, d);
522 return (xfs_agblock_t) do_div(d, mp->m_sb.sb_agblocks);
523}
524
525#endif
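/*
 * Worked example (editor's sketch, not in the original source): with
 * 4 KB filesystem blocks a basic block is 512 bytes, so
 * XFS_BB_TO_FSBT() shifts right by 3. For sb_agblocks = 16384, daddr
 * d = 262144 maps to fsbno 32768, giving XFS_DADDR_TO_AGNO() = 2 and
 * XFS_DADDR_TO_AGBNO() = 0.
 */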
526
527/*
528 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
529 */
530typedef struct xfs_mod_sb {
531 xfs_sb_field_t msb_field; /* Field to modify, see below */
532 int msb_delta; /* Change to make to specified field */
533} xfs_mod_sb_t;
534
535#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD)
536#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
537#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock)
538#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s))
539
540extern xfs_mount_t *xfs_mount_init(void);
541extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
542extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
543extern int xfs_mountfs(struct vfs *, xfs_mount_t *mp, int);
544
545extern int xfs_unmountfs(xfs_mount_t *, struct cred *);
546extern void xfs_unmountfs_wait(xfs_mount_t *);
547extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *);
548extern int xfs_unmountfs_writesb(xfs_mount_t *);
549extern int xfs_unmount_flush(xfs_mount_t *, int);
550extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
551extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
552 uint, int);
553extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
554extern int xfs_readsb(xfs_mount_t *mp);
555extern void xfs_freesb(xfs_mount_t *);
556extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
557extern int xfs_syncsub(xfs_mount_t *, int, int, int *);
558extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
559extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
560
561extern struct vfsops xfs_vfsops;
562extern struct vnodeops xfs_vnodeops;
563
564extern struct xfs_dmops xfs_dmcore_stub;
565extern struct xfs_qmops xfs_qmcore_stub;
566extern struct xfs_ioops xfs_iocore_xfs;
567
568extern int xfs_init(void);
569extern void xfs_cleanup(void);
570
571#endif /* __KERNEL__ */
572
573#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
new file mode 100644
index 000000000000..4f40c92863d5
--- /dev/null
+++ b/fs/xfs/xfs_qmops.c
@@ -0,0 +1,71 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#include "xfs.h"
33
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45
46
47STATIC struct xfs_dquot *
48xfs_dqvopchown_default(
49 struct xfs_trans *tp,
50 struct xfs_inode *ip,
51 struct xfs_dquot **dqp,
52 struct xfs_dquot *dq)
53{
54 return NULL;
55}
56
57xfs_qmops_t xfs_qmcore_stub = {
58 .xfs_qminit = (xfs_qminit_t) fs_noerr,
59 .xfs_qmdone = (xfs_qmdone_t) fs_noerr,
60 .xfs_qmmount = (xfs_qmmount_t) fs_noerr,
61 .xfs_qmunmount = (xfs_qmunmount_t) fs_noerr,
62 .xfs_dqrele = (xfs_dqrele_t) fs_noerr,
63 .xfs_dqattach = (xfs_dqattach_t) fs_noerr,
64 .xfs_dqdetach = (xfs_dqdetach_t) fs_noerr,
65 .xfs_dqpurgeall = (xfs_dqpurgeall_t) fs_noerr,
66 .xfs_dqvopalloc = (xfs_dqvopalloc_t) fs_noerr,
67 .xfs_dqvopcreate = (xfs_dqvopcreate_t) fs_noerr,
68 .xfs_dqvoprename = (xfs_dqvoprename_t) fs_noerr,
69 .xfs_dqvopchown = xfs_dqvopchown_default,
70 .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr,
71};
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000000..703ec4efcb41
--- /dev/null
+++ b/fs/xfs/xfs_quota.h
@@ -0,0 +1,356 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_QUOTA_H__
33#define __XFS_QUOTA_H__
34
35/*
36 * The ondisk form of a dquot structure.
37 */
38#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
39#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
40
41/*
42 * uid_t and gid_t are hard-coded to 32 bits in the inode.
43 * Hence, an 'id' in a dquot is 32 bits..
44 */
45typedef __int32_t xfs_dqid_t;
46
47/*
48 * Even though users may not have quota limits occupying all 64 bits,
49 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
50 * and quota-limits. This is a waste in the common case, but hey ...
51 */
52typedef __uint64_t xfs_qcnt_t;
53typedef __uint16_t xfs_qwarncnt_t;
54
55/*
56 * This is the main portion of the on-disk representation of quota
57 * information for a user. This is the q_core of the xfs_dquot_t that
58 * is kept in kernel memory. We pad this with some more expansion room
59 * to construct the on disk structure.
60 */
61typedef struct xfs_disk_dquot {
62/*16*/ u_int16_t d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
63/*8 */ u_int8_t d_version; /* dquot version */
64/*8 */ u_int8_t d_flags; /* XFS_DQ_USER/PROJ/GROUP */
65/*32*/ xfs_dqid_t d_id; /* user,project,group id */
66/*64*/ xfs_qcnt_t d_blk_hardlimit;/* absolute limit on disk blks */
67/*64*/ xfs_qcnt_t d_blk_softlimit;/* preferred limit on disk blks */
68/*64*/ xfs_qcnt_t d_ino_hardlimit;/* maximum # allocated inodes */
69/*64*/ xfs_qcnt_t d_ino_softlimit;/* preferred inode limit */
70/*64*/ xfs_qcnt_t d_bcount; /* disk blocks owned by the user */
71/*64*/ xfs_qcnt_t d_icount; /* inodes owned by the user */
72/*32*/	__int32_t	d_itimer;	/* zero if within inode limits; if not,
73					   this is when we refuse service */
74/*32*/ __int32_t d_btimer; /* similar to above; for disk blocks */
75/*16*/ xfs_qwarncnt_t d_iwarns; /* warnings issued wrt num inodes */
76/*16*/ xfs_qwarncnt_t d_bwarns; /* warnings issued wrt disk blocks */
77/*32*/ __int32_t d_pad0; /* 64 bit align */
78/*64*/ xfs_qcnt_t d_rtb_hardlimit;/* absolute limit on realtime blks */
79/*64*/ xfs_qcnt_t d_rtb_softlimit;/* preferred limit on RT disk blks */
80/*64*/ xfs_qcnt_t d_rtbcount; /* realtime blocks owned */
81/*32*/ __int32_t d_rtbtimer; /* similar to above; for RT disk blocks */
82/*16*/ xfs_qwarncnt_t d_rtbwarns; /* warnings issued wrt RT disk blocks */
83/*16*/ __uint16_t d_pad;
84} xfs_disk_dquot_t;
85
86/*
87 * This is what goes on disk. This is separated from the xfs_disk_dquot because
88 * carrying the unnecessary padding would be a waste of memory.
89 */
90typedef struct xfs_dqblk {
91 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
92 char dd_fill[32]; /* filling for posterity */
93} xfs_dqblk_t;
94
95/*
96 * flags for q_flags field in the dquot.
97 */
98#define XFS_DQ_USER 0x0001 /* a user quota */
99/* #define XFS_DQ_PROJ 0x0002 -- project quota (IRIX) */
100#define XFS_DQ_GROUP 0x0004 /* a group quota */
101#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */
102#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */
103#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */
104#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
105#define XFS_DQ_MARKER 0x0080 /* sentinel */
106
107/*
108 * In the worst case, when both user and group quotas are on,
109 * we can have a max of three dquots changing in a single transaction.
110 */
111#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3)
112
113
114/*
115 * These are the structures used to lay out dquots and quotaoff
116 * records on the log. Quite similar to those of inodes.
117 */
118
119/*
120 * log format struct for dquots.
121 * The first two fields must be the type and size fitting into
122 * 32 bits : log_recovery code assumes that.
123 */
124typedef struct xfs_dq_logformat {
125 __uint16_t qlf_type; /* dquot log item type */
126 __uint16_t qlf_size; /* size of this item */
127 xfs_dqid_t qlf_id; /* usr/grp id number : 32 bits */
128 __int64_t qlf_blkno; /* blkno of dquot buffer */
129 __int32_t qlf_len; /* len of dquot buffer */
130 __uint32_t qlf_boffset; /* off of dquot in buffer */
131} xfs_dq_logformat_t;
132
133/*
134 * log format struct for QUOTAOFF records.
135 * The first two fields must be the type and size fitting into
136 * 32 bits : log_recovery code assumes that.
137 * We write two LI_QUOTAOFF logitems per quotaoff; the last one keeps a pointer
138 * to the first and ensures that the first logitem is taken out of the AIL
139 * only when the last one is securely committed.
140 */
141typedef struct xfs_qoff_logformat {
142 unsigned short qf_type; /* quotaoff log item type */
143 unsigned short qf_size; /* size of this item */
144 unsigned int qf_flags; /* USR and/or GRP */
145 char qf_pad[12]; /* padding for future */
146} xfs_qoff_logformat_t;
147
148
149/*
150 * Disk quota status in m_qflags, and also sb_qflags. 16 bits.
151 */
152#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
153#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
154#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
155#define XFS_PQUOTA_ACCT 0x0008 /* (IRIX) project quota accounting ON */
156#define XFS_GQUOTA_ENFD 0x0010 /* group quota limits enforced */
157#define XFS_GQUOTA_CHKD 0x0020 /* quotacheck run on grp quotas */
158#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
159
160/*
161 * Incore only flags for quotaoff - these bits get cleared when quota(s)
162 * are in the process of getting turned off. These flags are in m_qflags but
163 * never in sb_qflags.
164 */
165#define XFS_UQUOTA_ACTIVE	0x0080  /* uquotas are active (not being turned off) */
166#define XFS_GQUOTA_ACTIVE	0x0100  /* gquotas are active (not being turned off) */
167
168/*
169 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
170 * quota will not be switched off as long as that inode lock is held.
171 */
172#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
173 XFS_GQUOTA_ACTIVE))
174#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
175#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
176
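/*
 * A minimal usage sketch (assuming the caller already holds an inode
 * lock, which per the comment above pins the quota-on state):
 *
 *	if (XFS_IS_UQUOTA_ON(mp)) {
 *		... user quota is on and stays on while the lock is held ...
 *	}
 */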
177/*
178 * Flags to tell various functions what to do. Not all of these are meaningful
179 * to a single function. None of these XFS_QMOPT_* flags are meant to have
180 * persistent values (i.e., their values can and will change between versions).
181 */
182#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */
183#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
184#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
185#define XFS_QMOPT_GQUOTA 0x0000008 /* group dquot requested */
186#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
187#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */
188#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
189#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */
190#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */
191#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */
192#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */
193#define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */
194#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */
195
196/*
197 * flags to xfs_trans_mod_dquot to indicate which field needs to be
198 * modified.
199 */
200#define XFS_QMOPT_RES_REGBLKS 0x0010000
201#define XFS_QMOPT_RES_RTBLKS 0x0020000
202#define XFS_QMOPT_BCOUNT 0x0040000
203#define XFS_QMOPT_ICOUNT 0x0080000
204#define XFS_QMOPT_RTBCOUNT 0x0100000
205#define XFS_QMOPT_DELBCOUNT 0x0200000
206#define XFS_QMOPT_DELRTBCOUNT 0x0400000
207#define XFS_QMOPT_RES_INOS 0x0800000
208
209/*
210 * flags for dqflush and dqflush_all.
211 */
212#define XFS_QMOPT_SYNC 0x1000000
213#define XFS_QMOPT_ASYNC 0x2000000
214#define XFS_QMOPT_DELWRI 0x4000000
215
216/*
217 * flags for dqalloc.
218 */
219#define XFS_QMOPT_INHERIT 0x8000000
220
221/*
222 * flags to xfs_trans_mod_dquot.
223 */
224#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
225#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
226#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
227#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
228#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
229#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
230#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
231#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
232
233
234#define XFS_QMOPT_QUOTALL (XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA)
235#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
236
237#ifdef __KERNEL__
238/*
239 * This check is typically done without holding the inode lock;
240 * that may seem racy, but it is harmless in the context in which it is
241 * used. The inode cannot go inactive as long as a reference is kept,
242 * and therefore if dquot(s) were attached, they'll stay consistent.
243 * If, for example, the ownership of the inode changes while
244 * we don't have the inode locked, the appropriate dquot(s) will be
245 * attached atomically.
246 */
247#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
248 (ip)->i_udquot == NULL) || \
249 (XFS_IS_GQUOTA_ON(mp) && \
250 (ip)->i_gdquot == NULL))
251
252#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \
253 (mp->m_sb.sb_qflags & \
254 XFS_UQUOTA_CHKD) == 0) || \
255 (XFS_IS_GQUOTA_ON(mp) && \
256 (mp->m_sb.sb_qflags & \
257 XFS_GQUOTA_CHKD) == 0))
258
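/*
 * A sketch of the intended attach-on-demand pattern (xfs_qm_dqattach
 * being the quota manager's attach routine; flags elided):
 *
 *	if (XFS_IS_QUOTA_ON(mp) && XFS_NOT_DQATTACHED(mp, ip))
 *		error = xfs_qm_dqattach(ip, 0);
 */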
259#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
260 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
261 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
262#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
263 XFS_GQUOTA_ACTIVE)
264
265
266/*
267 * The structure kept inside the xfs_trans_t keeps track of dquot changes
268 * within a transaction, so they can be applied later.
269 */
270typedef struct xfs_dqtrx {
271 struct xfs_dquot *qt_dquot; /* the dquot this refers to */
272 ulong qt_blk_res; /* blks reserved on a dquot */
273 ulong qt_blk_res_used; /* blks used from the reservation */
274	ulong		qt_ino_res;	/* inodes reserved on a dquot */
275 ulong qt_ino_res_used; /* inodes used from the reservation */
276 long qt_bcount_delta; /* dquot blk count changes */
277 long qt_delbcnt_delta; /* delayed dquot blk count changes */
278 long qt_icount_delta; /* dquot inode count changes */
279 ulong qt_rtblk_res; /* # blks reserved on a dquot */
280 ulong qt_rtblk_res_used;/* # blks used from reservation */
281 long qt_rtbcount_delta;/* dquot realtime blk changes */
282 long qt_delrtb_delta; /* delayed RT blk count changes */
283} xfs_dqtrx_t;
284
285/*
286 * Dquot transaction functions, used if quota is enabled.
287 */
288typedef void (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
289typedef void (*qo_mod_dquot_byino_t)(struct xfs_trans *,
290 struct xfs_inode *, uint, long);
291typedef void (*qo_free_dqinfo_t)(struct xfs_trans *);
292typedef void (*qo_apply_dquot_deltas_t)(struct xfs_trans *);
293typedef void (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
294typedef int (*qo_reserve_quota_nblks_t)(
295 struct xfs_trans *, struct xfs_mount *,
296 struct xfs_inode *, long, long, uint);
297typedef int (*qo_reserve_quota_bydquots_t)(
298 struct xfs_trans *, struct xfs_mount *,
299 struct xfs_dquot *, struct xfs_dquot *,
300 long, long, uint);
301typedef struct xfs_dqtrxops {
302 qo_dup_dqinfo_t qo_dup_dqinfo;
303 qo_free_dqinfo_t qo_free_dqinfo;
304 qo_mod_dquot_byino_t qo_mod_dquot_byino;
305 qo_apply_dquot_deltas_t qo_apply_dquot_deltas;
306 qo_reserve_quota_nblks_t qo_reserve_quota_nblks;
307 qo_reserve_quota_bydquots_t qo_reserve_quota_bydquots;
308 qo_unreserve_and_mod_dquots_t qo_unreserve_and_mod_dquots;
309} xfs_dqtrxops_t;
310
311#define XFS_DQTRXOP(mp, tp, op, args...) \
312 ((mp)->m_qm_ops.xfs_dqtrxops ? \
313 ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : 0)
314
315#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
316 ((mp)->m_qm_ops.xfs_dqtrxops ? \
317 ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : (void)0)
318
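/*
 * E.g. XFS_TRANS_DUP_DQINFO(mp, otp, ntp) below expands to a call of
 * qo_dup_dqinfo(otp, ntp) when a quota manager has registered its
 * xfs_dqtrxops, and to a no-op (or 0, for the int variant) when
 * xfs_dqtrxops is NULL.
 */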
319#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
320 XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
321#define XFS_TRANS_FREE_DQINFO(mp, tp) \
322 XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
323#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
324 XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
325#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
326 XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
327#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
328 XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
329#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
330 XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
331#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
332 XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
333
334#define XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks) \
335 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \
336 XFS_QMOPT_RES_REGBLKS)
337#define XFS_TRANS_RESERVE_BLKQUOTA_FORCE(mp, tp, ip, nblks) \
338 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \
339 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES)
340#define XFS_TRANS_UNRESERVE_BLKQUOTA(mp, tp, ip, nblks) \
341 XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), 0, \
342 XFS_QMOPT_RES_REGBLKS)
343#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
344 XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
345 f | XFS_QMOPT_RES_REGBLKS)
346#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
347 XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
348 f | XFS_QMOPT_RES_REGBLKS)
349
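/*
 * A sketch of the typical per-transaction flow these wrappers support
 * (the surrounding steps are illustrative only):
 *
 *	error = XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks);
 *	... allocate nblks blocks to ip ...
 *	XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, nblks);
 *	... xfs_trans_commit() applies the accumulated dquot deltas ...
 */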
350extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
351
352extern struct bhv_vfsops xfs_qmops;
353
354#endif /* __KERNEL__ */
355
356#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h
new file mode 100644
index 000000000000..cd8ddfd35d69
--- /dev/null
+++ b/fs/xfs/xfs_refcache.h
@@ -0,0 +1,66 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_REFCACHE_H__
33#define __XFS_REFCACHE_H__
34
35#ifdef HAVE_REFCACHE
36/*
37 * Maximum size (in inodes) for the NFS reference cache
38 */
39#define XFS_REFCACHE_SIZE_MAX 512
40
41struct xfs_inode;
42struct xfs_mount;
43
44extern void xfs_refcache_insert(struct xfs_inode *);
45extern void xfs_refcache_purge_ip(struct xfs_inode *);
46extern void xfs_refcache_purge_mp(struct xfs_mount *);
47extern void xfs_refcache_purge_some(struct xfs_mount *);
48extern void xfs_refcache_resize(int);
49extern void xfs_refcache_destroy(void);
50
51extern void xfs_refcache_iunlock(struct xfs_inode *, uint);
52
53#else
54
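/*
 * With HAVE_REFCACHE unset, stub the interface out so that callers
 * (e.g. xfs_refcache_purge_ip() in xfs_rename()) need no #ifdefs.
 */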
55#define xfs_refcache_insert(ip) do { } while (0)
56#define xfs_refcache_purge_ip(ip) do { } while (0)
57#define xfs_refcache_purge_mp(mp) do { } while (0)
58#define xfs_refcache_purge_some(mp) do { } while (0)
59#define xfs_refcache_resize(size) do { } while (0)
60#define xfs_refcache_destroy() do { } while (0)
61
62#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags)
63
64#endif
65
66#endif /* __XFS_REFCACHE_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
new file mode 100644
index 000000000000..cb13f9a1d45b
--- /dev/null
+++ b/fs/xfs/xfs_rename.c
@@ -0,0 +1,673 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_bmap_btree.h"
45#include "xfs_attr_sf.h"
46#include "xfs_dir_sf.h"
47#include "xfs_dir2_sf.h"
48#include "xfs_dinode.h"
49#include "xfs_inode_item.h"
50#include "xfs_inode.h"
51#include "xfs_bmap.h"
52#include "xfs_error.h"
53#include "xfs_quota.h"
54#include "xfs_refcache.h"
55#include "xfs_utils.h"
56#include "xfs_trans_space.h"
57#include "xfs_da_btree.h"
58#include "xfs_dir_leaf.h"
59
60
61/*
62 * Given an array of up to 4 inode pointers, unlock the pointed-to inodes.
63 * If there are fewer than 4 entries in the array, the empty entries will
64 * be at the end and will have NULL pointers in them.
65 */
66STATIC void
67xfs_rename_unlock4(
68 xfs_inode_t **i_tab,
69 uint lock_mode)
70{
71 int i;
72
73 xfs_iunlock(i_tab[0], lock_mode);
74 for (i = 1; i < 4; i++) {
75 if (i_tab[i] == NULL) {
76 break;
77 }
78 /*
79		 * Watch out for duplicate entries in the table (it is sorted, so duplicates are adjacent).
80 */
81 if (i_tab[i] != i_tab[i-1]) {
82 xfs_iunlock(i_tab[i], lock_mode);
83 }
84 }
85}
86
87#ifdef DEBUG
88int xfs_rename_skip, xfs_rename_nskip;
89#endif
90
91/*
92 * The following routine will acquire the locks required for a rename
93 * operation. The code understands the semantics of renames and will
94 * validate that name1 exists under dp1 and look up name2, which may
95 * or may not exist under dp2.
96 *
97 * We are renaming dp1/name1 to dp2/name2.
98 *
99 * Return ENOENT if name1 does not exist under dp1, other lookup errors, or 0 for success.
100 */
101STATIC int
102xfs_lock_for_rename(
103 xfs_inode_t *dp1, /* old (source) directory inode */
104 xfs_inode_t *dp2, /* new (target) directory inode */
105 vname_t *vname1,/* old entry name */
106 vname_t *vname2,/* new entry name */
107 xfs_inode_t **ipp1, /* inode of old entry */
108 xfs_inode_t **ipp2, /* inode of new entry, if it
109 already exists, NULL otherwise. */
110	xfs_inode_t	**i_tab,/* array of inodes returned, sorted */
111 int *num_inodes) /* number of inodes in array */
112{
113 xfs_inode_t *ip1, *ip2, *temp;
114 xfs_ino_t inum1, inum2;
115 int error;
116 int i, j;
117 uint lock_mode;
118 int diff_dirs = (dp1 != dp2);
119
120 ip2 = NULL;
121
122 /*
123 * First, find out the current inums of the entries so that we
124 * can determine the initial locking order. We'll have to
125 * sanity check stuff after all the locks have been acquired
126 * to see if we still have the right inodes, directories, etc.
127 */
128 lock_mode = xfs_ilock_map_shared(dp1);
129 error = xfs_get_dir_entry(vname1, &ip1);
130 if (error) {
131 xfs_iunlock_map_shared(dp1, lock_mode);
132 return error;
133 }
134
135 inum1 = ip1->i_ino;
136
137 ASSERT(ip1);
138 ITRACE(ip1);
139
140 /*
141 * Unlock dp1 and lock dp2 if they are different.
142 */
143
144 if (diff_dirs) {
145 xfs_iunlock_map_shared(dp1, lock_mode);
146 lock_mode = xfs_ilock_map_shared(dp2);
147 }
148
149 error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lock_mode,
150 vname2, &inum2, &ip2);
151 if (error == ENOENT) { /* target does not need to exist. */
152 inum2 = 0;
153 } else if (error) {
154 /*
155 * If dp2 and dp1 are the same, the next line unlocks dp1.
156 * Got it?
157 */
158 xfs_iunlock_map_shared(dp2, lock_mode);
159 IRELE (ip1);
160 return error;
161 } else {
162 ITRACE(ip2);
163 }
164
165 /*
166 * i_tab contains a list of pointers to inodes. We initialize
167 * the table here & we'll sort it. We will then use it to
168 * order the acquisition of the inode locks.
169 *
170	 * Note that the table may contain duplicates; e.g., dp1 == dp2.
171 */
172 i_tab[0] = dp1;
173 i_tab[1] = dp2;
174 i_tab[2] = ip1;
175 if (inum2 == 0) {
176 *num_inodes = 3;
177 i_tab[3] = NULL;
178 } else {
179 *num_inodes = 4;
180 i_tab[3] = ip2;
181 }
182
183 /*
184 * Sort the elements via bubble sort. (Remember, there are at
185 * most 4 elements to sort, so this is adequate.)
186 */
187 for (i=0; i < *num_inodes; i++) {
188 for (j=1; j < *num_inodes; j++) {
189 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
190 temp = i_tab[j];
191 i_tab[j] = i_tab[j-1];
192 i_tab[j-1] = temp;
193 }
194 }
195 }
196
197 /*
198 * We have dp2 locked. If it isn't first, unlock it.
199 * If it is first, tell xfs_lock_inodes so it can skip it
200	 * when locking. If dp1 == dp2, xfs_lock_inodes will skip both
201	 * since they are equal. xfs_lock_inodes needs all these inodes
202	 * so that it can unlock and retry if there is a potential
203	 * deadlock with the log.
204 */
205
206 if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) {
207#ifdef DEBUG
208 xfs_rename_skip++;
209#endif
210 xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED);
211 } else {
212#ifdef DEBUG
213 xfs_rename_nskip++;
214#endif
215 xfs_iunlock_map_shared(dp2, lock_mode);
216 xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
217 }
218
219 /*
220 * Set the return value. Null out any unused entries in i_tab.
221 */
222 *ipp1 = *ipp2 = NULL;
223 for (i=0; i < *num_inodes; i++) {
224 if (i_tab[i]->i_ino == inum1) {
225 *ipp1 = i_tab[i];
226 }
227 if (i_tab[i]->i_ino == inum2) {
228 *ipp2 = i_tab[i];
229 }
230 }
231 for (;i < 4; i++) {
232 i_tab[i] = NULL;
233 }
234 return 0;
235}
236
237
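/* Debug aid: records the __LINE__ of the most recent xfs_rename() failure. */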
238int rename_which_error_return = 0;
239
240/*
241 * xfs_rename
242 */
243int
244xfs_rename(
245 bhv_desc_t *src_dir_bdp,
246 vname_t *src_vname,
247 vnode_t *target_dir_vp,
248 vname_t *target_vname,
249 cred_t *credp)
250{
251 xfs_trans_t *tp;
252 xfs_inode_t *src_dp, *target_dp, *src_ip, *target_ip;
253 xfs_mount_t *mp;
254 int new_parent; /* moving to a new dir */
255 int src_is_directory; /* src_name is a directory */
256 int error;
257 xfs_bmap_free_t free_list;
258 xfs_fsblock_t first_block;
259 int cancel_flags;
260 int committed;
261 xfs_inode_t *inodes[4];
262 int target_ip_dropped = 0; /* dropped target_ip link? */
263 vnode_t *src_dir_vp;
264 bhv_desc_t *target_dir_bdp;
265 int spaceres;
266 int target_link_zero = 0;
267 int num_inodes;
268 char *src_name = VNAME(src_vname);
269 char *target_name = VNAME(target_vname);
270 int src_namelen = VNAMELEN(src_vname);
271 int target_namelen = VNAMELEN(target_vname);
272
273 src_dir_vp = BHV_TO_VNODE(src_dir_bdp);
274 vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address);
275 vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address);
276
277 /*
278 * Find the XFS behavior descriptor for the target directory
279 * vnode since it was not handed to us.
280 */
281 target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp),
282 &xfs_vnodeops);
283 if (target_dir_bdp == NULL) {
284 return XFS_ERROR(EXDEV);
285 }
286
287 src_dp = XFS_BHVTOI(src_dir_bdp);
288 target_dp = XFS_BHVTOI(target_dir_bdp);
289 mp = src_dp->i_mount;
290
291 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
292 DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
293 target_dp, DM_EVENT_RENAME)) {
294 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
295 src_dir_vp, DM_RIGHT_NULL,
296 target_dir_vp, DM_RIGHT_NULL,
297 src_name, target_name,
298 0, 0, 0);
299 if (error) {
300 return error;
301 }
302 }
303 /* Return through std_return after this point. */
304
305 /*
306 * Lock all the participating inodes. Depending upon whether
307 * the target_name exists in the target directory, and
308 * whether the target directory is the same as the source
309 * directory, we can lock from 2 to 4 inodes.
310 * xfs_lock_for_rename() will return ENOENT if src_name
311 * does not exist in the source directory.
312 */
313 tp = NULL;
314 error = xfs_lock_for_rename(src_dp, target_dp, src_vname,
315 target_vname, &src_ip, &target_ip, inodes,
316 &num_inodes);
317
318 if (error) {
319 rename_which_error_return = __LINE__;
320 /*
321 * We have nothing locked, no inode references, and
322 * no transaction, so just get out.
323 */
324 goto std_return;
325 }
326
327 ASSERT(src_ip != NULL);
328
329 if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
330 /*
331 * Check for link count overflow on target_dp
332 */
333 if (target_ip == NULL && (src_dp != target_dp) &&
334 target_dp->i_d.di_nlink >= XFS_MAXLINK) {
335 rename_which_error_return = __LINE__;
336 error = XFS_ERROR(EMLINK);
337 xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
338 goto rele_return;
339 }
340 }
341
342 new_parent = (src_dp != target_dp);
343 src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR);
344
345 /*
346 * Drop the locks on our inodes so that we can start the transaction.
347 */
348 xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
349
350 XFS_BMAP_INIT(&free_list, &first_block);
351 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
352 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
353 spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
354 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
355 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
356 if (error == ENOSPC) {
357 spaceres = 0;
358 error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
359 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
360 }
361 if (error) {
362 rename_which_error_return = __LINE__;
363 xfs_trans_cancel(tp, 0);
364 goto rele_return;
365 }
366
367 /*
368 * Attach the dquots to the inodes
369 */
370 if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
371 xfs_trans_cancel(tp, cancel_flags);
372 rename_which_error_return = __LINE__;
373 goto rele_return;
374 }
375
376 /*
377 * Reacquire the inode locks we dropped above.
378 */
379 xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL);
380
381 /*
382 * Join all the inodes to the transaction. From this point on,
383 * we can rely on either trans_commit or trans_cancel to unlock
384 * them. Note that we need to add a vnode reference to the
385 * directories since trans_commit & trans_cancel will decrement
386 * them when they unlock the inodes. Also, we need to be careful
387 * not to add an inode to the transaction more than once.
388 */
389 VN_HOLD(src_dir_vp);
390 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
391 if (new_parent) {
392 VN_HOLD(target_dir_vp);
393 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
394 }
395 if ((src_ip != src_dp) && (src_ip != target_dp)) {
396 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
397 }
398 if ((target_ip != NULL) &&
399 (target_ip != src_ip) &&
400 (target_ip != src_dp) &&
401 (target_ip != target_dp)) {
402 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
403 }
404
405 /*
406 * Set up the target.
407 */
408 if (target_ip == NULL) {
409 /*
410 * If there's no space reservation, check the entry will
411 * fit before actually inserting it.
412 */
413 if (spaceres == 0 &&
414 (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
415 target_namelen))) {
416 rename_which_error_return = __LINE__;
417 goto error_return;
418 }
419 /*
420 * If target does not exist and the rename crosses
421 * directories, adjust the target directory link count
422 * to account for the ".." reference from the new entry.
423 */
424 error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
425 target_namelen, src_ip->i_ino,
426 &first_block, &free_list, spaceres);
427 if (error == ENOSPC) {
428 rename_which_error_return = __LINE__;
429 goto error_return;
430 }
431 if (error) {
432 rename_which_error_return = __LINE__;
433 goto abort_return;
434 }
435 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
436
437 if (new_parent && src_is_directory) {
438 error = xfs_bumplink(tp, target_dp);
439 if (error) {
440 rename_which_error_return = __LINE__;
441 goto abort_return;
442 }
443 }
444 } else { /* target_ip != NULL */
445
446 /*
447 * If target exists and it's a directory, check that both
448 * target and source are directories and that target can be
449 * destroyed, or that neither is a directory.
450 */
451 if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
452 /*
453 * Make sure target dir is empty.
454 */
455 if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
456 (target_ip->i_d.di_nlink > 2)) {
457 error = XFS_ERROR(EEXIST);
458 rename_which_error_return = __LINE__;
459 goto error_return;
460 }
461 }
462
463 /*
464 * Link the source inode under the target name.
465 * If the source inode is a directory and we are moving
466 * it across directories, its ".." entry will be
467 * inconsistent until we replace that down below.
468 *
469 * In case there is already an entry with the same
470 * name at the destination directory, remove it first.
471 */
472 error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
473 target_namelen, src_ip->i_ino, &first_block,
474 &free_list, spaceres);
475 if (error) {
476 rename_which_error_return = __LINE__;
477 goto abort_return;
478 }
479 xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
480
481 /*
482 * Decrement the link count on the target since the target
483 * dir no longer points to it.
484 */
485 error = xfs_droplink(tp, target_ip);
486 if (error) {
487 rename_which_error_return = __LINE__;
488 goto abort_return;
489 }
490 target_ip_dropped = 1;
491
492 if (src_is_directory) {
493 /*
494 * Drop the link from the old "." entry.
495 */
496 error = xfs_droplink(tp, target_ip);
497 if (error) {
498 rename_which_error_return = __LINE__;
499 goto abort_return;
500 }
501 }
502
503 /* Do this test while we still hold the locks */
504 target_link_zero = (target_ip)->i_d.di_nlink==0;
505
506 } /* target_ip != NULL */
507
508 /*
509 * Remove the source.
510 */
511 if (new_parent && src_is_directory) {
512
513 /*
514 * Rewrite the ".." entry to point to the new
515 * directory.
516 */
517 error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
518 target_dp->i_ino, &first_block,
519 &free_list, spaceres);
520 ASSERT(error != EEXIST);
521 if (error) {
522 rename_which_error_return = __LINE__;
523 goto abort_return;
524 }
525 xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
526
527 } else {
528 /*
529 * We always want to hit the ctime on the source inode.
530 * We do it in the if clause above for the 'new_parent &&
531 * src_is_directory' case, and here we get all the other
532 * cases. This isn't strictly required by the standards
533 * since the source inode isn't really being changed,
534 * but old unix file systems did it and some incremental
535 * backup programs won't work without it.
536 */
537 xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
538 }
539
540 /*
541 * Adjust the link count on src_dp. This is necessary when
542 * renaming a directory, either within one parent when
543 * the target existed, or across two parent directories.
544 */
545 if (src_is_directory && (new_parent || target_ip != NULL)) {
546
547 /*
548 * Decrement link count on src_directory since the
549 * entry that's moved no longer points to it.
550 */
551 error = xfs_droplink(tp, src_dp);
552 if (error) {
553 rename_which_error_return = __LINE__;
554 goto abort_return;
555 }
556 }
557
558 error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
559 src_ip->i_ino, &first_block, &free_list, spaceres);
560 if (error) {
561 rename_which_error_return = __LINE__;
562 goto abort_return;
563 }
564 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
565
566 /*
567 * Update the generation counts on all the directory inodes
568 * that we're modifying.
569 */
570 src_dp->i_gen++;
571 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
572
573 if (new_parent) {
574 target_dp->i_gen++;
575 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
576 }
577
578 /*
579 * If there was a target inode, take an extra reference on
580 * it here so that it doesn't go to xfs_inactive() from
581 * within the commit.
582 */
583 if (target_ip != NULL) {
584 IHOLD(target_ip);
585 }
586
587 /*
588 * If this is a synchronous mount, make sure that the
589 * rename transaction goes to disk before returning to
590 * the user.
591 */
592 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
593 xfs_trans_set_sync(tp);
594 }
595
596 /*
597 * Take refs. for vop_link_removed calls below. No need to worry
598 * about directory refs. because the caller holds them.
599 *
600 * Do holds before the xfs_bmap_finish since it might rele them down
601 * to zero.
602 */
603
604 if (target_ip_dropped)
605 IHOLD(target_ip);
606 IHOLD(src_ip);
607
608 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
609 if (error) {
610 xfs_bmap_cancel(&free_list);
611 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
612 XFS_TRANS_ABORT));
613 if (target_ip != NULL) {
614 IRELE(target_ip);
615 }
616 if (target_ip_dropped) {
617 IRELE(target_ip);
618 }
619 IRELE(src_ip);
620 goto std_return;
621 }
622
623 /*
624 * trans_commit will unlock src_ip, target_ip & decrement
625 * the vnode references.
626 */
627 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
628 if (target_ip != NULL) {
629 xfs_refcache_purge_ip(target_ip);
630 IRELE(target_ip);
631 }
632 /*
633 * Let interposed file systems know about removed links.
634 */
635 if (target_ip_dropped) {
636 VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
637 target_link_zero);
638 IRELE(target_ip);
639 }
640
641 FSC_NOTIFY_NAME_CHANGED(XFS_ITOV(src_ip));
642
643 IRELE(src_ip);
644
645 /* Fall through to std_return with error = 0 or errno from
646 * xfs_trans_commit */
647std_return:
648 if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) ||
649 DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
650 target_dp, DM_EVENT_POSTRENAME)) {
651 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
652 src_dir_vp, DM_RIGHT_NULL,
653 target_dir_vp, DM_RIGHT_NULL,
654 src_name, target_name,
655 0, error, 0);
656 }
657 return error;
658
659 abort_return:
660 cancel_flags |= XFS_TRANS_ABORT;
661 /* FALLTHROUGH */
662 error_return:
663 xfs_bmap_cancel(&free_list);
664 xfs_trans_cancel(tp, cancel_flags);
665 goto std_return;
666
667 rele_return:
668 IRELE(src_ip);
669 if (target_ip != NULL) {
670 IRELE(target_ip);
671 }
672 goto std_return;
673}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 000000000000..2c37822d1012
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,2469 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33/*
34 * Free realtime space allocation for XFS.
35 */
36
37#include "xfs.h"
38#include "xfs_macros.h"
39#include "xfs_types.h"
40#include "xfs_inum.h"
41#include "xfs_log.h"
42#include "xfs_trans.h"
43#include "xfs_sb.h"
44#include "xfs_ag.h"
45#include "xfs_dir.h"
46#include "xfs_dir2.h"
47#include "xfs_dmapi.h"
48#include "xfs_mount.h"
49#include "xfs_alloc_btree.h"
50#include "xfs_bmap_btree.h"
51#include "xfs_ialloc_btree.h"
52#include "xfs_btree.h"
53#include "xfs_ialloc.h"
54#include "xfs_attr_sf.h"
55#include "xfs_dir_sf.h"
56#include "xfs_dir2_sf.h"
57#include "xfs_dinode.h"
58#include "xfs_inode.h"
59#include "xfs_alloc.h"
60#include "xfs_bmap.h"
61#include "xfs_bit.h"
62#include "xfs_rtalloc.h"
63#include "xfs_fsops.h"
64#include "xfs_error.h"
65#include "xfs_rw.h"
66#include "xfs_inode_item.h"
67#include "xfs_trans_space.h"
68
69
70/*
71 * Prototypes for internal functions.
72 */
73
74
75STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
76 xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
77STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
78 xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
79STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
80 xfs_extlen_t, int, xfs_rtblock_t *, int *);
81STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
82 xfs_rtblock_t, xfs_rtblock_t *);
83STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
84 xfs_rtblock_t, xfs_rtblock_t *);
85STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
86 xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
87STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
88 xfs_extlen_t, int);
89STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
90 xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
91
92/*
93 * Internal functions.
94 */
95
96/*
97 * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
98 */
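/* For example, xfs_lowbit32(0x18) == 3 and xfs_lowbit32(0) == -1. */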
99STATIC int
100xfs_lowbit32(
101 __uint32_t v)
102{
103 if (v)
104 return ffs(v) - 1;
105 return -1;
106}
107
108/*
109 * Allocate space to the bitmap or summary file, and zero it, for growfs.
110 */
111STATIC int /* error */
112xfs_growfs_rt_alloc(
113 xfs_mount_t *mp, /* file system mount point */
114 xfs_extlen_t oblocks, /* old count of blocks */
115 xfs_extlen_t nblocks, /* new count of blocks */
116 xfs_ino_t ino) /* inode number (bitmap/summary) */
117{
118 xfs_fileoff_t bno; /* block number in file */
119 xfs_buf_t *bp; /* temporary buffer for zeroing */
120 int cancelflags; /* flags for xfs_trans_cancel */
121 int committed; /* transaction committed flag */
122 xfs_daddr_t d; /* disk block address */
123 int error; /* error return value */
124 xfs_fsblock_t firstblock; /* first block allocated in xaction */
125 xfs_bmap_free_t flist; /* list of freed blocks */
126 xfs_fsblock_t fsbno; /* filesystem block for bno */
127 xfs_inode_t *ip; /* pointer to incore inode */
128 xfs_bmbt_irec_t map; /* block map output */
129 int nmap; /* number of block maps */
130 int resblks; /* space reservation */
131 xfs_trans_t *tp; /* transaction pointer */
132
133 /*
134 * Allocate space to the file, as necessary.
135 */
136 while (oblocks < nblocks) {
137 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
138 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
139 cancelflags = 0;
140 /*
141 * Reserve space & log for one extent added to the file.
142 */
143 if ((error = xfs_trans_reserve(tp, resblks,
144 XFS_GROWRTALLOC_LOG_RES(mp), 0,
145 XFS_TRANS_PERM_LOG_RES,
146 XFS_DEFAULT_PERM_LOG_COUNT)))
147 goto error_exit;
148 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
149 /*
150 * Lock the inode.
151 */
152 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip)))
153 goto error_exit;
154 XFS_BMAP_INIT(&flist, &firstblock);
155 /*
156 * Allocate blocks to the bitmap file.
157 */
158 nmap = 1;
159 cancelflags |= XFS_TRANS_ABORT;
160 error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
161 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
162 resblks, &map, &nmap, &flist);
163 if (!error && nmap < 1)
164 error = XFS_ERROR(ENOSPC);
165 if (error)
166 goto error_exit;
167 /*
168 * Free any blocks freed up in the transaction, then commit.
169 */
170 error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
171 if (error)
172 goto error_exit;
173 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
174 /*
175 * Now we need to clear the allocated blocks.
176 * Do this one block per transaction, to keep it simple.
177 */
178 cancelflags = 0;
179 for (bno = map.br_startoff, fsbno = map.br_startblock;
180 bno < map.br_startoff + map.br_blockcount;
181 bno++, fsbno++) {
182 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
183 /*
184 * Reserve log for one block zeroing.
185 */
186 if ((error = xfs_trans_reserve(tp, 0,
187 XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
188 goto error_exit;
189 /*
190 * Lock the bitmap inode.
191 */
192 if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL,
193 &ip)))
194 goto error_exit;
195 /*
196 * Get a buffer for the block.
197 */
198 d = XFS_FSB_TO_DADDR(mp, fsbno);
199 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
200 mp->m_bsize, 0);
201 if (bp == NULL) {
202 error = XFS_ERROR(EIO);
203 goto error_exit;
204 }
205 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
206 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
207 /*
208 * Commit the transaction.
209 */
210 xfs_trans_commit(tp, 0, NULL);
211 }
212 /*
213 * Go on to the next extent, if any.
214 */
215 oblocks = map.br_startoff + map.br_blockcount;
216 }
217 return 0;
218error_exit:
219 xfs_trans_cancel(tp, cancelflags);
220 return error;
221}
222
223/*
224 * Attempt to allocate an extent minlen<=len<=maxlen starting from
225 * bitmap block bbno. If we don't get maxlen then use prod to trim
226 * the length, if given. Returns error; returns starting block in *rtblock.
227 * The lengths are all in rtextents.
228 */
229STATIC int /* error */
230xfs_rtallocate_extent_block(
231 xfs_mount_t *mp, /* file system mount point */
232 xfs_trans_t *tp, /* transaction pointer */
233 xfs_rtblock_t bbno, /* bitmap block number */
234 xfs_extlen_t minlen, /* minimum length to allocate */
235 xfs_extlen_t maxlen, /* maximum length to allocate */
236 xfs_extlen_t *len, /* out: actual length allocated */
237 xfs_rtblock_t *nextp, /* out: next block to try */
238 xfs_buf_t **rbpp, /* in/out: summary block buffer */
239 xfs_fsblock_t *rsb, /* in/out: summary block number */
240 xfs_extlen_t prod, /* extent product factor */
241 xfs_rtblock_t *rtblock) /* out: start block allocated */
242{
243 xfs_rtblock_t besti; /* best rtblock found so far */
244 xfs_rtblock_t bestlen; /* best length found so far */
245 xfs_rtblock_t end; /* last rtblock in chunk */
246 int error; /* error value */
247 xfs_rtblock_t i; /* current rtblock trying */
248 xfs_rtblock_t next; /* next rtblock to try */
249 int stat; /* status from internal calls */
250
251 /*
252 * Loop over all the extents starting in this bitmap block,
253 * looking for one that's long enough.
254 */
255 for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
256 end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
257 i <= end;
258 i++) {
259 /*
260 * See if there's a free extent of maxlen starting at i.
261		 * If not, next will contain the first non-free block.
262 */
263 error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
264 if (error) {
265 return error;
266 }
267 if (stat) {
268 /*
269			 * The maxlen blocks at i are all free; allocate and return them.
270 */
271 error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
272 rsb);
273 if (error) {
274 return error;
275 }
276 *len = maxlen;
277 *rtblock = i;
278 return 0;
279 }
280 /*
281 * In the case where we have a variable-sized allocation
282 * request, figure out how big this free piece is,
283		 * and if it's at least the minimum and larger than the
284		 * best so far, remember it.
285 */
286 if (minlen < maxlen) {
287 xfs_rtblock_t thislen; /* this extent size */
288
289 thislen = next - i;
290 if (thislen >= minlen && thislen > bestlen) {
291 besti = i;
292 bestlen = thislen;
293 }
294 }
295 /*
296 * If not done yet, find the start of the next free space.
297 */
298 if (next < end) {
299 error = xfs_rtfind_forw(mp, tp, next, end, &i);
300 if (error) {
301 return error;
302 }
303 } else
304 break;
305 }
306 /*
307 * Searched the whole thing & didn't find a maxlen free extent.
308 */
309 if (minlen < maxlen && besti != -1) {
310 xfs_extlen_t p; /* amount to trim length by */
311
312 /*
313 * If size should be a multiple of prod, make that so.
314 */
315 if (prod > 1 && (p = do_mod(bestlen, prod)))
316 bestlen -= p;
317 /*
318 * Allocate besti for bestlen & return that.
319 */
320 error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
321 if (error) {
322 return error;
323 }
324 *len = bestlen;
325 *rtblock = besti;
326 return 0;
327 }
328 /*
329 * Allocation failed. Set *nextp to the next block to try.
330 */
331 *nextp = next;
332 *rtblock = NULLRTBLOCK;
333 return 0;
334}
335
336/*
337 * Allocate an extent of length minlen<=len<=maxlen, starting at block
338 * bno. If we don't get maxlen then use prod to trim the length, if given.
339 * Returns error; returns starting block in *rtblock.
340 * The lengths are all in rtextents.
341 */
342STATIC int /* error */
343xfs_rtallocate_extent_exact(
344 xfs_mount_t *mp, /* file system mount point */
345 xfs_trans_t *tp, /* transaction pointer */
346 xfs_rtblock_t bno, /* starting block number to allocate */
347 xfs_extlen_t minlen, /* minimum length to allocate */
348 xfs_extlen_t maxlen, /* maximum length to allocate */
349 xfs_extlen_t *len, /* out: actual length allocated */
350 xfs_buf_t **rbpp, /* in/out: summary block buffer */
351 xfs_fsblock_t *rsb, /* in/out: summary block number */
352 xfs_extlen_t prod, /* extent product factor */
353 xfs_rtblock_t *rtblock) /* out: start block allocated */
354{
355 int error; /* error value */
356 xfs_extlen_t i; /* extent length trimmed due to prod */
357 int isfree; /* extent is free */
358 xfs_rtblock_t next; /* next block to try (dummy) */
359
360 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
361 /*
362 * Check if the range in question (for maxlen) is free.
363 */
364 error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
365 if (error) {
366 return error;
367 }
368 if (isfree) {
369 /*
370 * If it is, allocate it and return success.
371 */
372 error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
373 if (error) {
374 return error;
375 }
376 *len = maxlen;
377 *rtblock = bno;
378 return 0;
379 }
380 /*
381 * If not, allocate what there is, if it's at least minlen.
382 */
383 maxlen = next - bno;
384 if (maxlen < minlen) {
385 /*
386 * Failed, return failure status.
387 */
388 *rtblock = NULLRTBLOCK;
389 return 0;
390 }
391 /*
392 * Trim off tail of extent, if prod is specified.
393 */
394 if (prod > 1 && (i = maxlen % prod)) {
395 maxlen -= i;
396 if (maxlen < minlen) {
397 /*
398 * Now we can't do it, return failure status.
399 */
400 *rtblock = NULLRTBLOCK;
401 return 0;
402 }
403 }
404 /*
405 * Allocate what we can and return it.
406 */
407 error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
408 if (error) {
409 return error;
410 }
411 *len = maxlen;
412 *rtblock = bno;
413 return 0;
414}
415
416/*
417 * Allocate an extent of length minlen<=len<=maxlen, starting as near
418 * to bno as possible. If we don't get maxlen then use prod to trim
419 * the length, if given. The lengths are all in rtextents.
420 */
421STATIC int /* error */
422xfs_rtallocate_extent_near(
423 xfs_mount_t *mp, /* file system mount point */
424 xfs_trans_t *tp, /* transaction pointer */
425 xfs_rtblock_t bno, /* starting block number to allocate */
426 xfs_extlen_t minlen, /* minimum length to allocate */
427 xfs_extlen_t maxlen, /* maximum length to allocate */
428 xfs_extlen_t *len, /* out: actual length allocated */
429 xfs_buf_t **rbpp, /* in/out: summary block buffer */
430 xfs_fsblock_t *rsb, /* in/out: summary block number */
431 xfs_extlen_t prod, /* extent product factor */
432 xfs_rtblock_t *rtblock) /* out: start block allocated */
433{
434 int any; /* any useful extents from summary */
435 xfs_rtblock_t bbno; /* bitmap block number */
436 int error; /* error value */
437 int i; /* bitmap block offset (loop control) */
438 int j; /* secondary loop control */
439 int log2len; /* log2 of minlen */
440 xfs_rtblock_t n; /* next block to try */
441 xfs_rtblock_t r; /* result block */
442
443 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
444 /*
445 * If the block number given is off the end, silently set it to
446 * the last block.
447 */
448 if (bno >= mp->m_sb.sb_rextents)
449 bno = mp->m_sb.sb_rextents - 1;
450 /*
451 * Try the exact allocation first.
452 */
453 error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
454 rbpp, rsb, prod, &r);
455 if (error) {
456 return error;
457 }
458 /*
459 * If the exact allocation worked, return that.
460 */
461 if (r != NULLRTBLOCK) {
462 *rtblock = r;
463 return 0;
464 }
465 bbno = XFS_BITTOBLOCK(mp, bno);
466 i = 0;
467 log2len = xfs_highbit32(minlen);
468 /*
469 * Loop over all bitmap blocks (bbno + i is current block).
470 */
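	/*
	 * The loop control at the bottom expands i outward from 0,
	 * preferring the positive direction: e.g. for bbno = 10 the
	 * blocks are tried in the order 10, 11, 9, 12, 8, ...
	 * (clipped at either end of the bitmap).
	 */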
471 for (;;) {
472 /*
473 * Get summary information of extents of all useful levels
474 * starting in this bitmap block.
475 */
476 error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
477 bbno + i, rbpp, rsb, &any);
478 if (error) {
479 return error;
480 }
481 /*
482 * If there are any useful extents starting here, try
483 * allocating one.
484 */
485 if (any) {
486 /*
487 * On the positive side of the starting location.
488 */
489 if (i >= 0) {
490 /*
491 * Try to allocate an extent starting in
492 * this block.
493 */
494 error = xfs_rtallocate_extent_block(mp, tp,
495 bbno + i, minlen, maxlen, len, &n, rbpp,
496 rsb, prod, &r);
497 if (error) {
498 return error;
499 }
500 /*
501 * If it worked, return it.
502 */
503 if (r != NULLRTBLOCK) {
504 *rtblock = r;
505 return 0;
506 }
507 }
508 /*
509 * On the negative side of the starting location.
510 */
511 else { /* i < 0 */
512 /*
513 * Loop backwards through the bitmap blocks from
514				 * the starting point - 1 up to where we are now.
515 * There should be an extent which ends in this
516 * bitmap block and is long enough.
517 */
518 for (j = -1; j > i; j--) {
519 /*
520 * Grab the summary information for
521 * this bitmap block.
522 */
523 error = xfs_rtany_summary(mp, tp,
524 log2len, mp->m_rsumlevels - 1,
525 bbno + j, rbpp, rsb, &any);
526 if (error) {
527 return error;
528 }
529 /*
530 * If there's no extent given in the
531 * summary that means the extent we
532 * found must carry over from an
533 * earlier block. If there is an
534 * extent given, we've already tried
535					 * that allocation; don't do it again.
536 */
537 if (any)
538 continue;
539 error = xfs_rtallocate_extent_block(mp,
540 tp, bbno + j, minlen, maxlen,
541 len, &n, rbpp, rsb, prod, &r);
542 if (error) {
543 return error;
544 }
545 /*
546 * If it works, return the extent.
547 */
548 if (r != NULLRTBLOCK) {
549 *rtblock = r;
550 return 0;
551 }
552 }
553 /*
554 * There weren't intervening bitmap blocks
555 * with a long enough extent, or the
556 * allocation didn't work for some reason
557				 * (i.e. it's a little too short).
558 * Try to allocate from the summary block
559 * that we found.
560 */
561 error = xfs_rtallocate_extent_block(mp, tp,
562 bbno + i, minlen, maxlen, len, &n, rbpp,
563 rsb, prod, &r);
564 if (error) {
565 return error;
566 }
567 /*
568 * If it works, return the extent.
569 */
570 if (r != NULLRTBLOCK) {
571 *rtblock = r;
572 return 0;
573 }
574 }
575 }
576 /*
577	 * Loop control. If we were on the positive side, and there are
578	 * still more blocks on the negative side, go there.
579 */
580 if (i > 0 && (int)bbno - i >= 0)
581 i = -i;
582 /*
583 * If positive, and no more negative, but there are more
584 * positive, go there.
585 */
586 else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
587 i++;
588 /*
589 * If negative or 0 (just started), and there are positive
590 * blocks to go, go there. The 0 case moves to block 1.
591 */
592 else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
593 i = 1 - i;
594 /*
595 * If negative or 0 and there are more negative blocks,
596 * go there.
597 */
598 else if (i <= 0 && (int)bbno + i > 0)
599 i--;
600 /*
601 * Must be done. Return failure.
602 */
603 else
604 break;
605 }
606 *rtblock = NULLRTBLOCK;
607 return 0;
608}
609
610/*
611 * Allocate an extent of length minlen<=len<=maxlen, with no position
612 * specified. If we don't get maxlen then use prod to trim
613 * the length, if given. The lengths are all in rtextents.
614 */
615STATIC int /* error */
616xfs_rtallocate_extent_size(
617 xfs_mount_t *mp, /* file system mount point */
618 xfs_trans_t *tp, /* transaction pointer */
619 xfs_extlen_t minlen, /* minimum length to allocate */
620 xfs_extlen_t maxlen, /* maximum length to allocate */
621 xfs_extlen_t *len, /* out: actual length allocated */
622 xfs_buf_t **rbpp, /* in/out: summary block buffer */
623 xfs_fsblock_t *rsb, /* in/out: summary block number */
624 xfs_extlen_t prod, /* extent product factor */
625 xfs_rtblock_t *rtblock) /* out: start block allocated */
626{
627 int error; /* error value */
628 int i; /* bitmap block number */
629 int l; /* level number (loop control) */
630 xfs_rtblock_t n; /* next block to be tried */
631 xfs_rtblock_t r; /* result block number */
632 xfs_suminfo_t sum; /* summary information for extents */
633
634 ASSERT(minlen % prod == 0 && maxlen % prod == 0);
635 /*
636 * Loop over all the levels starting with maxlen.
637 * At each level, look at all the bitmap blocks, to see if there
638 * are extents starting there that are long enough (>= maxlen).
639	 * Note that only at the initial level can the allocation fail even
640	 * though the summary says there's an extent.
641 */
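	/*
	 * Each summary level l covers free extents whose length is in
	 * [2^l, 2^(l+1) - 1] rtextents (see the XFS_RTMAX/XFS_RTMIN
	 * clamping in the second pass below). The starting level,
	 * xfs_highbit32(maxlen), can therefore also contain extents
	 * shorter than maxlen; every higher level holds only extents
	 * longer than maxlen, so only the first level's attempts can
	 * fail this way.
	 */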
642 for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
643 /*
644 * Loop over all the bitmap blocks.
645 */
646 for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
647 /*
648 * Get the summary for this level/block.
649 */
650 error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
651 &sum);
652 if (error) {
653 return error;
654 }
655 /*
656 * Nothing there, on to the next block.
657 */
658 if (!sum)
659 continue;
660 /*
661 * Try allocating the extent.
662 */
663 error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
664 maxlen, len, &n, rbpp, rsb, prod, &r);
665 if (error) {
666 return error;
667 }
668 /*
669 * If it worked, return that.
670 */
671 if (r != NULLRTBLOCK) {
672 *rtblock = r;
673 return 0;
674 }
675 /*
676 * If the "next block to try" returned from the
677 * allocator is beyond the next bitmap block,
678 * skip to that bitmap block.
679 */
680 if (XFS_BITTOBLOCK(mp, n) > i + 1)
681 i = XFS_BITTOBLOCK(mp, n) - 1;
682 }
683 }
684 /*
685 * Didn't find any maxlen blocks. Try smaller ones, unless
686 * we're asking for a fixed size extent.
687 */
688 if (minlen > --maxlen) {
689 *rtblock = NULLRTBLOCK;
690 return 0;
691 }
692 /*
693 * Loop over sizes, from maxlen down to minlen.
694 * This time, when we do the allocations, allow smaller ones
695 * to succeed.
696 */
697 for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
698 /*
699 * Loop over all the bitmap blocks, try an allocation
700 * starting in that block.
701 */
702 for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
703 /*
704 * Get the summary information for this level/block.
705 */
706 error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
707 &sum);
708 if (error) {
709 return error;
710 }
711 /*
712 * If nothing there, go on to next.
713 */
714 if (!sum)
715 continue;
716 /*
717 * Try the allocation. Make sure the specified
718 * minlen/maxlen are in the possible range for
719 * this summary level.
720 */
721 error = xfs_rtallocate_extent_block(mp, tp, i,
722 XFS_RTMAX(minlen, 1 << l),
723 XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
724 len, &n, rbpp, rsb, prod, &r);
725 if (error) {
726 return error;
727 }
728 /*
729 * If it worked, return that extent.
730 */
731 if (r != NULLRTBLOCK) {
732 *rtblock = r;
733 return 0;
734 }
735 /*
736 * If the "next block to try" returned from the
737 * allocator is beyond the next bitmap block,
738 * skip to that bitmap block.
739 */
740 if (XFS_BITTOBLOCK(mp, n) > i + 1)
741 i = XFS_BITTOBLOCK(mp, n) - 1;
742 }
743 }
744 /*
745 * Got nothing, return failure.
746 */
747 *rtblock = NULLRTBLOCK;
748 return 0;
749}
750
751/*
752 * Mark an extent specified by start and len allocated.
753 * Updates all the summary information as well as the bitmap.
754 */
755STATIC int /* error */
756xfs_rtallocate_range(
757 xfs_mount_t *mp, /* file system mount point */
758 xfs_trans_t *tp, /* transaction pointer */
759 xfs_rtblock_t start, /* start block to allocate */
760 xfs_extlen_t len, /* length to allocate */
761 xfs_buf_t **rbpp, /* in/out: summary block buffer */
762 xfs_fsblock_t *rsb) /* in/out: summary block number */
763{
764 xfs_rtblock_t end; /* end of the allocated extent */
765 int error; /* error value */
766	xfs_rtblock_t	postblock;	/* last block of the old free extent */
767	xfs_rtblock_t	preblock;	/* first block of the old free extent */
768
769 end = start + len - 1;
770 /*
771 * Assume we're allocating out of the middle of a free extent.
772 * We need to find the beginning and end of the extent so we can
773 * properly update the summary.
774 */
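	/*
	 * For example (hypothetical numbers): allocating [105..110] out
	 * of a free extent [100..120] removes the summary entry for the
	 * 21-block extent and re-adds entries for the two remainders,
	 * [100..104] and [111..120], before marking [105..110]
	 * allocated in the bitmap.
	 */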
775 error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
776 if (error) {
777 return error;
778 }
779 /*
780 * Find the next allocated block (end of free extent).
781 */
782 error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
783 &postblock);
784 if (error) {
785 return error;
786 }
787 /*
788 * Decrement the summary information corresponding to the entire
789 * (old) free extent.
790 */
791 error = xfs_rtmodify_summary(mp, tp,
792 XFS_RTBLOCKLOG(postblock + 1 - preblock),
793 XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
794 if (error) {
795 return error;
796 }
797 /*
798 * If there are blocks not being allocated at the front of the
799 * old extent, add summary data for them to be free.
800 */
801 if (preblock < start) {
802 error = xfs_rtmodify_summary(mp, tp,
803 XFS_RTBLOCKLOG(start - preblock),
804 XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
805 if (error) {
806 return error;
807 }
808 }
809 /*
810 * If there are blocks not being allocated at the end of the
811 * old extent, add summary data for them to be free.
812 */
813 if (postblock > end) {
814 error = xfs_rtmodify_summary(mp, tp,
815 XFS_RTBLOCKLOG(postblock - end),
816 XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
817 if (error) {
818 return error;
819 }
820 }
821 /*
822 * Modify the bitmap to mark this extent allocated.
823 */
824 error = xfs_rtmodify_range(mp, tp, start, len, 0);
825 return error;
826}
827
828/*
829 * Return whether there are any free extents in the size range given
830 * by low and high, for the bitmap block bbno.
831 */
832STATIC int /* error */
833xfs_rtany_summary(
834 xfs_mount_t *mp, /* file system mount structure */
835 xfs_trans_t *tp, /* transaction pointer */
836 int low, /* low log2 extent size */
837 int high, /* high log2 extent size */
838 xfs_rtblock_t bbno, /* bitmap block number */
839 xfs_buf_t **rbpp, /* in/out: summary block buffer */
840 xfs_fsblock_t *rsb, /* in/out: summary block number */
841 int *stat) /* out: any good extents here? */
842{
843 int error; /* error value */
844 int log; /* loop counter, log2 of ext. size */
845 xfs_suminfo_t sum; /* summary data */
846
847 /*
848 * Loop over logs of extent sizes. Order is irrelevant.
849 */
850 for (log = low; log <= high; log++) {
851 /*
852 * Get one summary datum.
853 */
854 error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
855 if (error) {
856 return error;
857 }
858 /*
859 * If there are any, return success.
860 */
861 if (sum) {
862 *stat = 1;
863 return 0;
864 }
865 }
866 /*
867 * Found nothing, return failure.
868 */
869 *stat = 0;
870 return 0;
871}
872
873/*
874 * Get a buffer for the bitmap or summary file block specified.
875 * The buffer is returned read and locked.
876 */
877STATIC int /* error */
878xfs_rtbuf_get(
879 xfs_mount_t *mp, /* file system mount structure */
880 xfs_trans_t *tp, /* transaction pointer */
881 xfs_rtblock_t block, /* block number in bitmap or summary */
882 int issum, /* is summary not bitmap */
883 xfs_buf_t **bpp) /* output: buffer for the block */
884{
885 xfs_buf_t *bp; /* block buffer, result */
886 xfs_daddr_t d; /* disk addr of block */
887 int error; /* error value */
888 xfs_fsblock_t fsb; /* fs block number for block */
889 xfs_inode_t *ip; /* bitmap or summary inode */
890
891 ip = issum ? mp->m_rsumip : mp->m_rbmip;
892 /*
893 * Map from the file offset (block) and inode number to the
894 * file system block.
895 */
896 error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
897 if (error) {
898 return error;
899 }
900 ASSERT(fsb != NULLFSBLOCK);
901 /*
902 * Convert to disk address for buffer cache.
903 */
904 d = XFS_FSB_TO_DADDR(mp, fsb);
905 /*
906 * Read the buffer.
907 */
908 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
909 mp->m_bsize, 0, &bp);
910 if (error) {
911 return error;
912 }
913 ASSERT(bp && !XFS_BUF_GETERROR(bp));
914 *bpp = bp;
915 return 0;
916}
917
918#ifdef DEBUG
919/*
920 * Check that the given extent (block range) is allocated already.
921 */
922STATIC int /* error */
923xfs_rtcheck_alloc_range(
924 xfs_mount_t *mp, /* file system mount point */
925 xfs_trans_t *tp, /* transaction pointer */
926 xfs_rtblock_t bno, /* starting block number of extent */
927 xfs_extlen_t len, /* length of extent */
928 int *stat) /* out: 1 for allocated, 0 for not */
929{
930 xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
931
932 return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
933}
934#endif
935
936#ifdef DEBUG
937/*
938 * Check whether the given block in the bitmap has the given value.
939 */
940STATIC int /* 1 for matches, 0 for not */
941xfs_rtcheck_bit(
942 xfs_mount_t *mp, /* file system mount structure */
943 xfs_trans_t *tp, /* transaction pointer */
944 xfs_rtblock_t start, /* bit (block) to check */
945 int val) /* 1 for free, 0 for allocated */
946{
947 int bit; /* bit number in the word */
948 xfs_rtblock_t block; /* bitmap block number */
949 xfs_buf_t *bp; /* buf for the block */
950 xfs_rtword_t *bufp; /* pointer into the buffer */
951 /* REFERENCED */
952 int error; /* error value */
953 xfs_rtword_t wdiff; /* difference between bit & expected */
954 int word; /* word number in the buffer */
955 xfs_rtword_t wval; /* word value from buffer */
956
957 block = XFS_BITTOBLOCK(mp, start);
958 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
959 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
960 word = XFS_BITTOWORD(mp, start);
961 bit = (int)(start & (XFS_NBWORD - 1));
962 wval = bufp[word];
963 xfs_trans_brelse(tp, bp);
964 wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit);
965 return !wdiff;
966}
967#endif /* DEBUG */
968
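The wdiff test above relies on a small trick worth spelling out: negating an int val of 0 or 1 yields the bit patterns all-zeroes or all-ones, so a single XOR plus mask compares one bitmap bit against its expected state. A stand-alone sketch (uint32_t standing in for xfs_rtword_t; values are illustrative):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Model of the wdiff test in xfs_rtcheck_bit: -val turns 0/1
	 * into all-zeroes/all-ones, so XOR-and-mask tells us whether
	 * the bit of interest has the expected state.
	 */
	static int check_bit(uint32_t wval, int bit, int val)
	{
		uint32_t wdiff = (wval ^ (uint32_t)-val) &
				 ((uint32_t)1 << bit);

		return !wdiff;	/* 1 if the bit has the expected value */
	}

	int main(void)
	{
		uint32_t w = 0x0000000a;	/* bits 1 and 3 set (free) */

		assert(check_bit(w, 1, 1));	/* bit 1 is free */
		assert(check_bit(w, 0, 0));	/* bit 0 is allocated */
		assert(!check_bit(w, 3, 0));	/* bit 3 is not allocated */
		printf("all checks passed\n");
		return 0;
	}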
969#if 0
970/*
971 * Check that the given extent (block range) is free already.
972 */
973STATIC int /* error */
974xfs_rtcheck_free_range(
975 xfs_mount_t *mp, /* file system mount point */
976 xfs_trans_t *tp, /* transaction pointer */
977 xfs_rtblock_t bno, /* starting block number of extent */
978 xfs_extlen_t len, /* length of extent */
979 int *stat) /* out: 1 for free, 0 for not */
980{
981 xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
982
983 return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat);
984}
985#endif
986
987/*
988 * Check that the given range is either all allocated (val = 0) or
989 * all free (val = 1).
990 */
991STATIC int /* error */
992xfs_rtcheck_range(
993 xfs_mount_t *mp, /* file system mount point */
994 xfs_trans_t *tp, /* transaction pointer */
995 xfs_rtblock_t start, /* starting block number of extent */
996 xfs_extlen_t len, /* length of extent */
997 int val, /* 1 for free, 0 for allocated */
998 xfs_rtblock_t *new, /* out: first block not matching */
999 int *stat) /* out: 1 for matches, 0 for not */
1000{
1001 xfs_rtword_t *b; /* current word in buffer */
1002 int bit; /* bit number in the word */
1003 xfs_rtblock_t block; /* bitmap block number */
1004 xfs_buf_t *bp; /* buf for the block */
1005 xfs_rtword_t *bufp; /* starting word in buffer */
1006 int error; /* error value */
1007 xfs_rtblock_t i; /* current bit number rel. to start */
1008 xfs_rtblock_t lastbit; /* last useful bit in word */
1009 xfs_rtword_t mask; /* mask of relevant bits for value */
1010 xfs_rtword_t wdiff; /* difference from wanted value */
1011 int word; /* word number in the buffer */
1012
1013 /*
1014 * Compute starting bitmap block number
1015 */
1016 block = XFS_BITTOBLOCK(mp, start);
1017 /*
1018 * Read the bitmap block.
1019 */
1020 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1021 if (error) {
1022 return error;
1023 }
1024 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1025 /*
1026 * Compute the starting word's address, and starting bit.
1027 */
1028 word = XFS_BITTOWORD(mp, start);
1029 b = &bufp[word];
1030 bit = (int)(start & (XFS_NBWORD - 1));
1031 /*
1032 * 0 (allocated) => all zero's; 1 (free) => all one's.
1033 */
1034 val = -val;
1035 /*
1036 * If not starting on a word boundary, deal with the first
1037 * (partial) word.
1038 */
1039 if (bit) {
1040 /*
1041 * Compute first bit not examined.
1042 */
1043 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1044 /*
1045 * Mask of relevant bits.
1046 */
1047 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1048 /*
1049 * Compute difference between actual and desired value.
1050 */
1051 if ((wdiff = (*b ^ val) & mask)) {
1052 /*
1053 * Different, compute first wrong bit and return.
1054 */
1055 xfs_trans_brelse(tp, bp);
1056 i = XFS_RTLOBIT(wdiff) - bit;
1057 *new = start + i;
1058 *stat = 0;
1059 return 0;
1060 }
1061 i = lastbit - bit;
1062 /*
1063 * Go on to next block if that's where the next word is
1064 * and we need the next word.
1065 */
1066 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1067 /*
1068 * If done with this block, get the next one.
1069 */
1070 xfs_trans_brelse(tp, bp);
1071 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1072 if (error) {
1073 return error;
1074 }
1075 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1076 word = 0;
1077 } else {
1078 /*
1079 * Go on to the next word in the buffer.
1080 */
1081 b++;
1082 }
1083 } else {
1084 /*
1085 * Starting on a word boundary, no partial word.
1086 */
1087 i = 0;
1088 }
1089 /*
1090 * Loop over whole words in buffers. When we use up one buffer
1091 * we move on to the next one.
1092 */
1093 while (len - i >= XFS_NBWORD) {
1094 /*
1095 * Compute difference between actual and desired value.
1096 */
1097 if ((wdiff = *b ^ val)) {
1098 /*
1099 * Different, compute first wrong bit and return.
1100 */
1101 xfs_trans_brelse(tp, bp);
1102 i += XFS_RTLOBIT(wdiff);
1103 *new = start + i;
1104 *stat = 0;
1105 return 0;
1106 }
1107 i += XFS_NBWORD;
1108 /*
1109 * Go on to next block if that's where the next word is
1110 * and we need the next word.
1111 */
1112 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1113 /*
1114 * If done with this block, get the next one.
1115 */
1116 xfs_trans_brelse(tp, bp);
1117 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1118 if (error) {
1119 return error;
1120 }
1121 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1122 word = 0;
1123 } else {
1124 /*
1125 * Go on to the next word in the buffer.
1126 */
1127 b++;
1128 }
1129 }
1130 /*
1131 * If not ending on a word boundary, deal with the last
1132 * (partial) word.
1133 */
1134 if ((lastbit = len - i)) {
1135 /*
1136 * Mask of relevant bits.
1137 */
1138 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1139 /*
1140 * Compute difference between actual and desired value.
1141 */
1142 if ((wdiff = (*b ^ val) & mask)) {
1143 /*
1144 * Different, compute first wrong bit and return.
1145 */
1146 xfs_trans_brelse(tp, bp);
1147 i += XFS_RTLOBIT(wdiff);
1148 *new = start + i;
1149 *stat = 0;
1150 return 0;
1151 } else
1152 i = len;
1153 }
1154 /*
1155 * Successful, return.
1156 */
1157 xfs_trans_brelse(tp, bp);
1158 *new = start + i;
1159 *stat = 1;
1160 return 0;
1161}
1162
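The same partial-word / whole-word / partial-word structure recurs in every range routine in this file, so a compact user-space model of the scan may help. This is an illustrative sketch under simplifying assumptions: the bitmap is a flat uint32_t array rather than a chain of buffer-cache blocks, and __builtin_ctz stands in for XFS_RTLOBIT.

	#include <stdint.h>
	#include <stdio.h>

	#define NBWORD 32

	/*
	 * Model of the scan in xfs_rtcheck_range: verify that bits
	 * [start, start+len) all equal val, handling a partial leading
	 * word, whole words, and a partial trailing word.  Returns 1
	 * on a full match, else 0 with *bad set to the first
	 * offending bit.
	 */
	static int check_range(const uint32_t *bmp, uint64_t start,
			uint64_t len, int val, uint64_t *bad)
	{
		uint32_t want = (uint32_t)-val;	/* all ones or all zeroes */
		uint64_t i = 0;
		int bit = start & (NBWORD - 1);
		const uint32_t *b = &bmp[start / NBWORD];

		if (bit) {			/* partial first word */
			int lastbit = bit + len < NBWORD ?
				      bit + (int)len : NBWORD;
			uint32_t mask = ((1u << (lastbit - bit)) - 1) << bit;
			uint32_t wdiff = (*b ^ want) & mask;

			if (wdiff) {
				*bad = start + __builtin_ctz(wdiff) - bit;
				return 0;
			}
			i = lastbit - bit;
			b++;
		}
		while (len - i >= NBWORD) {	/* whole words */
			uint32_t wdiff = *b ^ want;

			if (wdiff) {
				*bad = start + i + __builtin_ctz(wdiff);
				return 0;
			}
			i += NBWORD;
			b++;
		}
		if (len - i) {			/* partial last word */
			uint32_t mask = (1u << (len - i)) - 1;
			uint32_t wdiff = (*b ^ want) & mask;

			if (wdiff) {
				*bad = start + i + __builtin_ctz(wdiff);
				return 0;
			}
		}
		return 1;
	}

	int main(void)
	{
		uint32_t bmp[2] = { 0xffffffff, 0x00000007 }; /* bits 0..34 free */
		uint64_t bad = 0;
		int ok;

		ok = check_range(bmp, 0, 35, 1, &bad);
		printf("bits 0..34 all free: %d\n", ok);
		ok = check_range(bmp, 0, 37, 1, &bad);
		printf("bits 0..36 all free: %d, first bad bit %llu\n",
		       ok, (unsigned long long)bad);
		return 0;
	}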
1163/*
1164 * Copy and transform the summary file, given the old and new
1165 * parameters in the mount structures.
1166 */
1167STATIC int /* error */
1168xfs_rtcopy_summary(
1169 xfs_mount_t *omp, /* old file system mount point */
1170 xfs_mount_t *nmp, /* new file system mount point */
1171 xfs_trans_t *tp) /* transaction pointer */
1172{
1173 xfs_rtblock_t bbno; /* bitmap block number */
1174 xfs_buf_t *bp; /* summary buffer */
1175 int error; /* error return value */
1176 int log; /* summary level number (log length) */
1177 xfs_suminfo_t sum; /* summary data */
1178 xfs_fsblock_t sumbno; /* summary block number */
1179
1180 bp = NULL;
1181 for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
1182 for (bbno = omp->m_sb.sb_rbmblocks - 1;
1183 (xfs_srtblock_t)bbno >= 0;
1184 bbno--) {
1185 error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
1186 &sumbno, &sum);
1187 if (error)
1188 return error;
1189 if (sum == 0)
1190 continue;
1191 error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
1192 &bp, &sumbno);
1193 if (error)
1194 return error;
1195 error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
1196 &bp, &sumbno);
1197 if (error)
1198 return error;
1199 ASSERT(sum > 0);
1200 }
1201 }
1202 return 0;
1203}
1204
1205/*
1206 * Searching backward from start to limit, find the first block whose
1207 * allocated/free state is different from start's.
1208 */
1209STATIC int /* error */
1210xfs_rtfind_back(
1211 xfs_mount_t *mp, /* file system mount point */
1212 xfs_trans_t *tp, /* transaction pointer */
1213 xfs_rtblock_t start, /* starting block to look at */
1214 xfs_rtblock_t limit, /* last block to look at */
1215 xfs_rtblock_t *rtblock) /* out: start block found */
1216{
1217 xfs_rtword_t *b; /* current word in buffer */
1218 int bit; /* bit number in the word */
1219 xfs_rtblock_t block; /* bitmap block number */
1220 xfs_buf_t *bp; /* buf for the block */
1221 xfs_rtword_t *bufp; /* starting word in buffer */
1222 int error; /* error value */
1223 xfs_rtblock_t firstbit; /* first useful bit in the word */
1224 xfs_rtblock_t i; /* current bit number rel. to start */
1225 xfs_rtblock_t len; /* length of inspected area */
1226 xfs_rtword_t mask; /* mask of relevant bits for value */
1227 xfs_rtword_t want; /* mask for "good" values */
1228 xfs_rtword_t wdiff; /* difference from wanted value */
1229 int word; /* word number in the buffer */
1230
1231 /*
1232 * Compute and read in starting bitmap block for starting block.
1233 */
1234 block = XFS_BITTOBLOCK(mp, start);
1235 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1236 if (error) {
1237 return error;
1238 }
1239 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1240 /*
1241 * Get the first word's index & point to it.
1242 */
1243 word = XFS_BITTOWORD(mp, start);
1244 b = &bufp[word];
1245 bit = (int)(start & (XFS_NBWORD - 1));
1246 len = start - limit + 1;
1247 /*
1248 * Compute match value, based on the bit at start: if 1 (free)
1249 * then all-ones, else all-zeroes.
1250 */
1251 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
1252 /*
1253 * If the starting position is not word-aligned, deal with the
1254 * partial word.
1255 */
1256 if (bit < XFS_NBWORD - 1) {
1257 /*
1258 * Calculate first (leftmost) bit number to look at,
1259 * and mask for all the relevant bits in this word.
1260 */
1261 firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
1262 mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
1263 firstbit;
1264 /*
1265 * Calculate the difference between the value there
1266 * and what we're looking for.
1267 */
1268 if ((wdiff = (*b ^ want) & mask)) {
1269 /*
1270 * Different. Mark where we are and return.
1271 */
1272 xfs_trans_brelse(tp, bp);
1273 i = bit - XFS_RTHIBIT(wdiff);
1274 *rtblock = start - i + 1;
1275 return 0;
1276 }
1277 i = bit - firstbit + 1;
1278 /*
1279 * Go on to previous block if that's where the previous word is
1280 * and we need the previous word.
1281 */
1282 if (--word == -1 && i < len) {
1283 /*
1284 * If done with this block, get the previous one.
1285 */
1286 xfs_trans_brelse(tp, bp);
1287 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
1288 if (error) {
1289 return error;
1290 }
1291 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1292 word = XFS_BLOCKWMASK(mp);
1293 b = &bufp[word];
1294 } else {
1295 /*
1296 * Go on to the previous word in the buffer.
1297 */
1298 b--;
1299 }
1300 } else {
1301 /*
1302 * Starting on a word boundary, no partial word.
1303 */
1304 i = 0;
1305 }
1306 /*
1307 * Loop over whole words in buffers. When we use up one buffer
1308 * we move on to the previous one.
1309 */
1310 while (len - i >= XFS_NBWORD) {
1311 /*
1312 * Compute difference between actual and desired value.
1313 */
1314 if ((wdiff = *b ^ want)) {
1315 /*
1316 * Different, mark where we are and return.
1317 */
1318 xfs_trans_brelse(tp, bp);
1319 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
1320 *rtblock = start - i + 1;
1321 return 0;
1322 }
1323 i += XFS_NBWORD;
1324 /*
1325 * Go on to previous block if that's where the previous word is
1326 * and we need the previous word.
1327 */
1328 if (--word == -1 && i < len) {
1329 /*
1330 * If done with this block, get the previous one.
1331 */
1332 xfs_trans_brelse(tp, bp);
1333 error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
1334 if (error) {
1335 return error;
1336 }
1337 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1338 word = XFS_BLOCKWMASK(mp);
1339 b = &bufp[word];
1340 } else {
1341 /*
1342 * Go on to the previous word in the buffer.
1343 */
1344 b--;
1345 }
1346 }
1347 /*
1348 * If not ending on a word boundary, deal with the last
1349 * (partial) word.
1350 */
1351 if (len - i) {
1352 /*
1353 * Calculate first (leftmost) bit number to look at,
1354 * and mask for all the relevant bits in this word.
1355 */
1356 firstbit = XFS_NBWORD - (len - i);
1357 mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
1358 /*
1359 * Compute difference between actual and desired value.
1360 */
1361 if ((wdiff = (*b ^ want) & mask)) {
1362 /*
1363 * Different, mark where we are and return.
1364 */
1365 xfs_trans_brelse(tp, bp);
1366 i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
1367 *rtblock = start - i + 1;
1368 return 0;
1369 } else
1370 i = len;
1371 }
1372 /*
1373 * No match, return that we scanned the whole area.
1374 */
1375 xfs_trans_brelse(tp, bp);
1376 *rtblock = start - i + 1;
1377 return 0;
1378}
1379
1380/*
1381 * Searching forward from start to limit, find the first block whose
1382 * allocated/free state is different from start's.
1383 */
1384STATIC int /* error */
1385xfs_rtfind_forw(
1386 xfs_mount_t *mp, /* file system mount point */
1387 xfs_trans_t *tp, /* transaction pointer */
1388 xfs_rtblock_t start, /* starting block to look at */
1389 xfs_rtblock_t limit, /* last block to look at */
1390 xfs_rtblock_t *rtblock) /* out: start block found */
1391{
1392 xfs_rtword_t *b; /* current word in buffer */
1393 int bit; /* bit number in the word */
1394 xfs_rtblock_t block; /* bitmap block number */
1395 xfs_buf_t *bp; /* buf for the block */
1396 xfs_rtword_t *bufp; /* starting word in buffer */
1397 int error; /* error value */
1398 xfs_rtblock_t i; /* current bit number rel. to start */
1399 xfs_rtblock_t lastbit; /* last useful bit in the word */
1400 xfs_rtblock_t len; /* length of inspected area */
1401 xfs_rtword_t mask; /* mask of relevant bits for value */
1402 xfs_rtword_t want; /* mask for "good" values */
1403 xfs_rtword_t wdiff; /* difference from wanted value */
1404 int word; /* word number in the buffer */
1405
1406 /*
1407 * Compute and read in starting bitmap block for starting block.
1408 */
1409 block = XFS_BITTOBLOCK(mp, start);
1410 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1411 if (error) {
1412 return error;
1413 }
1414 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1415 /*
1416 * Get the first word's index & point to it.
1417 */
1418 word = XFS_BITTOWORD(mp, start);
1419 b = &bufp[word];
1420 bit = (int)(start & (XFS_NBWORD - 1));
1421 len = limit - start + 1;
1422 /*
1423 * Compute match value, based on the bit at start: if 1 (free)
1424 * then all-ones, else all-zeroes.
1425 */
1426 want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
1427 /*
1428 * If the starting position is not word-aligned, deal with the
1429 * partial word.
1430 */
1431 if (bit) {
1432 /*
1433 * Calculate last (rightmost) bit number to look at,
1434 * and mask for all the relevant bits in this word.
1435 */
1436 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1437 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1438 /*
1439 * Calculate the difference between the value there
1440 * and what we're looking for.
1441 */
1442 if ((wdiff = (*b ^ want) & mask)) {
1443 /*
1444 * Different. Mark where we are and return.
1445 */
1446 xfs_trans_brelse(tp, bp);
1447 i = XFS_RTLOBIT(wdiff) - bit;
1448 *rtblock = start + i - 1;
1449 return 0;
1450 }
1451 i = lastbit - bit;
1452 /*
1453 * Go on to next block if that's where the next word is
1454 * and we need the next word.
1455 */
1456 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1457 /*
1458	 * If done with this block, get the next one.
1459 */
1460 xfs_trans_brelse(tp, bp);
1461 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1462 if (error) {
1463 return error;
1464 }
1465 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1466 word = 0;
1467 } else {
1468 /*
1469	 * Go on to the next word in the buffer.
1470 */
1471 b++;
1472 }
1473 } else {
1474 /*
1475 * Starting on a word boundary, no partial word.
1476 */
1477 i = 0;
1478 }
1479 /*
1480 * Loop over whole words in buffers. When we use up one buffer
1481 * we move on to the next one.
1482 */
1483 while (len - i >= XFS_NBWORD) {
1484 /*
1485 * Compute difference between actual and desired value.
1486 */
1487 if ((wdiff = *b ^ want)) {
1488 /*
1489 * Different, mark where we are and return.
1490 */
1491 xfs_trans_brelse(tp, bp);
1492 i += XFS_RTLOBIT(wdiff);
1493 *rtblock = start + i - 1;
1494 return 0;
1495 }
1496 i += XFS_NBWORD;
1497 /*
1498 * Go on to next block if that's where the next word is
1499 * and we need the next word.
1500 */
1501 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1502 /*
1503 * If done with this block, get the next one.
1504 */
1505 xfs_trans_brelse(tp, bp);
1506 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1507 if (error) {
1508 return error;
1509 }
1510 b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1511 word = 0;
1512 } else {
1513 /*
1514 * Go on to the next word in the buffer.
1515 */
1516 b++;
1517 }
1518 }
1519 /*
1520 * If not ending on a word boundary, deal with the last
1521 * (partial) word.
1522 */
1523 if ((lastbit = len - i)) {
1524 /*
1525 * Calculate mask for all the relevant bits in this word.
1526 */
1527 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1528 /*
1529 * Compute difference between actual and desired value.
1530 */
1531 if ((wdiff = (*b ^ want) & mask)) {
1532 /*
1533 * Different, mark where we are and return.
1534 */
1535 xfs_trans_brelse(tp, bp);
1536 i += XFS_RTLOBIT(wdiff);
1537 *rtblock = start + i - 1;
1538 return 0;
1539 } else
1540 i = len;
1541 }
1542 /*
1543 * No match, return that we scanned the whole area.
1544 */
1545 xfs_trans_brelse(tp, bp);
1546 *rtblock = start + i - 1;
1547 return 0;
1548}
1549
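Together, xfs_rtfind_back and xfs_rtfind_forw bracket the free (or allocated) extent containing a given block; the allocate and free paths use the resulting [preblock, postblock] pair to fix up the summary. A bit-at-a-time model of the pair (illustrative only; the kernel versions scan a word at a time with the masking shown above):

	#include <stdint.h>
	#include <stdio.h>

	static int get_bit(const uint32_t *bmp, uint64_t bit)
	{
		return (bmp[bit / 32] >> (bit % 32)) & 1;
	}

	/*
	 * From bit `start`, scan both ways for the first bit whose
	 * state differs, yielding the extent [pre, post] that
	 * contains start.
	 */
	static void find_extent(const uint32_t *bmp, uint64_t start,
			uint64_t limit, uint64_t *pre, uint64_t *post)
	{
		int state = get_bit(bmp, start);
		uint64_t b;

		for (b = start; b > 0 && get_bit(bmp, b - 1) == state; b--)
			;
		*pre = b;
		for (b = start; b < limit && get_bit(bmp, b + 1) == state; b++)
			;
		*post = b;
	}

	int main(void)
	{
		uint32_t bmp[1] = { 0x00000ff0 };	/* bits 4..11 free */
		uint64_t pre, post;

		find_extent(bmp, 7, 31, &pre, &post);
		printf("free extent around bit 7: [%llu, %llu]\n",
		       (unsigned long long)pre, (unsigned long long)post);
		return 0;
	}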
1550/*
1551 * Mark an extent specified by start and len freed.
1552 * Updates all the summary information as well as the bitmap.
1553 */
1554STATIC int /* error */
1555xfs_rtfree_range(
1556 xfs_mount_t *mp, /* file system mount point */
1557 xfs_trans_t *tp, /* transaction pointer */
1558 xfs_rtblock_t start, /* starting block to free */
1559 xfs_extlen_t len, /* length to free */
1560 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1561 xfs_fsblock_t *rsb) /* in/out: summary block number */
1562{
1563 xfs_rtblock_t end; /* end of the freed extent */
1564 int error; /* error value */
1565 xfs_rtblock_t postblock; /* first block freed > end */
1566 xfs_rtblock_t preblock; /* first block freed < start */
1567
1568 end = start + len - 1;
1569 /*
1570 * Modify the bitmap to mark this extent freed.
1571 */
1572 error = xfs_rtmodify_range(mp, tp, start, len, 1);
1573 if (error) {
1574 return error;
1575 }
1576 /*
1577 * Assume we're freeing out of the middle of an allocated extent.
1578 * We need to find the beginning and end of the extent so we can
1579 * properly update the summary.
1580 */
1581 error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
1582 if (error) {
1583 return error;
1584 }
1585 /*
1586 * Find the next allocated block (end of allocated extent).
1587 */
1588	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
1589		&postblock);
	if (error) {
		return error;
	}
1590 /*
1591 * If there are blocks not being freed at the front of the
1592 * old extent, add summary data for them to be allocated.
1593 */
1594 if (preblock < start) {
1595 error = xfs_rtmodify_summary(mp, tp,
1596 XFS_RTBLOCKLOG(start - preblock),
1597 XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
1598 if (error) {
1599 return error;
1600 }
1601 }
1602 /*
1603 * If there are blocks not being freed at the end of the
1604 * old extent, add summary data for them to be allocated.
1605 */
1606 if (postblock > end) {
1607 error = xfs_rtmodify_summary(mp, tp,
1608 XFS_RTBLOCKLOG(postblock - end),
1609 XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
1610 if (error) {
1611 return error;
1612 }
1613 }
1614 /*
1615 * Increment the summary information corresponding to the entire
1616 * (new) free extent.
1617 */
1618 error = xfs_rtmodify_summary(mp, tp,
1619 XFS_RTBLOCKLOG(postblock + 1 - preblock),
1620 XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
1621 return error;
1622}
1623
1624/*
1625 * Read and return the summary information for a given extent size,
1626 * bitmap block combination.
1627 * Keeps track of a current summary block, so we don't keep reading
1628 * it from the buffer cache.
1629 */
1630STATIC int /* error */
1631xfs_rtget_summary(
1632 xfs_mount_t *mp, /* file system mount structure */
1633 xfs_trans_t *tp, /* transaction pointer */
1634 int log, /* log2 of extent size */
1635 xfs_rtblock_t bbno, /* bitmap block number */
1636 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1637 xfs_fsblock_t *rsb, /* in/out: summary block number */
1638 xfs_suminfo_t *sum) /* out: summary info for this block */
1639{
1640 xfs_buf_t *bp; /* buffer for summary block */
1641 int error; /* error value */
1642 xfs_fsblock_t sb; /* summary fsblock */
1643 int so; /* index into the summary file */
1644 xfs_suminfo_t *sp; /* pointer to returned data */
1645
1646 /*
1647 * Compute entry number in the summary file.
1648 */
1649 so = XFS_SUMOFFS(mp, log, bbno);
1650 /*
1651 * Compute the block number in the summary file.
1652 */
1653 sb = XFS_SUMOFFSTOBLOCK(mp, so);
1654 /*
1655 * If we have an old buffer, and the block number matches, use that.
1656 */
1657 if (rbpp && *rbpp && *rsb == sb)
1658 bp = *rbpp;
1659 /*
1660 * Otherwise we have to get the buffer.
1661 */
1662 else {
1663 /*
1664 * If there was an old one, get rid of it first.
1665 */
1666 if (rbpp && *rbpp)
1667 xfs_trans_brelse(tp, *rbpp);
1668 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
1669 if (error) {
1670 return error;
1671 }
1672 /*
1673 * Remember this buffer and block for the next call.
1674 */
1675 if (rbpp) {
1676 *rbpp = bp;
1677 *rsb = sb;
1678 }
1679 }
1680 /*
1681 * Point to the summary information & copy it out.
1682 */
1683 sp = XFS_SUMPTR(mp, bp, so);
1684 *sum = *sp;
1685 /*
1686 * Drop the buffer if we're not asked to remember it.
1687 */
1688 if (!rbpp)
1689 xfs_trans_brelse(tp, bp);
1690 return 0;
1691}
1692
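The summary-file addressing deserves a note: the summary is a two-dimensional array indexed by (log2 extent size, bitmap block), linearized by XFS_SUMOFFS and then mapped to a block and an offset within it. Because consecutive calls usually land in the same block, the rbpp/rsb pair caches the last buffer. A sketch of the index arithmetic with assumed geometry (4KB blocks, 32-bit xfs_suminfo_t):

	#include <stdint.h>
	#include <stdio.h>

	#define SUMINFO_SIZE	4u	/* sizeof(xfs_suminfo_t) */

	struct geom {
		unsigned int rbmblocks;	/* sb_rbmblocks */
		unsigned int blocklog;	/* sb_blocklog, log2(blocksize) */
	};

	/* XFS_SUMOFFS: linear entry number for (log, bbno) */
	static unsigned int sumoffs(const struct geom *g, int log,
			unsigned int bbno)
	{
		return log * g->rbmblocks + bbno;
	}

	/* XFS_SUMOFFSTOBLOCK: which summary-file block holds entry so */
	static unsigned int sumoffstoblock(const struct geom *g,
			unsigned int so)
	{
		return (so * SUMINFO_SIZE) >> g->blocklog;
	}

	int main(void)
	{
		struct geom g = { .rbmblocks = 100, .blocklog = 12 };
		unsigned int so = sumoffs(&g, 5, 42);
		unsigned int sb = sumoffstoblock(&g, so);

		/* entry 542 lives in block 0 at byte offset 2168 */
		printf("so=%u block=%u offset-in-block=%u\n", so, sb,
		       (so * SUMINFO_SIZE) & ((1u << g.blocklog) - 1));
		return 0;
	}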
1693/*
1694 * Set the given range of bitmap bits to the given value.
1695 * Do whatever I/O and logging is required.
1696 */
1697STATIC int /* error */
1698xfs_rtmodify_range(
1699 xfs_mount_t *mp, /* file system mount point */
1700 xfs_trans_t *tp, /* transaction pointer */
1701 xfs_rtblock_t start, /* starting block to modify */
1702 xfs_extlen_t len, /* length of extent to modify */
1703 int val) /* 1 for free, 0 for allocated */
1704{
1705 xfs_rtword_t *b; /* current word in buffer */
1706 int bit; /* bit number in the word */
1707 xfs_rtblock_t block; /* bitmap block number */
1708 xfs_buf_t *bp; /* buf for the block */
1709 xfs_rtword_t *bufp; /* starting word in buffer */
1710 int error; /* error value */
1711 xfs_rtword_t *first; /* first used word in the buffer */
1712 int i; /* current bit number rel. to start */
1713 int lastbit; /* last useful bit in word */
1714	xfs_rtword_t	mask;		/* mask of relevant bits for value */
1715 int word; /* word number in the buffer */
1716
1717 /*
1718 * Compute starting bitmap block number.
1719 */
1720 block = XFS_BITTOBLOCK(mp, start);
1721 /*
1722 * Read the bitmap block, and point to its data.
1723 */
1724 error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
1725 if (error) {
1726 return error;
1727 }
1728 bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1729 /*
1730 * Compute the starting word's address, and starting bit.
1731 */
1732 word = XFS_BITTOWORD(mp, start);
1733 first = b = &bufp[word];
1734 bit = (int)(start & (XFS_NBWORD - 1));
1735 /*
1736 * 0 (allocated) => all zeroes; 1 (free) => all ones.
1737 */
1738 val = -val;
1739 /*
1740 * If not starting on a word boundary, deal with the first
1741 * (partial) word.
1742 */
1743 if (bit) {
1744 /*
1745 * Compute first bit not changed and mask of relevant bits.
1746 */
1747 lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
1748 mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
1749 /*
1750 * Set/clear the active bits.
1751 */
1752 if (val)
1753 *b |= mask;
1754 else
1755 *b &= ~mask;
1756 i = lastbit - bit;
1757 /*
1758 * Go on to the next block if that's where the next word is
1759 * and we need the next word.
1760 */
1761 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1762 /*
1763 * Log the changed part of this block.
1764 * Get the next one.
1765 */
1766 xfs_trans_log_buf(tp, bp,
1767 (uint)((char *)first - (char *)bufp),
1768 (uint)((char *)b - (char *)bufp));
1769 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1770 if (error) {
1771 return error;
1772 }
1773 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1774 word = 0;
1775 } else {
1776 /*
1777 * Go on to the next word in the buffer
1778 */
1779 b++;
1780 }
1781 } else {
1782 /*
1783 * Starting on a word boundary, no partial word.
1784 */
1785 i = 0;
1786 }
1787 /*
1788 * Loop over whole words in buffers. When we use up one buffer
1789 * we move on to the next one.
1790 */
1791 while (len - i >= XFS_NBWORD) {
1792 /*
1793 * Set the word value correctly.
1794 */
1795 *b = val;
1796 i += XFS_NBWORD;
1797 /*
1798 * Go on to the next block if that's where the next word is
1799 * and we need the next word.
1800 */
1801 if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
1802 /*
1803 * Log the changed part of this block.
1804 * Get the next one.
1805 */
1806 xfs_trans_log_buf(tp, bp,
1807 (uint)((char *)first - (char *)bufp),
1808 (uint)((char *)b - (char *)bufp));
1809 error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
1810 if (error) {
1811 return error;
1812 }
1813 first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
1814 word = 0;
1815 } else {
1816 /*
1817 * Go on to the next word in the buffer
1818 */
1819 b++;
1820 }
1821 }
1822 /*
1823 * If not ending on a word boundary, deal with the last
1824 * (partial) word.
1825 */
1826 if ((lastbit = len - i)) {
1827 /*
1828 * Compute a mask of relevant bits.
1829 */
1830 bit = 0;
1831 mask = ((xfs_rtword_t)1 << lastbit) - 1;
1832 /*
1833 * Set/clear the active bits.
1834 */
1835 if (val)
1836 *b |= mask;
1837 else
1838 *b &= ~mask;
1839 b++;
1840 }
1841 /*
1842 * Log any remaining changed bytes.
1843 */
1844 if (b > first)
1845 xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
1846 (uint)((char *)b - (char *)bufp - 1));
1847 return 0;
1848}
1849
1850/*
1851 * Read and modify the summary information for a given extent size,
1852 * bitmap block combination.
1853 * Keeps track of a current summary block, so we don't keep reading
1854 * it from the buffer cache.
1855 */
1856STATIC int /* error */
1857xfs_rtmodify_summary(
1858 xfs_mount_t *mp, /* file system mount point */
1859 xfs_trans_t *tp, /* transaction pointer */
1860 int log, /* log2 of extent size */
1861 xfs_rtblock_t bbno, /* bitmap block number */
1862 int delta, /* change to make to summary info */
1863 xfs_buf_t **rbpp, /* in/out: summary block buffer */
1864 xfs_fsblock_t *rsb) /* in/out: summary block number */
1865{
1866 xfs_buf_t *bp; /* buffer for the summary block */
1867 int error; /* error value */
1868 xfs_fsblock_t sb; /* summary fsblock */
1869 int so; /* index into the summary file */
1870 xfs_suminfo_t *sp; /* pointer to returned data */
1871
1872 /*
1873 * Compute entry number in the summary file.
1874 */
1875 so = XFS_SUMOFFS(mp, log, bbno);
1876 /*
1877 * Compute the block number in the summary file.
1878 */
1879 sb = XFS_SUMOFFSTOBLOCK(mp, so);
1880 /*
1881 * If we have an old buffer, and the block number matches, use that.
1882 */
1883 if (rbpp && *rbpp && *rsb == sb)
1884 bp = *rbpp;
1885 /*
1886 * Otherwise we have to get the buffer.
1887 */
1888 else {
1889 /*
1890 * If there was an old one, get rid of it first.
1891 */
1892 if (rbpp && *rbpp)
1893 xfs_trans_brelse(tp, *rbpp);
1894 error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
1895 if (error) {
1896 return error;
1897 }
1898 /*
1899 * Remember this buffer and block for the next call.
1900 */
1901 if (rbpp) {
1902 *rbpp = bp;
1903 *rsb = sb;
1904 }
1905 }
1906 /*
1907 * Point to the summary information, modify and log it.
1908 */
1909 sp = XFS_SUMPTR(mp, bp, so);
1910 *sp += delta;
1911 xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
1912 (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
1913 return 0;
1914}
1915
1916/*
1917 * Visible (exported) functions.
1918 */
1919
1920/*
1921 * Grow the realtime area of the filesystem.
1922 */
1923int
1924xfs_growfs_rt(
1925 xfs_mount_t *mp, /* mount point for filesystem */
1926 xfs_growfs_rt_t *in) /* growfs rt input struct */
1927{
1928 xfs_rtblock_t bmbno; /* bitmap block number */
1929 xfs_buf_t *bp; /* temporary buffer */
1930 int cancelflags; /* flags for xfs_trans_cancel */
1931 int error; /* error return value */
1932 xfs_inode_t *ip; /* bitmap inode, used as lock */
1933 xfs_mount_t *nmp; /* new (fake) mount structure */
1934 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1935 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
1936 xfs_drtbno_t nrextents; /* new number of realtime extents */
1937 uint8_t nrextslog; /* new log2 of sb_rextents */
1938 xfs_extlen_t nrsumblocks; /* new number of summary blocks */
1939 uint nrsumlevels; /* new rt summary levels */
1940 uint nrsumsize; /* new size of rt summary, bytes */
1941 xfs_sb_t *nsbp; /* new superblock */
1942 xfs_extlen_t rbmblocks; /* current number of rt bitmap blocks */
1943 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1944 xfs_sb_t *sbp; /* old superblock */
1945 xfs_fsblock_t sumbno; /* summary block number */
1946 xfs_trans_t *tp; /* transaction pointer */
1947
1948 sbp = &mp->m_sb;
1949 /*
1950 * Initial error checking.
1951 */
1952	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1953 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1954 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
1955 return XFS_ERROR(EINVAL);
1956 /*
1957 * Read in the last block of the device, make sure it exists.
1958 */
1959 error = xfs_read_buf(mp, mp->m_rtdev_targp,
1960 XFS_FSB_TO_BB(mp, in->newblocks - 1),
1961 XFS_FSB_TO_BB(mp, 1), 0, &bp);
1962 if (error)
1963 return error;
1964 ASSERT(bp);
1965 xfs_buf_relse(bp);
1966 /*
1967 * Calculate new parameters. These are the final values to be reached.
1968 */
1969 nrextents = nrblocks;
1970 do_div(nrextents, in->extsize);
1971	nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
1972 nrextslog = xfs_highbit32(nrextents);
1973 nrsumlevels = nrextslog + 1;
1974 nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
1975 nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
1976 nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
1977 /*
1978 * New summary size can't be more than half the size of
1979 * the log. This prevents us from getting a log overflow,
1980 * since we'll log basically the whole summary file at once.
1981 */
1982 if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
1983 return XFS_ERROR(EINVAL);
1984 /*
1985 * Get the old block counts for bitmap and summary inodes.
1986 * These can't change since other growfs callers are locked out.
1987 */
1988 rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
1989 rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
1990 /*
1991 * Allocate space to the bitmap and summary files, as necessary.
1992 */
1993 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks,
1994 mp->m_sb.sb_rbmino)))
1995 return error;
1996 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
1997 mp->m_sb.sb_rsumino)))
1998 return error;
1999 nmp = NULL;
2000 /*
2001 * Loop over the bitmap blocks.
2002 * We will do everything one bitmap block at a time.
2003 * Skip the current block if it is exactly full.
2004 * This also deals with the case where there were no rtextents before.
2005 */
2006 for (bmbno = sbp->sb_rbmblocks -
2007 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
2008 bmbno < nrbmblocks;
2009 bmbno++) {
2010 /*
2011 * Allocate a new (fake) mount/sb.
2012 */
2013 nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
2014 *nmp = *mp;
2015 nsbp = &nmp->m_sb;
2016 /*
2017 * Calculate new sb and mount fields for this round.
2018 */
2019 nsbp->sb_rextsize = in->extsize;
2020 nsbp->sb_rbmblocks = bmbno + 1;
2021 nsbp->sb_rblocks =
2022 XFS_RTMIN(nrblocks,
2023 nsbp->sb_rbmblocks * NBBY *
2024 nsbp->sb_blocksize * nsbp->sb_rextsize);
2025 nsbp->sb_rextents = nsbp->sb_rblocks;
2026 do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
2027 nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
2028 nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
2029 nrsumsize =
2030 (uint)sizeof(xfs_suminfo_t) * nrsumlevels *
2031 nsbp->sb_rbmblocks;
2032 nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
2033 nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
2034 /*
2035 * Start a transaction, get the log reservation.
2036 */
2037 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
2038 cancelflags = 0;
2039 if ((error = xfs_trans_reserve(tp, 0,
2040 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
2041 goto error_exit;
2042 /*
2043 * Lock out other callers by grabbing the bitmap inode lock.
2044 */
2045		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2046				XFS_ILOCK_EXCL, &ip)))
2047 goto error_exit;
2048 ASSERT(ip == mp->m_rbmip);
2049 /*
2050 * Update the bitmap inode's size.
2051 */
2052 mp->m_rbmip->i_d.di_size =
2053 nsbp->sb_rbmblocks * nsbp->sb_blocksize;
2054 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2055 cancelflags |= XFS_TRANS_ABORT;
2056 /*
2057 * Get the summary inode into the transaction.
2058 */
2059 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
2060 0, XFS_ILOCK_EXCL, &ip)))
2061 goto error_exit;
2062 ASSERT(ip == mp->m_rsumip);
2063 /*
2064 * Update the summary inode's size.
2065 */
2066 mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
2067 xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
2068 /*
2069 * Copy summary data from old to new sizes.
2070 * Do this when the real size (not block-aligned) changes.
2071 */
2072 if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
2073 mp->m_rsumlevels != nmp->m_rsumlevels) {
2074 error = xfs_rtcopy_summary(mp, nmp, tp);
2075 if (error)
2076 goto error_exit;
2077 }
2078 /*
2079 * Update superblock fields.
2080 */
2081 if (nsbp->sb_rextsize != sbp->sb_rextsize)
2082 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
2083 nsbp->sb_rextsize - sbp->sb_rextsize);
2084 if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
2085 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
2086 nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
2087 if (nsbp->sb_rblocks != sbp->sb_rblocks)
2088 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
2089 nsbp->sb_rblocks - sbp->sb_rblocks);
2090 if (nsbp->sb_rextents != sbp->sb_rextents)
2091 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
2092 nsbp->sb_rextents - sbp->sb_rextents);
2093 if (nsbp->sb_rextslog != sbp->sb_rextslog)
2094 xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
2095 nsbp->sb_rextslog - sbp->sb_rextslog);
2096 /*
2097 * Free new extent.
2098 */
2099 bp = NULL;
2100 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2101 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2102 if (error)
2103 goto error_exit;
2104 /*
2105 * Mark more blocks free in the superblock.
2106 */
2107 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
2108 nsbp->sb_rextents - sbp->sb_rextents);
2109 /*
2110 * Free the fake mp structure.
2111 */
2112 kmem_free(nmp, sizeof(*nmp));
2113 nmp = NULL;
2114 /*
2115 * Update mp values into the real mp structure.
2116 */
2117 mp->m_rsumlevels = nrsumlevels;
2118 mp->m_rsumsize = nrsumsize;
2119 /*
2120 * Commit the transaction.
2121 */
2122 xfs_trans_commit(tp, 0, NULL);
2123 }
2124 return 0;
2125
2126 /*
2127 * Error paths come here.
2128 */
2129error_exit:
2130 if (nmp)
2131 kmem_free(nmp, sizeof(*nmp));
2132 xfs_trans_cancel(tp, cancelflags);
2133 return error;
2134}
2135
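The geometry computed at the top of xfs_growfs_rt follows directly from the superblock fields: one bitmap bit per realtime extent, and one summary level per bit of sb_rextslog plus one. The sketch below redoes the arithmetic in user space; howmany_64 and xfs_highbit32 are reimplemented locally, and the input values are assumptions chosen for illustration, not defaults.

	#include <stdint.h>
	#include <stdio.h>

	/* index of highest set bit, -1 if none; like xfs_highbit32 */
	static int highbit32(uint32_t v)
	{
		int r = -1;

		while (v) {
			v >>= 1;
			r++;
		}
		return r;
	}

	int main(void)
	{
		uint64_t nrblocks  = 1u << 20;	/* new rt size, fs blocks */
		uint32_t extsize   = 16;	/* rt extent size, fs blocks */
		uint32_t blocksize = 4096;
		uint64_t nrextents = nrblocks / extsize;
		/* one bitmap bit per extent, NBBY*blocksize bits/block */
		uint64_t bits_per_blk = 8ull * blocksize;
		uint64_t nrbmblocks = (nrextents + bits_per_blk - 1) /
				      bits_per_blk;	/* howmany_64 */
		int      nrextslog  = highbit32((uint32_t)nrextents);
		unsigned nrsumlevels = nrextslog + 1;
		uint64_t nrsumsize  = 4ull * nrsumlevels * nrbmblocks;
		uint64_t nrsumblocks = (nrsumsize + blocksize - 1) /
				       blocksize;

		printf("extents=%llu bmblocks=%llu levels=%u sumblocks=%llu\n",
		       (unsigned long long)nrextents,
		       (unsigned long long)nrbmblocks, nrsumlevels,
		       (unsigned long long)nrsumblocks);
		return 0;
	}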
2136/*
2137 * Allocate an extent in the realtime subvolume, with the usual allocation
2138 * parameters. The length units are all in realtime extents, as is the
2139 * result block number.
2140 */
2141int /* error */
2142xfs_rtallocate_extent(
2143 xfs_trans_t *tp, /* transaction pointer */
2144 xfs_rtblock_t bno, /* starting block number to allocate */
2145 xfs_extlen_t minlen, /* minimum length to allocate */
2146 xfs_extlen_t maxlen, /* maximum length to allocate */
2147 xfs_extlen_t *len, /* out: actual length allocated */
2148 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
2149 int wasdel, /* was a delayed allocation extent */
2150 xfs_extlen_t prod, /* extent product factor */
2151 xfs_rtblock_t *rtblock) /* out: start block allocated */
2152{
2153 int error; /* error value */
2154 xfs_inode_t *ip; /* inode for bitmap file */
2155 xfs_mount_t *mp; /* file system mount structure */
2156 xfs_rtblock_t r; /* result allocated block */
2157 xfs_fsblock_t sb; /* summary file block number */
2158 xfs_buf_t *sumbp; /* summary file block buffer */
2159
2160 ASSERT(minlen > 0 && minlen <= maxlen);
2161 mp = tp->t_mountp;
2162 /*
2163 * If prod is set then figure out what to do to minlen and maxlen.
2164 */
2165 if (prod > 1) {
2166 xfs_extlen_t i;
2167
2168 if ((i = maxlen % prod))
2169 maxlen -= i;
2170 if ((i = minlen % prod))
2171 minlen += prod - i;
2172 if (maxlen < minlen) {
2173 *rtblock = NULLRTBLOCK;
2174 return 0;
2175 }
2176 }
2177 /*
2178 * Lock out other callers by grabbing the bitmap inode lock.
2179 */
2180 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2181 if (error) {
2182 return error;
2183 }
2184 sumbp = NULL;
2185 /*
2186 * Allocate by size, or near another block, or exactly at some block.
2187 */
2188 switch (type) {
2189 case XFS_ALLOCTYPE_ANY_AG:
2190 error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
2191 &sumbp, &sb, prod, &r);
2192 break;
2193 case XFS_ALLOCTYPE_NEAR_BNO:
2194 error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
2195 len, &sumbp, &sb, prod, &r);
2196 break;
2197 case XFS_ALLOCTYPE_THIS_BNO:
2198 error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
2199 len, &sumbp, &sb, prod, &r);
2200 break;
2201 default:
2202 ASSERT(0);
2203 }
2204 if (error) {
2205 return error;
2206 }
2207 /*
2208 * If it worked, update the superblock.
2209 */
2210 if (r != NULLRTBLOCK) {
2211 long slen = (long)*len;
2212
2213 ASSERT(*len >= minlen && *len <= maxlen);
2214 if (wasdel)
2215 xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
2216 else
2217 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
2218 }
2219 *rtblock = r;
2220 return 0;
2221}
2222
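The prod handling at the top of the function rounds maxlen down and minlen up to multiples of the extent product factor, and returns NULLRTBLOCK if the two cross. The same rounding in isolation, with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int prod = 8, minlen = 10, maxlen = 29, i;

		if ((i = maxlen % prod))
			maxlen -= i;		/* 29 -> 24 */
		if ((i = minlen % prod))
			minlen += prod - i;	/* 10 -> 16 */
		if (maxlen < minlen)
			printf("no aligned allocation possible\n");
		else
			printf("minlen=%u maxlen=%u\n", minlen, maxlen);
		return 0;
	}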
2223/*
2224 * Free an extent in the realtime subvolume. Length is expressed in
2225 * realtime extents, as is the block number.
2226 */
2227int /* error */
2228xfs_rtfree_extent(
2229 xfs_trans_t *tp, /* transaction pointer */
2230 xfs_rtblock_t bno, /* starting block number to free */
2231 xfs_extlen_t len) /* length of extent freed */
2232{
2233 int error; /* error value */
2234 xfs_inode_t *ip; /* bitmap file inode */
2235 xfs_mount_t *mp; /* file system mount structure */
2236 xfs_fsblock_t sb; /* summary file block number */
2237 xfs_buf_t *sumbp; /* summary file block buffer */
2238
2239 mp = tp->t_mountp;
2240 /*
2241 * Synchronize by locking the bitmap inode.
2242 */
2243 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2244 if (error) {
2245 return error;
2246 }
2247#if defined(__KERNEL__) && defined(DEBUG)
2248 /*
2249 * Check to see that this whole range is currently allocated.
2250 */
2251 {
2252 int stat; /* result from checking range */
2253
2254 error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
2255 if (error) {
2256 return error;
2257 }
2258 ASSERT(stat);
2259 }
2260#endif
2261 sumbp = NULL;
2262 /*
2263 * Free the range of realtime blocks.
2264 */
2265 error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
2266 if (error) {
2267 return error;
2268 }
2269 /*
2270 * Mark more blocks free in the superblock.
2271 */
2272 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
2273 /*
2274 * If we've now freed all the blocks, reset the file sequence
2275 * number to 0.
2276 */
2277 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2278 mp->m_sb.sb_rextents) {
2279 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2280 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2281 *(__uint64_t *)&ip->i_d.di_atime = 0;
2282 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2283 }
2284 return 0;
2285}
2286
2287/*
2288 * Initialize realtime fields in the mount structure.
2289 */
2290int /* error */
2291xfs_rtmount_init(
2292 xfs_mount_t *mp) /* file system mount structure */
2293{
2294 xfs_buf_t *bp; /* buffer for last block of subvolume */
2295 xfs_daddr_t d; /* address of last block of subvolume */
2296 int error; /* error return value */
2297 xfs_sb_t *sbp; /* filesystem superblock copy in mount */
2298
2299 sbp = &mp->m_sb;
2300 if (sbp->sb_rblocks == 0)
2301 return 0;
2302 if (mp->m_rtdev_targp == NULL) {
2303 cmn_err(CE_WARN,
2304 "XFS: This filesystem has a realtime volume, use rtdev=device option");
2305 return XFS_ERROR(ENODEV);
2306 }
2307 mp->m_rsumlevels = sbp->sb_rextslog + 1;
2308 mp->m_rsumsize =
2309 (uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
2310 sbp->sb_rbmblocks;
2311 mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
2312 mp->m_rbmip = mp->m_rsumip = NULL;
2313 /*
2314 * Check that the realtime section is an ok size.
2315 */
2316 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2317 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2318 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2319 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2320 (unsigned long long) mp->m_sb.sb_rblocks);
2321 return XFS_ERROR(E2BIG);
2322 }
2323 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2324 d - XFS_FSB_TO_BB(mp, 1),
2325 XFS_FSB_TO_BB(mp, 1), 0, &bp);
2326 if (error) {
2327 cmn_err(CE_WARN,
2328 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2329 if (error == ENOSPC)
2330 return XFS_ERROR(E2BIG);
2331 return error;
2332 }
2333 xfs_buf_relse(bp);
2334 return 0;
2335}
2336
2337/*
2338 * Get the bitmap and summary inodes into the mount structure
2339 * at mount time.
2340 */
2341int /* error */
2342xfs_rtmount_inodes(
2343 xfs_mount_t *mp) /* file system mount structure */
2344{
2345 int error; /* error return value */
2346 xfs_sb_t *sbp;
2347
2348 sbp = &mp->m_sb;
2349 if (sbp->sb_rbmino == NULLFSINO)
2350 return 0;
2351 error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0);
2352 if (error)
2353 return error;
2354 ASSERT(mp->m_rbmip != NULL);
2355 ASSERT(sbp->sb_rsumino != NULLFSINO);
2356 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
2357 if (error) {
2358 VN_RELE(XFS_ITOV(mp->m_rbmip));
2359 return error;
2360 }
2361 ASSERT(mp->m_rsumip != NULL);
2362 return 0;
2363}
2364
2365/*
2366 * Pick an extent for allocation at the start of a new realtime file.
2367 * Use the sequence number stored in the atime field of the bitmap inode.
2368 * Translate this to a fraction of the rtextents, and return the product
2369 * of rtextents and the fraction.
2370 * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
2371 */
2372int /* error */
2373xfs_rtpick_extent(
2374 xfs_mount_t *mp, /* file system mount point */
2375 xfs_trans_t *tp, /* transaction pointer */
2376 xfs_extlen_t len, /* allocation length (rtextents) */
2377 xfs_rtblock_t *pick) /* result rt extent */
2378{
2379 xfs_rtblock_t b; /* result block */
2380 int error; /* error return value */
2381 xfs_inode_t *ip; /* bitmap incore inode */
2382 int log2; /* log of sequence number */
2383 __uint64_t resid; /* residual after log removed */
2384 __uint64_t seq; /* sequence number of file creation */
2385 __uint64_t *seqp; /* pointer to seqno in inode */
2386
2387 error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip);
2388 if (error)
2389 return error;
2390 ASSERT(ip == mp->m_rbmip);
2391 seqp = (__uint64_t *)&ip->i_d.di_atime;
2392 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2393 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2394 *seqp = 0;
2395 }
2396 seq = *seqp;
2397 if ((log2 = xfs_highbit64(seq)) == -1)
2398 b = 0;
2399 else {
2400 resid = seq - (1ULL << log2);
2401 b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
2402 (log2 + 1);
2403 if (b >= mp->m_sb.sb_rextents)
2404 b = do_mod(b, mp->m_sb.sb_rextents);
2405 if (b + len > mp->m_sb.sb_rextents)
2406 b = mp->m_sb.sb_rextents - len;
2407 }
2408 *seqp = seq + 1;
2409 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2410 *pick = b;
2411 return 0;
2412}
2413
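The fraction sequence is generated from the sequence number alone: with log2 = floor(log2(seq)) and resid = seq - 2^log2, the pick is rextents * (2*resid + 1) / 2^(log2+1). A short sketch printing the first few picks (the rextents value is an assumption for illustration):

	#include <stdint.h>
	#include <stdio.h>

	/* like xfs_highbit64: index of highest set bit, -1 if none */
	static int highbit64(uint64_t v)
	{
		int r = -1;

		while (v) {
			v >>= 1;
			r++;
		}
		return r;
	}

	int main(void)
	{
		uint64_t rextents = 1024;	/* assumed rt extent count */
		uint64_t seq;

		for (seq = 0; seq < 8; seq++) {
			int log2 = highbit64(seq);
			uint64_t b;

			if (log2 == -1)
				b = 0;
			else {
				uint64_t resid = seq - (1ULL << log2);

				b = (rextents * ((resid << 1) + 1ULL)) >>
				    (log2 + 1);
			}
			printf("seq %llu -> extent %llu\n",
			       (unsigned long long)seq,
			       (unsigned long long)b);
		}
		return 0;
	}

Running it gives extents 0, 512, 256, 768, 128, 384, 640, 896, i.e. the fractions 0, 1/2, 1/4, 3/4, 1/8, 3/8, 5/8, 7/8 of the assumed 1024 extents.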
2414#ifdef DEBUG
2415/*
2416 * Debug code: print out the value of a range in the bitmap.
2417 */
2418void
2419xfs_rtprint_range(
2420 xfs_mount_t *mp, /* file system mount structure */
2421 xfs_trans_t *tp, /* transaction pointer */
2422 xfs_rtblock_t start, /* starting block to print */
2423 xfs_extlen_t len) /* length to print */
2424{
2425 xfs_extlen_t i; /* block number in the extent */
2426
2427 printk("%Ld: ", (long long)start);
2428 for (i = 0; i < len; i++)
2429 printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
2430 printk("\n");
2431}
2432
2433/*
2434 * Debug code: print the summary file.
2435 */
2436void
2437xfs_rtprint_summary(
2438 xfs_mount_t *mp, /* file system mount structure */
2439 xfs_trans_t *tp) /* transaction pointer */
2440{
2441 xfs_suminfo_t c; /* summary data */
2442 xfs_rtblock_t i; /* bitmap block number */
2443 int l; /* summary information level */
2444 int p; /* flag for printed anything */
2445 xfs_fsblock_t sb; /* summary block number */
2446 xfs_buf_t *sumbp; /* summary block buffer */
2447
2448 sumbp = NULL;
2449 for (l = 0; l < mp->m_rsumlevels; l++) {
2450 for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
2451 (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
2452 if (c) {
2453 if (!p) {
2454 printk("%Ld-%Ld:", 1LL << l,
2455 XFS_RTMIN((1LL << l) +
2456 ((1LL << l) - 1LL),
2457 mp->m_sb.sb_rextents));
2458 p = 1;
2459 }
2460 printk(" %Ld:%d", (long long)i, c);
2461 }
2462 }
2463 if (p)
2464 printk("\n");
2465 }
2466 if (sumbp)
2467 xfs_trans_brelse(tp, sumbp);
2468}
2469#endif /* DEBUG */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 000000000000..e2710264c054
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,187 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_RTALLOC_H__
33#define __XFS_RTALLOC_H__
34
35struct xfs_mount;
36struct xfs_trans;
37
38#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
39
40/* Min and max rt extent sizes, specified in bytes */
41#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
42#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */
43#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */
44
45/*
46 * Constants for bit manipulations.
47 */
48#define XFS_NBBYLOG 3 /* log2(NBBY) */
49#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
50#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
51#define XFS_NBWORD (1 << XFS_NBWORDLOG)
52#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
53
54#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
55#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
56#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
57#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
58
59/*
60 * Summary and bit manipulation macros.
61 */
62#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
63#define XFS_SUMOFFSTOBLOCK(mp,s) \
64 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
65#define XFS_SUMPTR(mp,bp,so) \
66 ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
67 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
68
69#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
70#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
71#define XFS_BITTOWORD(mp,bi) \
72 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
73
74#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
75#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
76
77#define XFS_RTLOBIT(w) xfs_lowbit32(w)
78#define XFS_RTHIBIT(w) xfs_highbit32(w)
79
80#if XFS_BIG_BLKNOS
81#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
82#else
83#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
84#endif
85
86
87#ifdef __KERNEL__
88
89#ifdef CONFIG_XFS_RT
90/*
91 * Function prototypes for exported functions.
92 */
93
94/*
95 * Allocate an extent in the realtime subvolume, with the usual allocation
96 * parameters. The length units are all in realtime extents, as is the
97 * result block number.
98 */
99int /* error */
100xfs_rtallocate_extent(
101 struct xfs_trans *tp, /* transaction pointer */
102 xfs_rtblock_t bno, /* starting block number to allocate */
103 xfs_extlen_t minlen, /* minimum length to allocate */
104 xfs_extlen_t maxlen, /* maximum length to allocate */
105 xfs_extlen_t *len, /* out: actual length allocated */
106 xfs_alloctype_t type, /* allocation type XFS_ALLOCTYPE... */
107 int wasdel, /* was a delayed allocation extent */
108 xfs_extlen_t prod, /* extent product factor */
109 xfs_rtblock_t *rtblock); /* out: start block allocated */
110
111/*
112 * Free an extent in the realtime subvolume. Length is expressed in
113 * realtime extents, as is the block number.
114 */
115int /* error */
116xfs_rtfree_extent(
117 struct xfs_trans *tp, /* transaction pointer */
118 xfs_rtblock_t bno, /* starting block number to free */
119 xfs_extlen_t len); /* length of extent freed */
120
121/*
122 * Initialize realtime fields in the mount structure.
123 */
124int /* error */
125xfs_rtmount_init(
126 struct xfs_mount *mp); /* file system mount structure */
127
128/*
129 * Get the bitmap and summary inodes into the mount structure
130 * at mount time.
131 */
132int /* error */
133xfs_rtmount_inodes(
134 struct xfs_mount *mp); /* file system mount structure */
135
136/*
137 * Pick an extent for allocation at the start of a new realtime file.
138 * Use the sequence number stored in the atime field of the bitmap inode.
139 * Translate this to a fraction of the rtextents, and return the product
140 * of rtextents and the fraction.
141 * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
142 */
143int /* error */
144xfs_rtpick_extent(
145 struct xfs_mount *mp, /* file system mount point */
146 struct xfs_trans *tp, /* transaction pointer */
147 xfs_extlen_t len, /* allocation length (rtextents) */
148 xfs_rtblock_t *pick); /* result rt extent */
149
150/*
151 * Debug code: print out the value of a range in the bitmap.
152 */
153void
154xfs_rtprint_range(
155 struct xfs_mount *mp, /* file system mount structure */
156 struct xfs_trans *tp, /* transaction pointer */
157 xfs_rtblock_t start, /* starting block to print */
158 xfs_extlen_t len); /* length to print */
159
160/*
161 * Debug code: print the summary file.
162 */
163void
164xfs_rtprint_summary(
165 struct xfs_mount *mp, /* file system mount structure */
166 struct xfs_trans *tp); /* transaction pointer */
167
168/*
169 * Grow the realtime area of the filesystem.
170 */
171int
172xfs_growfs_rt(
173 struct xfs_mount *mp, /* file system mount structure */
174 xfs_growfs_rt_t *in); /* user supplied growfs struct */
175
176#else
177# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS)
178# define xfs_rtfree_extent(t,b,l) (ENOSYS)
179# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
180# define xfs_growfs_rt(mp,in) (ENOSYS)
181# define xfs_rtmount_init(m)	(((m)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
182# define xfs_rtmount_inodes(m)	(((m)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
183#endif /* CONFIG_XFS_RT */
184
185#endif /* __KERNEL__ */
186
187#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
new file mode 100644
index 000000000000..d3ff7aef33ba
--- /dev/null
+++ b/fs/xfs/xfs_rw.c
@@ -0,0 +1,356 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode_item.h"
58#include "xfs_inode.h"
59#include "xfs_bmap.h"
60#include "xfs_acl.h"
61#include "xfs_mac.h"
62#include "xfs_error.h"
63#include "xfs_buf_item.h"
64#include "xfs_rw.h"
65
66/*
67 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
68 * which clears the setuid and setgid bits when a file is written.
69 */
70int
71xfs_write_clear_setuid(
72 xfs_inode_t *ip)
73{
74 xfs_mount_t *mp;
75 xfs_trans_t *tp;
76 int error;
77
78 mp = ip->i_mount;
79 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
80 if ((error = xfs_trans_reserve(tp, 0,
81 XFS_WRITEID_LOG_RES(mp),
82 0, 0, 0))) {
83 xfs_trans_cancel(tp, 0);
84 return error;
85 }
86 xfs_ilock(ip, XFS_ILOCK_EXCL);
87 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
88 xfs_trans_ihold(tp, ip);
89 ip->i_d.di_mode &= ~S_ISUID;
90
91 /*
92 * Note that we don't have to worry about mandatory
93 * file locking being disabled here because we only
94 * clear the S_ISGID bit if the Group execute bit is
95 * on, but if it was on then mandatory locking wouldn't
96 * have been enabled.
97 */
98 if (ip->i_d.di_mode & S_IXGRP) {
99 ip->i_d.di_mode &= ~S_ISGID;
100 }
101 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
102 xfs_trans_set_sync(tp);
103 error = xfs_trans_commit(tp, 0, NULL);
104 xfs_iunlock(ip, XFS_ILOCK_EXCL);
105 return error;
106}
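
/*
 * A minimal usage sketch for the helper above, not part of the
 * original file: a write path would test the mode bits first so the
 * synchronous transaction is only paid for when a bit is actually set.
 * The caller name and surrounding logic are illustrative assumptions.
 */
STATIC int
example_clear_setuid_on_write(
 xfs_inode_t *ip)
{
 if (ip->i_d.di_mode & (S_ISUID | S_ISGID))
  return xfs_write_clear_setuid(ip);
 return 0;
}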
107
108/*
109 * Force a shutdown of the filesystem instantly while keeping
110 * the filesystem consistent. We don't do an unmount here; just shutdown
111 * the shop, make sure that absolutely nothing persistent happens to
112 * this filesystem after this point.
113 */
114
115void
116xfs_do_force_shutdown(
117 bhv_desc_t *bdp,
118 int flags,
119 char *fname,
120 int lnnum)
121{
122 int logerror;
123 xfs_mount_t *mp;
124
125 mp = XFS_BHVTOM(bdp);
126 logerror = flags & XFS_LOG_IO_ERROR;
127
128 if (!(flags & XFS_FORCE_UMOUNT)) {
129 cmn_err(CE_NOTE,
130 "xfs_force_shutdown(%s,0x%x) called from line %d of file %s. Return address = 0x%p",
131 mp->m_fsname,flags,lnnum,fname,__return_address);
132 }
133 /*
134 * No need to duplicate efforts.
135 */
136 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
137 return;
138
139 /*
140 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
141 * queue up anybody new on the log reservations, and wakes up
142 * everybody who's sleeping on log reservations and tells
143 * them the bad news.
144 */
145 if (xfs_log_force_umount(mp, logerror))
146 return;
147
148 if (flags & XFS_CORRUPT_INCORE) {
149 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
150 "Corruption of in-memory data detected. Shutting down filesystem: %s",
151 mp->m_fsname);
152 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
153 xfs_stack_trace();
154 }
155 } else if (!(flags & XFS_FORCE_UMOUNT)) {
156 if (logerror) {
157 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
158 "Log I/O Error Detected. Shutting down filesystem: %s",
159 mp->m_fsname);
160 } else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
161 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
162 "I/O Error Detected. Shutting down filesystem: %s",
163 mp->m_fsname);
164 }
165 }
166 if (!(flags & XFS_FORCE_UMOUNT)) {
167 cmn_err(CE_ALERT,
168 "Please umount the filesystem, and rectify the problem(s)");
169 }
170}
171
172
173/*
174 * Called when we want to stop a buffer from getting written or read.
175 * We attach the EIO error, muck with its flags, and call biodone
176 * so that the proper iodone callbacks get called.
177 */
178int
179xfs_bioerror(
180 xfs_buf_t *bp)
181{
182
183#ifdef XFSERRORDEBUG
184 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
185#endif
186
187 /*
188 * No need to wait until the buffer is unpinned.
189 * We aren't flushing it.
190 */
191 xfs_buftrace("XFS IOERROR", bp);
192 XFS_BUF_ERROR(bp, EIO);
193 /*
194 * We're calling biodone, so delete the B_DONE flag. Either way
195 * we have to call the iodone callback, and calling biodone
196 * probably is the best way since it takes care of
197 * GRIO as well.
198 */
199 XFS_BUF_UNREAD(bp);
200 XFS_BUF_UNDELAYWRITE(bp);
201 XFS_BUF_UNDONE(bp);
202 XFS_BUF_STALE(bp);
203
204 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
205 xfs_biodone(bp);
206
207 return (EIO);
208}
209
210/*
211 * Same as xfs_bioerror, except that we are releasing the buffer
212 * here ourselves, and avoiding the biodone call.
213 * This is meant for userdata errors; metadata bufs come with
214 * iodone functions attached, so that we can track down errors.
215 */
216int
217xfs_bioerror_relse(
218 xfs_buf_t *bp)
219{
220 int64_t fl;
221
222 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
223 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
224
225 xfs_buftrace("XFS IOERRELSE", bp);
226 fl = XFS_BUF_BFLAGS(bp);
227 /*
228 * No need to wait until the buffer is unpinned.
229 * We aren't flushing it.
230 *
231 * chunkhold expects B_DONE to be set, whether
232 * we actually finish the I/O or not. We don't want to
233 * change that interface.
234 */
235 XFS_BUF_UNREAD(bp);
236 XFS_BUF_UNDELAYWRITE(bp);
237 XFS_BUF_DONE(bp);
238 XFS_BUF_STALE(bp);
239 XFS_BUF_CLR_IODONE_FUNC(bp);
240 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
241 if (!(fl & XFS_B_ASYNC)) {
242 /*
243 * Mark b_error and B_ERROR _both_.
244 * Lots of chunkcache code assumes that.
245 * There's no reason to mark error for
246 * ASYNC buffers.
247 */
248 XFS_BUF_ERROR(bp, EIO);
249 XFS_BUF_V_IODONESEMA(bp);
250 } else {
251 xfs_buf_relse(bp);
252 }
253 return (EIO);
254}
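
/*
 * A hedged sketch of how a strategy routine might choose between the
 * two error paths above once the filesystem is shut down: buffers
 * with an iodone callback attached go through xfs_bioerror() so the
 * callback still runs, while plain user-data buffers are released
 * directly via xfs_bioerror_relse(). The function name and the exact
 * policy shown are illustrative assumptions, not from this file.
 */
STATIC int
example_shutdown_bdstrat(
 xfs_mount_t *mp,
 xfs_buf_t *bp)
{
 ASSERT(XFS_FORCED_SHUTDOWN(mp));
 if (XFS_BUF_IODONE_FUNC(bp))
  return xfs_bioerror(bp);
 return xfs_bioerror_relse(bp);
}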
255/*
256 * Prints out an ALERT message about I/O error.
257 */
258void
259xfs_ioerror_alert(
260 char *func,
261 struct xfs_mount *mp,
262 xfs_buf_t *bp,
263 xfs_daddr_t blkno)
264{
265 cmn_err(CE_ALERT,
266 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
267 " (\"%s\") error %d buf count %u",
268 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
269 XFS_BUFTARG_NAME(bp->pb_target),
270 (__uint64_t)blkno,
271 func,
272 XFS_BUF_GETERROR(bp),
273 XFS_BUF_COUNT(bp));
274}
275
276/*
277 * This isn't an absolute requirement, but it is
278 * just a good idea to call xfs_read_buf instead of
279 * directly doing a read_buf call. First, we shouldn't
280 * be doing this disk read if we are in SHUTDOWN state anyway,
281 * so this stops that from happening. Second, this does all
282 * the error checking stuff and the brelse if appropriate for
283 * the caller, so the code can be a little leaner.
284 */
285
286int
287xfs_read_buf(
288 struct xfs_mount *mp,
289 xfs_buftarg_t *target,
290 xfs_daddr_t blkno,
291 int len,
292 uint flags,
293 xfs_buf_t **bpp)
294{
295 xfs_buf_t *bp;
296 int error;
297
298 if (flags)
299 bp = xfs_buf_read_flags(target, blkno, len, flags);
300 else
301 bp = xfs_buf_read(target, blkno, len, flags);
302 if (!bp)
303 return XFS_ERROR(EIO);
304 error = XFS_BUF_GETERROR(bp);
305 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
306 *bpp = bp;
307 } else {
308 *bpp = NULL;
309 if (error) {
310 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
311 } else {
312 error = XFS_ERROR(EIO);
313 }
314 if (bp) {
315 XFS_BUF_UNDONE(bp);
316 XFS_BUF_UNDELAYWRITE(bp);
317 XFS_BUF_STALE(bp);
318 /*
319 * brelse clears B_ERROR and b_error
320 */
321 xfs_buf_relse(bp);
322 }
323 }
324 return (error);
325}
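
/*
 * A minimal usage sketch for xfs_read_buf(): read one block and
 * release the buffer again. The block number and length here are
 * illustrative; on error xfs_read_buf() has already released the
 * buffer, so the caller only propagates the error.
 */
STATIC int
example_read_one_block(
 xfs_mount_t *mp,
 xfs_buftarg_t *target,
 xfs_daddr_t blkno,
 int len)
{
 xfs_buf_t *bp;
 int error;

 error = xfs_read_buf(mp, target, blkno, len, 0, &bp);
 if (error)
  return error;
 /* ... inspect the data via XFS_BUF_PTR(bp) here ... */
 xfs_buf_relse(bp);
 return 0;
}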
326
327/*
328 * Wrapper around bwrite() so that we can trap
329 * write errors, and act accordingly.
330 */
331int
332xfs_bwrite(
333 struct xfs_mount *mp,
334 struct xfs_buf *bp)
335{
336 int error;
337
338 /*
339 * XXXsup how does this work for quotas.
340 */
341 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
342 XFS_BUF_SET_FSPRIVATE3(bp, mp);
343 XFS_BUF_WRITE(bp);
344
345 if ((error = XFS_bwrite(bp))) {
346 ASSERT(mp);
347 /*
348 * Cannot put a buftrace here since if the buffer is not
349 * B_HOLD then we will brelse() the buffer before returning
350 * from bwrite and we could be tracing a buffer that has
351 * been reused.
352 */
353 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
354 }
355 return (error);
356}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
new file mode 100644
index 000000000000..c8b10bf8f530
--- /dev/null
+++ b/fs/xfs/xfs_rw.h
@@ -0,0 +1,154 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_RW_H__
33#define __XFS_RW_H__
34
35struct xfs_buf;
36struct xfs_inode;
37struct xfs_mount;
38
39/*
40 * Maximum count of bmaps used by read and write paths.
41 */
42#define XFS_MAX_RW_NBMAPS 4
43
44/*
45 * Counts of readahead buffers to use based on physical memory size.
46 * None of these should be more than XFS_MAX_RW_NBMAPS.
47 */
48#define XFS_RW_NREADAHEAD_16MB 2
49#define XFS_RW_NREADAHEAD_32MB 3
50#define XFS_RW_NREADAHEAD_K32 4
51#define XFS_RW_NREADAHEAD_K64 4
52
53/*
54 * Maximum size of a buffer that we'll map. Making this
55 * too big will degrade performance due to the number of
56 * pages which need to be gathered. Making it too small
57 * will prevent us from doing large I/O's to hardware that
58 * needs it.
59 *
60 * This is currently set to 512 KB.
61 */
62#define XFS_MAX_BMAP_LEN_BB 1024
63#define XFS_MAX_BMAP_LEN_BYTES 524288
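
/*
 * Worked arithmetic for the two limits above (a sketch of the
 * intended relationship, given 512-byte basic blocks):
 *
 * 1024 BB * 512 bytes/BB = 524288 bytes = 512 KB
 *
 * i.e. XFS_MAX_BMAP_LEN_BYTES == XFS_MAX_BMAP_LEN_BB << 9.
 */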
64
65/*
66 * Convert the given file system block to a disk block.
67 * We have to treat it differently based on whether the
68 * file is a real time file or not, because the bmap code
69 * does.
70 */
71#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DB)
72xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
73#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb)
74#else
75#define XFS_FSB_TO_DB(ip,fsb) \
76 (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \
77 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
78 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)))
79#endif
80
81#define XFS_FSB_TO_DB_IO(io,fsb) \
82 (((io)->io_flags & XFS_IOCORE_RT) ? \
83 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
84 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
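
/*
 * An illustrative reading of the two macros above, offered as a
 * sketch: realtime file blocks are numbered linearly on the realtime
 * device, so only the shift to basic blocks (XFS_FSB_TO_BB) is
 * needed, while regular file blocks encode an allocation group and
 * must be decomposed by XFS_FSB_TO_DADDR first.
 */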
85
86/*
87 * Prototypes for functions in xfs_rw.c.
88 */
89
90int
91xfs_write_clear_setuid(
92 struct xfs_inode *ip);
93
94int
95xfs_bwrite(
96 struct xfs_mount *mp,
97 struct xfs_buf *bp);
98
99int
100xfs_bioerror(
101 struct xfs_buf *b);
102
103int
104xfs_bioerror_relse(
105 struct xfs_buf *b);
106
107int
108xfs_read_buf(
109 struct xfs_mount *mp,
110 xfs_buftarg_t *target,
111 xfs_daddr_t blkno,
112 int len,
113 uint flags,
114 struct xfs_buf **bpp);
115
116void
117xfs_ioerror_alert(
118 char *func,
119 struct xfs_mount *mp,
120 xfs_buf_t *bp,
121 xfs_daddr_t blkno);
122
123
124/*
125 * Prototypes for functions in xfs_vnodeops.c.
126 */
127
128int
129xfs_rwlock(
130 bhv_desc_t *bdp,
131 vrwlock_t write_lock);
132
133void
134xfs_rwunlock(
135 bhv_desc_t *bdp,
136 vrwlock_t write_lock);
137
138int
139xfs_change_file_space(
140 bhv_desc_t *bdp,
141 int cmd,
142 xfs_flock64_t *bf,
143 xfs_off_t offset,
144 cred_t *credp,
145 int flags);
146
147int
148xfs_set_dmattrs(
149 bhv_desc_t *bdp,
150 u_int evmask,
151 u_int16_t state,
152 cred_t *credp);
153
154#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
new file mode 100644
index 000000000000..ad090a834ced
--- /dev/null
+++ b/fs/xfs/xfs_sb.h
@@ -0,0 +1,583 @@
1/*
2 * Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_SB_H__
33#define __XFS_SB_H__
34
35/*
36 * Super block
37 * Fits into a sector-sized buffer at address 0 of each allocation group.
38 * Only the first of these is ever updated except during growfs.
39 */
40
41struct xfs_buf;
42struct xfs_mount;
43
44#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
45#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
46#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */
47#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */
48#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */
49#define XFS_SB_VERSION_NUMBITS 0x000f
50#define XFS_SB_VERSION_ALLFBITS 0xfff0
51#define XFS_SB_VERSION_SASHFBITS 0xf000
52#define XFS_SB_VERSION_REALFBITS 0x0ff0
53#define XFS_SB_VERSION_ATTRBIT 0x0010
54#define XFS_SB_VERSION_NLINKBIT 0x0020
55#define XFS_SB_VERSION_QUOTABIT 0x0040
56#define XFS_SB_VERSION_ALIGNBIT 0x0080
57#define XFS_SB_VERSION_DALIGNBIT 0x0100
58#define XFS_SB_VERSION_SHAREDBIT 0x0200
59#define XFS_SB_VERSION_LOGV2BIT 0x0400
60#define XFS_SB_VERSION_SECTORBIT 0x0800
61#define XFS_SB_VERSION_EXTFLGBIT 0x1000
62#define XFS_SB_VERSION_DIRV2BIT 0x2000
63#define XFS_SB_VERSION_MOREBITSBIT 0x8000
64#define XFS_SB_VERSION_OKSASHFBITS \
65 (XFS_SB_VERSION_EXTFLGBIT | \
66 XFS_SB_VERSION_DIRV2BIT)
67#define XFS_SB_VERSION_OKREALFBITS \
68 (XFS_SB_VERSION_ATTRBIT | \
69 XFS_SB_VERSION_NLINKBIT | \
70 XFS_SB_VERSION_QUOTABIT | \
71 XFS_SB_VERSION_ALIGNBIT | \
72 XFS_SB_VERSION_DALIGNBIT | \
73 XFS_SB_VERSION_SHAREDBIT | \
74 XFS_SB_VERSION_LOGV2BIT | \
75 XFS_SB_VERSION_SECTORBIT)
76#define XFS_SB_VERSION_OKSASHBITS \
77 (XFS_SB_VERSION_NUMBITS | \
78 XFS_SB_VERSION_REALFBITS | \
79 XFS_SB_VERSION_OKSASHFBITS)
80#define XFS_SB_VERSION_OKREALBITS \
81 (XFS_SB_VERSION_NUMBITS | \
82 XFS_SB_VERSION_OKREALFBITS | \
83 XFS_SB_VERSION_OKSASHFBITS)
84#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \
85 (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \
86 (morebits)) ? \
87 (XFS_SB_VERSION_4 | \
88 ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \
89 ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \
90 ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \
91 ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \
92 ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \
93 ((sflag) ? XFS_SB_VERSION_SECTORBIT : 0) | \
94 ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \
95 XFS_SB_VERSION_1)
96
97/*
98 * There are two words to hold XFS "feature" bits: the original
99 * word, sb_versionnum, and sb_features2. Whenever a bit is set in
100 * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set.
101 *
102 * These defines represent bits in sb_features2.
103 */
104#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
105#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
106#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that
107 require changing
108 PROM and SASH */
109
110#define XFS_SB_VERSION2_OKREALFBITS \
111 (0)
112#define XFS_SB_VERSION2_OKSASHFBITS \
113 (0)
114#define XFS_SB_VERSION2_OKREALBITS \
115 (XFS_SB_VERSION2_OKREALFBITS | \
116 XFS_SB_VERSION2_OKSASHFBITS )
117
118/*
119 * mkfs macro to set up sb_features2 word
120 */
121#define XFS_SB_VERSION2_MKFS(xyz) \
122 ((xyz) ? 0 : 0)
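
/*
 * A hedged sketch of the rule above: a hypothetical sb_features2 bit
 * (XFS_SB_VERSION2_YBIT is not a real define) would only be honored
 * when XFS_SB_VERSION_MOREBITSBIT is also set, e.g.:
 *
 * #define XFS_SB_VERSION_HASY(sbp) \
 *  (XFS_SB_VERSION_HASMOREBITS(sbp) && \
 *   ((sbp)->sb_features2 & XFS_SB_VERSION2_YBIT))
 */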
123
124typedef struct xfs_sb
125{
126 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
127 __uint32_t sb_blocksize; /* logical block size, bytes */
128 xfs_drfsbno_t sb_dblocks; /* number of data blocks */
129 xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */
130 xfs_drtbno_t sb_rextents; /* number of realtime extents */
131 uuid_t sb_uuid; /* file system unique id */
132 xfs_dfsbno_t sb_logstart; /* starting block of log if internal */
133 xfs_ino_t sb_rootino; /* root inode number */
134 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
135 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
136 xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */
137 xfs_agblock_t sb_agblocks; /* size of an allocation group */
138 xfs_agnumber_t sb_agcount; /* number of allocation groups */
139 xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */
140 xfs_extlen_t sb_logblocks; /* number of log blocks */
141 __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */
142 __uint16_t sb_sectsize; /* volume sector size, bytes */
143 __uint16_t sb_inodesize; /* inode size, bytes */
144 __uint16_t sb_inopblock; /* inodes per block */
145 char sb_fname[12]; /* file system name */
146 __uint8_t sb_blocklog; /* log2 of sb_blocksize */
147 __uint8_t sb_sectlog; /* log2 of sb_sectsize */
148 __uint8_t sb_inodelog; /* log2 of sb_inodesize */
149 __uint8_t sb_inopblog; /* log2 of sb_inopblock */
150 __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */
151 __uint8_t sb_rextslog; /* log2 of sb_rextents */
152 __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */
153 __uint8_t sb_imax_pct; /* max % of fs for inode space */
154 /* statistics */
155 /*
156 * These fields must remain contiguous. If you really
157 * want to change their layout, make sure you fix the
158 * code in xfs_trans_apply_sb_deltas().
159 */
160 __uint64_t sb_icount; /* allocated inodes */
161 __uint64_t sb_ifree; /* free inodes */
162 __uint64_t sb_fdblocks; /* free data blocks */
163 __uint64_t sb_frextents; /* free realtime extents */
164 /*
165 * End contiguous fields.
166 */
167 xfs_ino_t sb_uquotino; /* user quota inode */
168 xfs_ino_t sb_gquotino; /* group quota inode */
169 __uint16_t sb_qflags; /* quota flags */
170 __uint8_t sb_flags; /* misc. flags */
171 __uint8_t sb_shared_vn; /* shared version number */
172 xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */
173 __uint32_t sb_unit; /* stripe or raid unit */
174 __uint32_t sb_width; /* stripe or raid width */
175 __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */
176 __uint8_t sb_logsectlog; /* log2 of the log sector size */
177 __uint16_t sb_logsectsize; /* sector size for the log, bytes */
178 __uint32_t sb_logsunit; /* stripe unit size for the log */
179 __uint32_t sb_features2; /* additional feature bits */
180} xfs_sb_t;
181
182/*
183 * Sequence number values for the fields.
184 */
185typedef enum {
186 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
187 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
188 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
189 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
190 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
191 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
192 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
193 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
194 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
195 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
196 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
197 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
198 XFS_SBS_FEATURES2,
199 XFS_SBS_FIELDCOUNT
200} xfs_sb_field_t;
201
202/*
203 * Mask values, defined based on the xfs_sb_field_t values.
204 * Only define the ones we're using.
205 */
206#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
207#define XFS_SB_UUID XFS_SB_MVAL(UUID)
208#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
209#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
210#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
211#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
212#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
213#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
214#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
215#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
216#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
217#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
218#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
219#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
220#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
221#define XFS_SB_MOD_BITS \
222 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
223 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
224 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH)
225
226/*
227 * Misc. Flags - warning - these will be cleared by xfs_repair unless
228 * a feature bit is set when the flag is used.
229 */
230#define XFS_SBF_NOFLAGS 0x00 /* no flags set */
231#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */
232
233/*
234 * define max. shared version we can interoperate with
235 */
236#define XFS_SB_MAX_SHARED_VN 0
237
238#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_NUM)
239int xfs_sb_version_num(xfs_sb_t *sbp);
240#define XFS_SB_VERSION_NUM(sbp) xfs_sb_version_num(sbp)
241#else
242#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
243#endif
244
245#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_GOOD_VERSION)
246int xfs_sb_good_version(xfs_sb_t *sbp);
247#define XFS_SB_GOOD_VERSION(sbp) xfs_sb_good_version(sbp)
248#else
249#define XFS_SB_GOOD_VERSION_INT(sbp) \
250 ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
251 ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
252 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
253 !(((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
254 (((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
255 ((sbp)->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
256
257#ifdef __KERNEL__
258#define XFS_SB_GOOD_VERSION(sbp) \
259 (XFS_SB_GOOD_VERSION_INT(sbp) && \
260 (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN) ))
261#else
262/*
263 * extra 2 parens here (( to unconfuse paren-matching editors
264 * like vi because XFS_SB_GOOD_VERSION_INT is a partial expression
265 * and the two XFS_SB_GOOD_VERSION's each add 2 more close parens to
266 * complete the expression.
267 */
268#define XFS_SB_GOOD_VERSION(sbp) \
269 (XFS_SB_GOOD_VERSION_INT(sbp) && \
270 (!((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
271 (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)) ))
272#endif /* __KERNEL__ */
273#endif
274
275#define XFS_SB_GOOD_SASH_VERSION(sbp) \
276 ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
277 ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
278 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
279 !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
280
281#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TONEW)
282unsigned xfs_sb_version_tonew(unsigned v);
283#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
284#else
285#define XFS_SB_VERSION_TONEW(v) \
286 ((((v) == XFS_SB_VERSION_1) ? \
287 0 : \
288 (((v) == XFS_SB_VERSION_2) ? \
289 XFS_SB_VERSION_ATTRBIT : \
290 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
291 XFS_SB_VERSION_4)
292#endif
293
294#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TOOLD)
295unsigned xfs_sb_version_toold(unsigned v);
296#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v)
297#else
298#define XFS_SB_VERSION_TOOLD(v) \
299 (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
300 0 : \
301 (((v) & XFS_SB_VERSION_NLINKBIT) ? \
302 XFS_SB_VERSION_3 : \
303 (((v) & XFS_SB_VERSION_ATTRBIT) ? \
304 XFS_SB_VERSION_2 : \
305 XFS_SB_VERSION_1)))
306#endif
307
308#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASATTR)
309int xfs_sb_version_hasattr(xfs_sb_t *sbp);
310#define XFS_SB_VERSION_HASATTR(sbp) xfs_sb_version_hasattr(sbp)
311#else
312#define XFS_SB_VERSION_HASATTR(sbp) \
313 (((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
314 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
316 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)))
317#endif
318
319#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDATTR)
320void xfs_sb_version_addattr(xfs_sb_t *sbp);
321#define XFS_SB_VERSION_ADDATTR(sbp) xfs_sb_version_addattr(sbp)
322#else
323#define XFS_SB_VERSION_ADDATTR(sbp) \
324 ((sbp)->sb_versionnum = \
325 (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
326 XFS_SB_VERSION_2 : \
327 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
328 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
329 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))))
330#endif
331
332#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASNLINK)
333int xfs_sb_version_hasnlink(xfs_sb_t *sbp);
334#define XFS_SB_VERSION_HASNLINK(sbp) xfs_sb_version_hasnlink(sbp)
335#else
336#define XFS_SB_VERSION_HASNLINK(sbp) \
337 (((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
338 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
339 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)))
340#endif
341
342#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDNLINK)
343void xfs_sb_version_addnlink(xfs_sb_t *sbp);
344#define XFS_SB_VERSION_ADDNLINK(sbp) xfs_sb_version_addnlink(sbp)
345#else
346#define XFS_SB_VERSION_ADDNLINK(sbp) \
347 ((sbp)->sb_versionnum = \
348 ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
349 XFS_SB_VERSION_3 : \
350 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)))
351#endif
352
353#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASQUOTA)
354int xfs_sb_version_hasquota(xfs_sb_t *sbp);
355#define XFS_SB_VERSION_HASQUOTA(sbp) xfs_sb_version_hasquota(sbp)
356#else
357#define XFS_SB_VERSION_HASQUOTA(sbp) \
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT))
360#endif
361
362#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDQUOTA)
363void xfs_sb_version_addquota(xfs_sb_t *sbp);
364#define XFS_SB_VERSION_ADDQUOTA(sbp) xfs_sb_version_addquota(sbp)
365#else
366#define XFS_SB_VERSION_ADDQUOTA(sbp) \
367 ((sbp)->sb_versionnum = \
368 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
369 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
370 (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \
371 XFS_SB_VERSION_QUOTABIT)))
372#endif
373
374#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASALIGN)
375int xfs_sb_version_hasalign(xfs_sb_t *sbp);
376#define XFS_SB_VERSION_HASALIGN(sbp) xfs_sb_version_hasalign(sbp)
377#else
378#define XFS_SB_VERSION_HASALIGN(sbp) \
379 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
380 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT))
381#endif
382
383#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBALIGN)
384void xfs_sb_version_subalign(xfs_sb_t *sbp);
385#define XFS_SB_VERSION_SUBALIGN(sbp) xfs_sb_version_subalign(sbp)
386#else
387#define XFS_SB_VERSION_SUBALIGN(sbp) \
388 ((sbp)->sb_versionnum = \
389 XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT))
390#endif
391
392#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDALIGN)
393int xfs_sb_version_hasdalign(xfs_sb_t *sbp);
394#define XFS_SB_VERSION_HASDALIGN(sbp) xfs_sb_version_hasdalign(sbp)
395#else
396#define XFS_SB_VERSION_HASDALIGN(sbp) \
397 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
398 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT))
399#endif
400
401#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDDALIGN)
402int xfs_sb_version_adddalign(xfs_sb_t *sbp);
403#define XFS_SB_VERSION_ADDDALIGN(sbp) xfs_sb_version_adddalign(sbp)
404#else
405#define XFS_SB_VERSION_ADDDALIGN(sbp) \
406 ((sbp)->sb_versionnum = \
407 ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT))
408#endif
409
410#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSHARED)
411int xfs_sb_version_hasshared(xfs_sb_t *sbp);
412#define XFS_SB_VERSION_HASSHARED(sbp) xfs_sb_version_hasshared(sbp)
413#else
414#define XFS_SB_VERSION_HASSHARED(sbp) \
415 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
416 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT))
417#endif
418
419#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDSHARED)
420int xfs_sb_version_addshared(xfs_sb_t *sbp);
421#define XFS_SB_VERSION_ADDSHARED(sbp) xfs_sb_version_addshared(sbp)
422#else
423#define XFS_SB_VERSION_ADDSHARED(sbp) \
424 ((sbp)->sb_versionnum = \
425 ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT))
426#endif
427
428#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBSHARED)
429int xfs_sb_version_subshared(xfs_sb_t *sbp);
430#define XFS_SB_VERSION_SUBSHARED(sbp) xfs_sb_version_subshared(sbp)
431#else
432#define XFS_SB_VERSION_SUBSHARED(sbp) \
433 ((sbp)->sb_versionnum = \
434 ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT))
435#endif
436
437#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDIRV2)
438int xfs_sb_version_hasdirv2(xfs_sb_t *sbp);
439#define XFS_SB_VERSION_HASDIRV2(sbp) xfs_sb_version_hasdirv2(sbp)
440#else
441#define XFS_SB_VERSION_HASDIRV2(sbp) \
442 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
443 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
444#endif
445
446#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASLOGV2)
447int xfs_sb_version_haslogv2(xfs_sb_t *sbp);
448#define XFS_SB_VERSION_HASLOGV2(sbp) xfs_sb_version_haslogv2(sbp)
449#else
450#define XFS_SB_VERSION_HASLOGV2(sbp) \
451 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
452 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT))
453#endif
454
455#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
456int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp);
457#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) xfs_sb_version_hasextflgbit(sbp)
458#else
459#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) \
460 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
461 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
462#endif
463
464#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDEXTFLGBIT)
465int xfs_sb_version_addextflgbit(xfs_sb_t *sbp);
466#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) xfs_sb_version_addextflgbit(sbp)
467#else
468#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) \
469 ((sbp)->sb_versionnum = \
470 ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT))
471#endif
472
473#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBEXTFLGBIT)
474int xfs_sb_version_subextflgbit(xfs_sb_t *sbp);
475#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) xfs_sb_version_subextflgbit(sbp)
476#else
477#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) \
478 ((sbp)->sb_versionnum = \
479 ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT))
480#endif
481
482#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSECTOR)
483int xfs_sb_version_hassector(xfs_sb_t *sbp);
484#define XFS_SB_VERSION_HASSECTOR(sbp) xfs_sb_version_hassector(sbp)
485#else
486#define XFS_SB_VERSION_HASSECTOR(sbp) \
487 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
488 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT))
489#endif
490
491#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASMOREBITSBIT)
492int xfs_sb_version_hasmorebits(xfs_sb_t *sbp);
493#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp)
494#else
495#define XFS_SB_VERSION_HASMOREBITS(sbp) \
496 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
497 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT))
498#endif
499
500/*
501 * sb_features2 bit version macros.
502 *
503 * For example, a bit defined as XFS_SB_VERSION2_YBIT would have a macro:
504 *
505 * SB_VERSION_HASYBIT(xfs_sb_t *sbp)
506 * ((XFS_SB_VERSION_HASMOREBITS(sbp) &&
507 *  ((sbp)->sb_features2 & XFS_SB_VERSION2_YBIT))
508 */
509
510/*
511 * end of superblock version macros
512 */
513
514#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
515#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_BLOCK)
516xfs_agblock_t xfs_sb_block(struct xfs_mount *mp);
517#define XFS_SB_BLOCK(mp) xfs_sb_block(mp)
518#else
519#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
520#endif
521
522#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_HDR_BLOCK)
523xfs_agblock_t xfs_hdr_block(struct xfs_mount *mp, xfs_daddr_t d);
524#define XFS_HDR_BLOCK(mp,d) xfs_hdr_block(mp,d)
525#else
526#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp,d)))
527#endif
528#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_FSB)
529xfs_fsblock_t xfs_daddr_to_fsb(struct xfs_mount *mp, xfs_daddr_t d);
530#define XFS_DADDR_TO_FSB(mp,d) xfs_daddr_to_fsb(mp,d)
531#else
532#define XFS_DADDR_TO_FSB(mp,d) \
533 XFS_AGB_TO_FSB(mp, XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
534#endif
535#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DADDR)
536xfs_daddr_t xfs_fsb_to_daddr(struct xfs_mount *mp, xfs_fsblock_t fsbno);
537#define XFS_FSB_TO_DADDR(mp,fsbno) xfs_fsb_to_daddr(mp,fsbno)
538#else
539#define XFS_FSB_TO_DADDR(mp,fsbno) \
540 XFS_AGB_TO_DADDR(mp, XFS_FSB_TO_AGNO(mp,fsbno), \
541 XFS_FSB_TO_AGBNO(mp,fsbno))
542#endif
543
544#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBP)
545xfs_sb_t *xfs_buf_to_sbp(struct xfs_buf *bp);
546#define XFS_BUF_TO_SBP(bp) xfs_buf_to_sbp(bp)
547#else
548#define XFS_BUF_TO_SBP(bp) ((xfs_sb_t *)XFS_BUF_PTR(bp))
549#endif
550
551/*
552 * File system sector to basic block conversions.
553 */
554#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log)
555#define XFS_BB_TO_FSS(mp,bb) \
556 (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log)
557#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log)
558
559/*
560 * File system sector to byte conversions.
561 */
562#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog)
563#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog)
564
565/*
566 * File system block to basic block conversions.
567 */
568#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
569#define XFS_BB_TO_FSB(mp,bb) \
570 (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
571#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log)
572#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
573
574/*
575 * File system block to byte conversions.
576 */
577#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog)
578#define XFS_B_TO_FSB(mp,b) \
579 ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
580#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
581#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
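
/*
 * A worked example of the conversions above; the concrete numbers
 * assume a 4096-byte block size (sb_blocklog == 12, m_blkbb_log == 3,
 * since a basic block is 512 bytes):
 *
 * XFS_FSB_TO_BB(mp, 10) == 10 << 3 == 80 basic blocks
 * XFS_FSB_TO_B(mp, 10) == 10 << 12 == 40960 bytes
 * XFS_B_TO_FSB(mp, 4097) == 2 (rounds up)
 * XFS_B_TO_FSBT(mp, 4097) == 1 (truncates)
 */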
582
583#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
new file mode 100644
index 000000000000..3db0e2200775
--- /dev/null
+++ b/fs/xfs/xfs_trans.c
@@ -0,0 +1,1315 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_error.h"
46#include "xfs_trans_priv.h"
47#include "xfs_alloc_btree.h"
48#include "xfs_bmap_btree.h"
49#include "xfs_ialloc_btree.h"
50#include "xfs_btree.h"
51#include "xfs_ialloc.h"
52#include "xfs_alloc.h"
53#include "xfs_attr_sf.h"
54#include "xfs_dir_sf.h"
55#include "xfs_dir2_sf.h"
56#include "xfs_dinode.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_da_btree.h"
60#include "xfs_quota.h"
61#include "xfs_trans_space.h"
62
63
64STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
65STATIC uint xfs_trans_count_vecs(xfs_trans_t *);
66STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
67STATIC void xfs_trans_uncommit(xfs_trans_t *, uint);
68STATIC void xfs_trans_committed(xfs_trans_t *, int);
69STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
70STATIC void xfs_trans_free(xfs_trans_t *);
71
72kmem_zone_t *xfs_trans_zone;
73
74
75/*
76 * Initialize the precomputed transaction reservation values
77 * in the mount structure.
78 */
79void
80xfs_trans_init(
81 xfs_mount_t *mp)
82{
83 xfs_trans_reservations_t *resp;
84
85 resp = &(mp->m_reservations);
86 resp->tr_write =
87 (uint)(XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
88 resp->tr_itruncate =
89 (uint)(XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
90 resp->tr_rename =
91 (uint)(XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
92 resp->tr_link = (uint)XFS_CALC_LINK_LOG_RES(mp);
93 resp->tr_remove =
94 (uint)(XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
95 resp->tr_symlink =
96 (uint)(XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
97 resp->tr_create =
98 (uint)(XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
99 resp->tr_mkdir =
100 (uint)(XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
101 resp->tr_ifree =
102 (uint)(XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
103 resp->tr_ichange =
104 (uint)(XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
105 resp->tr_growdata = (uint)XFS_CALC_GROWDATA_LOG_RES(mp);
106 resp->tr_swrite = (uint)XFS_CALC_SWRITE_LOG_RES(mp);
107 resp->tr_writeid = (uint)XFS_CALC_WRITEID_LOG_RES(mp);
108 resp->tr_addafork =
109 (uint)(XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
110 resp->tr_attrinval = (uint)XFS_CALC_ATTRINVAL_LOG_RES(mp);
111 resp->tr_attrset =
112 (uint)(XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
113 resp->tr_attrrm =
114 (uint)(XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
115 resp->tr_clearagi = (uint)XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
116 resp->tr_growrtalloc = (uint)XFS_CALC_GROWRTALLOC_LOG_RES(mp);
117 resp->tr_growrtzero = (uint)XFS_CALC_GROWRTZERO_LOG_RES(mp);
118 resp->tr_growrtfree = (uint)XFS_CALC_GROWRTFREE_LOG_RES(mp);
119}
120
121/*
122 * This routine is called to allocate a transaction structure.
123 * The type parameter indicates the type of the transaction. These
124 * are enumerated in xfs_trans.h.
125 *
126 * Dynamically allocate the transaction structure from the transaction
127 * zone, initialize it, and return it to the caller.
128 */
129xfs_trans_t *
130xfs_trans_alloc(
131 xfs_mount_t *mp,
132 uint type)
133{
134 fs_check_frozen(XFS_MTOVFS(mp), SB_FREEZE_TRANS);
135 atomic_inc(&mp->m_active_trans);
136
137 return (_xfs_trans_alloc(mp, type));
138
139}
140
141xfs_trans_t *
142_xfs_trans_alloc(
143 xfs_mount_t *mp,
144 uint type)
145{
146 xfs_trans_t *tp;
147
148 ASSERT(xfs_trans_zone != NULL);
149 tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
150
151 /*
152 * Initialize the transaction structure.
153 */
154 tp->t_magic = XFS_TRANS_MAGIC;
155 tp->t_type = type;
156 tp->t_mountp = mp;
157 tp->t_items_free = XFS_LIC_NUM_SLOTS;
158 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
159 XFS_LIC_INIT(&(tp->t_items));
160 XFS_LBC_INIT(&(tp->t_busy));
161
162 return (tp);
163}
164
165/*
166 * This is called to create a new transaction which will share the
167 * permanent log reservation of the given transaction. The remaining
168 * unused block and rt extent reservations are also inherited. This
169 * implies that the original transaction is no longer allowed to allocate
170 * blocks. Locks and log items, however, are not inherited. They must
171 * be added to the new transaction explicitly.
172 */
173xfs_trans_t *
174xfs_trans_dup(
175 xfs_trans_t *tp)
176{
177 xfs_trans_t *ntp;
178
179 ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
180
181 /*
182 * Initialize the new transaction structure.
183 */
184 ntp->t_magic = XFS_TRANS_MAGIC;
185 ntp->t_type = tp->t_type;
186 ntp->t_mountp = tp->t_mountp;
187 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
188 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
189 XFS_LIC_INIT(&(ntp->t_items));
190 XFS_LBC_INIT(&(ntp->t_busy));
191
192 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
193
194#if defined(XLOG_NOLOG) || defined(DEBUG)
195 ASSERT(!xlog_debug || tp->t_ticket != NULL);
196#else
197 ASSERT(tp->t_ticket != NULL);
198#endif
199 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
200 ntp->t_ticket = tp->t_ticket;
201 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
202 tp->t_blk_res = tp->t_blk_res_used;
203 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
204 tp->t_rtx_res = tp->t_rtx_res_used;
205 PFLAGS_DUP(&tp->t_pflags, &ntp->t_pflags);
206
207 XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
208
209 atomic_inc(&tp->t_mountp->m_active_trans);
210 return ntp;
211}
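
/*
 * A hedged sketch of the "rolling transaction" pattern that
 * xfs_trans_dup() enables: duplicate the transaction, commit the old
 * half, and carry on in the new one under the same permanent log
 * reservation. The helper name is illustrative; on error the caller
 * is still responsible for cancelling the new transaction.
 */
STATIC int
example_roll_trans(
 xfs_trans_t **tpp)
{
 xfs_trans_t *tp = *tpp;
 xfs_trans_t *ntp;
 int error;

 ntp = xfs_trans_dup(tp);
 error = xfs_trans_commit(tp, 0, NULL);
 *tpp = ntp;
 /* the caller would now re-reserve log space via xfs_trans_reserve() */
 return error;
}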
212
213/*
214 * This is called to reserve free disk blocks and log space for the
215 * given transaction. This must be done before allocating any resources
216 * within the transaction.
217 *
218 * This will return ENOSPC if there are not enough blocks available.
219 * It will sleep waiting for available log space.
220 * The only valid value for the flags parameter is XFS_TRANS_PERM_LOG_RES,
221 * which is used by long-running transactions. If any one of the reservations
222 * fails then they will all be backed out.
223 *
224 * This does not do quota reservations. That typically is done by the
225 * caller afterwards.
226 */
227int
228xfs_trans_reserve(
229 xfs_trans_t *tp,
230 uint blocks,
231 uint logspace,
232 uint rtextents,
233 uint flags,
234 uint logcount)
235{
236 int log_flags;
237 int error;
238 int rsvd;
239
240 error = 0;
241 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
242
243 /* Mark this thread as being in a transaction */
244 PFLAGS_SET_FSTRANS(&tp->t_pflags);
245
246 /*
247 * Attempt to reserve the needed disk blocks by decrementing
248 * the number needed from the number available. This will
249 * fail if the count would go below zero.
250 */
251 if (blocks > 0) {
252 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
253 -blocks, rsvd);
254 if (error != 0) {
255 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
256 return (XFS_ERROR(ENOSPC));
257 }
258 tp->t_blk_res += blocks;
259 }
260
261 /*
262 * Reserve the log space needed for this transaction.
263 */
264 if (logspace > 0) {
265 ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
266 ASSERT((tp->t_log_count == 0) ||
267 (tp->t_log_count == logcount));
268 if (flags & XFS_TRANS_PERM_LOG_RES) {
269 log_flags = XFS_LOG_PERM_RESERV;
270 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
271 } else {
272 ASSERT(tp->t_ticket == NULL);
273 ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
274 log_flags = 0;
275 }
276
277 error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
278 &tp->t_ticket,
279 XFS_TRANSACTION, log_flags);
280 if (error) {
281 goto undo_blocks;
282 }
283 tp->t_log_res = logspace;
284 tp->t_log_count = logcount;
285 }
286
287 /*
288 * Attempt to reserve the needed realtime extents by decrementing
289 * the number needed from the number available. This will
290 * fail if the count would go below zero.
291 */
292 if (rtextents > 0) {
293 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
294 -rtextents, rsvd);
295 if (error) {
296 error = XFS_ERROR(ENOSPC);
297 goto undo_log;
298 }
299 tp->t_rtx_res += rtextents;
300 }
301
302 return 0;
303
304 /*
305 * Error cases jump to one of these labels to undo any
306 * reservations which have already been performed.
307 */
308undo_log:
309 if (logspace > 0) {
310 if (flags & XFS_TRANS_PERM_LOG_RES) {
311 log_flags = XFS_LOG_REL_PERM_RESERV;
312 } else {
313 log_flags = 0;
314 }
315 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
316 tp->t_ticket = NULL;
317 tp->t_log_res = 0;
318 tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
319 }
320
321undo_blocks:
322 if (blocks > 0) {
323 (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
324 blocks, rsvd);
325 tp->t_blk_res = 0;
326 }
327
328 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
329
330 return (error);
331}
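
/*
 * A minimal sketch of the allocate/reserve/commit lifecycle described
 * above; the transaction type and reservation figures are
 * illustrative. On failure xfs_trans_reserve() has already backed out
 * its own partial reservations, so the caller only needs to cancel
 * the transaction.
 */
STATIC int
example_simple_trans(
 xfs_mount_t *mp)
{
 xfs_trans_t *tp;
 int error;

 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
 error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), 0, 0, 0);
 if (error) {
  xfs_trans_cancel(tp, 0);
  return error;
 }
 /* ... join, modify and log items here ... */
 return xfs_trans_commit(tp, 0, NULL);
}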
332
333
334/*
335 * This is called to set a callback to be called when the given
336 * transaction is committed to disk. The transaction pointer and the
337 * argument pointer will be passed to the callback routine.
338 *
339 * Only one callback can be associated with any single transaction.
340 */
341void
342xfs_trans_callback(
343 xfs_trans_t *tp,
344 xfs_trans_callback_t callback,
345 void *arg)
346{
347 ASSERT(tp->t_callback == NULL);
348 tp->t_callback = callback;
349 tp->t_callarg = arg;
350}
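
/*
 * A hedged usage sketch for xfs_trans_callback(); the callback body
 * and the registration site are hypothetical, not from this file.
 */
STATIC void
example_commit_done(
 xfs_trans_t *tp,
 void *arg)
{
 /* runs once the transaction has been committed to disk */
}
/* at setup time: xfs_trans_callback(tp, example_commit_done, arg); */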
351
352
353/*
354 * Record the indicated change to the given field for application
355 * to the file system's superblock when the transaction commits.
356 * For now, just store the change in the transaction structure.
357 *
358 * Mark the transaction structure to indicate that the superblock
359 * needs to be updated before committing.
360 */
361void
362xfs_trans_mod_sb(
363 xfs_trans_t *tp,
364 uint field,
365 long delta)
366{
367
368 switch (field) {
369 case XFS_TRANS_SB_ICOUNT:
370 tp->t_icount_delta += delta;
371 break;
372 case XFS_TRANS_SB_IFREE:
373 tp->t_ifree_delta += delta;
374 break;
375 case XFS_TRANS_SB_FDBLOCKS:
376 /*
377 * Track the number of blocks allocated in the
378 * transaction. Make sure it does not exceed the
379 * number reserved.
380 */
381 if (delta < 0) {
382 tp->t_blk_res_used += (uint)-delta;
383 ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
384 }
385 tp->t_fdblocks_delta += delta;
386 break;
387 case XFS_TRANS_SB_RES_FDBLOCKS:
388 /*
389 * The allocation has already been applied to the
390 * in-core superblock's counter. This should only
391 * be applied to the on-disk superblock.
392 */
393 ASSERT(delta < 0);
394 tp->t_res_fdblocks_delta += delta;
395 break;
396 case XFS_TRANS_SB_FREXTENTS:
397 /*
398 * Track the number of realtime extents allocated in the
399 * transaction. Make sure it does not exceed the
400 * number reserved.
401 */
402 if (delta < 0) {
403 tp->t_rtx_res_used += (uint)-delta;
404 ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
405 }
406 tp->t_frextents_delta += delta;
407 break;
408 case XFS_TRANS_SB_RES_FREXTENTS:
409 /*
410 * The allocation has already been applied to the
411 * in-core superblock's counter. This should only
412 * be applied to the on-disk superblock.
413 */
414 ASSERT(delta < 0);
415 tp->t_res_frextents_delta += delta;
416 break;
417 case XFS_TRANS_SB_DBLOCKS:
418 ASSERT(delta > 0);
419 tp->t_dblocks_delta += delta;
420 break;
421 case XFS_TRANS_SB_AGCOUNT:
422 ASSERT(delta > 0);
423 tp->t_agcount_delta += delta;
424 break;
425 case XFS_TRANS_SB_IMAXPCT:
426 tp->t_imaxpct_delta += delta;
427 break;
428 case XFS_TRANS_SB_REXTSIZE:
429 tp->t_rextsize_delta += delta;
430 break;
431 case XFS_TRANS_SB_RBMBLOCKS:
432 tp->t_rbmblocks_delta += delta;
433 break;
434 case XFS_TRANS_SB_RBLOCKS:
435 tp->t_rblocks_delta += delta;
436 break;
437 case XFS_TRANS_SB_REXTENTS:
438 tp->t_rextents_delta += delta;
439 break;
440 case XFS_TRANS_SB_REXTSLOG:
441 tp->t_rextslog_delta += delta;
442 break;
443 default:
444 ASSERT(0);
445 return;
446 }
447
448 tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY);
449}
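
/*
 * A minimal sketch of xfs_trans_mod_sb() in use (the helper and the
 * delta are illustrative): debiting the free-block counter for blocks
 * allocated inside the transaction, which must stay within the blocks
 * reserved earlier by xfs_trans_reserve().
 */
STATIC void
example_charge_blocks(
 xfs_trans_t *tp,
 long nblocks)
{
 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, -nblocks);
}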
450
451/*
452 * xfs_trans_apply_sb_deltas() is called from the commit code
453 * to bring the superblock buffer into the current transaction
454 * and modify it as requested by earlier calls to xfs_trans_mod_sb().
455 *
456 * For now we just look at each field allowed to change and change
457 * it if necessary.
458 */
459STATIC void
460xfs_trans_apply_sb_deltas(
461 xfs_trans_t *tp)
462{
463 xfs_sb_t *sbp;
464 xfs_buf_t *bp;
465 int whole = 0;
466
467 bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
468 sbp = XFS_BUF_TO_SBP(bp);
469
470 /*
471 * Check that superblock mods match the mods made to AGF counters.
472 */
473 ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
474 (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
475 tp->t_ag_btree_delta));
476
477 if (tp->t_icount_delta != 0) {
478 INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta);
479 }
480 if (tp->t_ifree_delta != 0) {
481 INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta);
482 }
483
484 if (tp->t_fdblocks_delta != 0) {
485 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta);
486 }
487 if (tp->t_res_fdblocks_delta != 0) {
488 INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta);
489 }
490
491 if (tp->t_frextents_delta != 0) {
492 INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta);
493 }
494 if (tp->t_res_frextents_delta != 0) {
495 INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_res_frextents_delta);
496 }
497 if (tp->t_dblocks_delta != 0) {
498 INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta);
499 whole = 1;
500 }
501 if (tp->t_agcount_delta != 0) {
502 INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta);
503 whole = 1;
504 }
505 if (tp->t_imaxpct_delta != 0) {
506 INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta);
507 whole = 1;
508 }
509 if (tp->t_rextsize_delta != 0) {
510 INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta);
511 whole = 1;
512 }
513 if (tp->t_rbmblocks_delta != 0) {
514 INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta);
515 whole = 1;
516 }
517 if (tp->t_rblocks_delta != 0) {
518 INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta);
519 whole = 1;
520 }
521 if (tp->t_rextents_delta != 0) {
522 INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta);
523 whole = 1;
524 }
525 if (tp->t_rextslog_delta != 0) {
526 INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta);
527 whole = 1;
528 }
529
530 if (whole)
531 /*
532 * Log the whole thing; the fields are discontiguous.
533 */
534 xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1);
535 else
536 /*
537 * Since all the modifiable fields are contiguous, we
538 * can get away with this.
539 */
540 xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount),
541 offsetof(xfs_sb_t, sb_frextents) +
542 sizeof(sbp->sb_frextents) - 1);
543
544 XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1;
545}
546
547/*
548 * xfs_trans_unreserve_and_mod_sb() is called to release unused
549 * reservations and apply superblock counter changes to the in-core
550 * superblock.
551 *
552 * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
553 */
554void
555xfs_trans_unreserve_and_mod_sb(
556 xfs_trans_t *tp)
557{
558 xfs_mod_sb_t msb[14]; /* If you add cases, add entries */
559 xfs_mod_sb_t *msbp;
560 /* REFERENCED */
561 int error;
562 int rsvd;
563
564 msbp = msb;
565 rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
566
567 /*
568 * Release any reserved blocks. Any that were allocated
569 * will be taken back again by fdblocks_delta below.
570 */
571 if (tp->t_blk_res > 0) {
572 msbp->msb_field = XFS_SBS_FDBLOCKS;
573 msbp->msb_delta = tp->t_blk_res;
574 msbp++;
575 }
576
577 /*
578 * Release any reserved realtime extents. Any that were
579 * allocated will be taken back again by frextents_delta below.
580 */
581 if (tp->t_rtx_res > 0) {
582 msbp->msb_field = XFS_SBS_FREXTENTS;
583 msbp->msb_delta = tp->t_rtx_res;
584 msbp++;
585 }
586
587 /*
588 * Apply any superblock modifications to the in-core version.
589 * The t_res_fdblocks_delta and t_res_frextents_delta fields are
590 * explicitly NOT applied to the in-core superblock;
591 * those deltas have already been applied there.
592 */
593 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
594 if (tp->t_icount_delta != 0) {
595 msbp->msb_field = XFS_SBS_ICOUNT;
596 msbp->msb_delta = (int)tp->t_icount_delta;
597 msbp++;
598 }
599 if (tp->t_ifree_delta != 0) {
600 msbp->msb_field = XFS_SBS_IFREE;
601 msbp->msb_delta = (int)tp->t_ifree_delta;
602 msbp++;
603 }
604 if (tp->t_fdblocks_delta != 0) {
605 msbp->msb_field = XFS_SBS_FDBLOCKS;
606 msbp->msb_delta = (int)tp->t_fdblocks_delta;
607 msbp++;
608 }
609 if (tp->t_frextents_delta != 0) {
610 msbp->msb_field = XFS_SBS_FREXTENTS;
611 msbp->msb_delta = (int)tp->t_frextents_delta;
612 msbp++;
613 }
614 if (tp->t_dblocks_delta != 0) {
615 msbp->msb_field = XFS_SBS_DBLOCKS;
616 msbp->msb_delta = (int)tp->t_dblocks_delta;
617 msbp++;
618 }
619 if (tp->t_agcount_delta != 0) {
620 msbp->msb_field = XFS_SBS_AGCOUNT;
621 msbp->msb_delta = (int)tp->t_agcount_delta;
622 msbp++;
623 }
624 if (tp->t_imaxpct_delta != 0) {
625 msbp->msb_field = XFS_SBS_IMAX_PCT;
626 msbp->msb_delta = (int)tp->t_imaxpct_delta;
627 msbp++;
628 }
629 if (tp->t_rextsize_delta != 0) {
630 msbp->msb_field = XFS_SBS_REXTSIZE;
631 msbp->msb_delta = (int)tp->t_rextsize_delta;
632 msbp++;
633 }
634 if (tp->t_rbmblocks_delta != 0) {
635 msbp->msb_field = XFS_SBS_RBMBLOCKS;
636 msbp->msb_delta = (int)tp->t_rbmblocks_delta;
637 msbp++;
638 }
639 if (tp->t_rblocks_delta != 0) {
640 msbp->msb_field = XFS_SBS_RBLOCKS;
641 msbp->msb_delta = (int)tp->t_rblocks_delta;
642 msbp++;
643 }
644 if (tp->t_rextents_delta != 0) {
645 msbp->msb_field = XFS_SBS_REXTENTS;
646 msbp->msb_delta = (int)tp->t_rextents_delta;
647 msbp++;
648 }
649 if (tp->t_rextslog_delta != 0) {
650 msbp->msb_field = XFS_SBS_REXTSLOG;
651 msbp->msb_delta = (int)tp->t_rextslog_delta;
652 msbp++;
653 }
654 }
655
656 /*
657 * If we need to change anything, do it.
658 */
659 if (msbp > msb) {
660 error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
661 (uint)(msbp - msb), rsvd);
662 ASSERT(error == 0);
663 }
664}
665
666
667/*
668 * xfs_trans_commit
669 *
670 * Commit the given transaction to the log a/synchronously.
671 *
672 * The XFS disk error handling mechanism is not based on a typical
673 * transaction abort mechanism. Logically, after the filesystem
674 * gets marked 'SHUTDOWN', we can't let any new transactions
675 * be durable - ie. committed to disk - because some metadata might
676 * be inconsistent. In such cases, this returns an error, and the
677 * caller may assume that all locked objects joined to the transaction
678 * have already been unlocked as if the commit had succeeded.
679 * Do not reference the transaction structure after this call.
680 */
681 /*ARGSUSED*/
682int
683xfs_trans_commit(
684 xfs_trans_t *tp,
685 uint flags,
686 xfs_lsn_t *commit_lsn_p)
687{
688 xfs_log_iovec_t *log_vector;
689 int nvec;
690 xfs_mount_t *mp;
691 xfs_lsn_t commit_lsn;
692 /* REFERENCED */
693 int error;
694 int log_flags;
695 int sync;
696#define XFS_TRANS_LOGVEC_COUNT 16
697 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
698#if defined(XLOG_NOLOG) || defined(DEBUG)
699 static xfs_lsn_t trans_lsn = 1;
700#endif
701 void *commit_iclog;
702 int shutdown;
703
704 commit_lsn = -1;
705
706 /*
707 * Determine whether this commit is releasing a permanent
708 * log reservation or not.
709 */
710 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
711 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
712 log_flags = XFS_LOG_REL_PERM_RESERV;
713 } else {
714 log_flags = 0;
715 }
716 mp = tp->t_mountp;
717
718 /*
719 * If there is nothing to be logged by the transaction,
720 * then unlock all of the items associated with the
721 * transaction and free the transaction structure.
722 * Also make sure to return any reserved blocks to
723 * the free pool.
724 */
725shut_us_down:
726 shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
727 if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
728 xfs_trans_unreserve_and_mod_sb(tp);
 729		/*
 730		 * It is indeed possible for the transaction itself to be
 731		 * clean while its dqinfo portion is dirty. All that
 732		 * means is that we have some (non-persistent) quota
 733		 * reservations that need to be unreserved.
 734		 */
735 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
736 if (tp->t_ticket) {
737 commit_lsn = xfs_log_done(mp, tp->t_ticket,
738 NULL, log_flags);
739 if (commit_lsn == -1 && !shutdown)
740 shutdown = XFS_ERROR(EIO);
741 }
742 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
 743		xfs_trans_free_items(tp, shutdown ? XFS_TRANS_ABORT : 0);
744 xfs_trans_free_busy(tp);
745 xfs_trans_free(tp);
746 XFS_STATS_INC(xs_trans_empty);
747 if (commit_lsn_p)
748 *commit_lsn_p = commit_lsn;
749 return (shutdown);
750 }
751#if defined(XLOG_NOLOG) || defined(DEBUG)
752 ASSERT(!xlog_debug || tp->t_ticket != NULL);
753#else
754 ASSERT(tp->t_ticket != NULL);
755#endif
756
757 /*
758 * If we need to update the superblock, then do it now.
759 */
760 if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
761 xfs_trans_apply_sb_deltas(tp);
762 }
763 XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
764
765 /*
766 * Ask each log item how many log_vector entries it will
767 * need so we can figure out how many to allocate.
768 * Try to avoid the kmem_alloc() call in the common case
769 * by using a vector from the stack when it fits.
770 */
771 nvec = xfs_trans_count_vecs(tp);
772
773 if (nvec == 0) {
774 xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
775 goto shut_us_down;
776 }
777
778
779 if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
780 log_vector = log_vector_fast;
781 } else {
782 log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
783 sizeof(xfs_log_iovec_t),
784 KM_SLEEP);
785 }
786
787 /*
788 * Fill in the log_vector and pin the logged items, and
789 * then write the transaction to the log.
790 */
791 xfs_trans_fill_vecs(tp, log_vector);
792
 793	/*
 794	 * Ignore errors here; xfs_log_done would do the right thing
 795	 * anyway. We still need to put the ticket, etc. away.
 796	 */
797 error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket,
798 &(tp->t_lsn));
799
800#if defined(XLOG_NOLOG) || defined(DEBUG)
801 if (xlog_debug) {
802 commit_lsn = xfs_log_done(mp, tp->t_ticket,
803 &commit_iclog, log_flags);
804 } else {
805 commit_lsn = 0;
806 tp->t_lsn = trans_lsn++;
807 }
808#else
809 /*
810 * This is the regular case. At this point (after the call finishes),
811 * the transaction is committed incore and could go out to disk at
812 * any time. However, all the items associated with the transaction
813 * are still locked and pinned in memory.
814 */
815 commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
816#endif
817
818 tp->t_commit_lsn = commit_lsn;
819 if (nvec > XFS_TRANS_LOGVEC_COUNT) {
820 kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
821 }
822
823 if (commit_lsn_p)
824 *commit_lsn_p = commit_lsn;
825
 826	/*
 827	 * If we got a log write error, unpin the log items that we had
 828	 * pinned, clean up, free the trans structure, and return the error.
 829	 */
830 if (error || commit_lsn == -1) {
831 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
832 xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
833 return XFS_ERROR(EIO);
834 }
835
836 /*
837 * Once the transaction has committed, unused
838 * reservations need to be released and changes to
839 * the superblock need to be reflected in the in-core
840 * version. Do that now.
841 */
842 xfs_trans_unreserve_and_mod_sb(tp);
843
844 sync = tp->t_flags & XFS_TRANS_SYNC;
845
846 /*
847 * Tell the LM to call the transaction completion routine
848 * when the log write with LSN commit_lsn completes (e.g.
849 * when the transaction commit really hits the on-disk log).
850 * After this call we cannot reference tp, because the call
851 * can happen at any time and the call will free the transaction
852 * structure pointed to by tp. The only case where we call
853 * the completion routine (xfs_trans_committed) directly is
854 * if the log is turned off on a debug kernel or we're
855 * running in simulation mode (the log is explicitly turned
856 * off).
857 */
858 tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
859 tp->t_logcb.cb_arg = tp;
860
861 /*
862 * We need to pass the iclog buffer which was used for the
863 * transaction commit record into this function, and attach
864 * the callback to it. The callback must be attached before
865 * the items are unlocked to avoid racing with other threads
866 * waiting for an item to unlock.
867 */
868 shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));
869
870 /*
871 * Mark this thread as no longer being in a transaction
872 */
873 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
874
875 /*
876 * Once all the items of the transaction have been copied
877 * to the in core log and the callback is attached, the
878 * items can be unlocked.
879 *
880 * This will free descriptors pointing to items which were
881 * not logged since there is nothing more to do with them.
882 * For items which were logged, we will keep pointers to them
883 * so they can be unpinned after the transaction commits to disk.
884 * This will also stamp each modified meta-data item with
885 * the commit lsn of this transaction for dependency tracking
886 * purposes.
887 */
888 xfs_trans_unlock_items(tp, commit_lsn);
889
890 /*
891 * If we detected a log error earlier, finish committing
892 * the transaction now (unpin log items, etc).
893 *
894 * Order is critical here, to avoid using the transaction
 895	 * pointer after it's been freed (by xfs_trans_committed
896 * either here now, or as a callback). We cannot do this
897 * step inside xfs_log_notify as was done earlier because
898 * of this issue.
899 */
900 if (shutdown)
901 xfs_trans_committed(tp, XFS_LI_ABORTED);
902
903 /*
904 * Now that the xfs_trans_committed callback has been attached,
 905	 * and the items are released, we can finally allow the iclog to
906 * go to disk.
907 */
908 error = xfs_log_release_iclog(mp, commit_iclog);
909
910 /*
911 * If the transaction needs to be synchronous, then force the
912 * log out now and wait for it.
913 */
914 if (sync) {
915 if (!error)
916 error = xfs_log_force(mp, commit_lsn,
917 XFS_LOG_FORCE | XFS_LOG_SYNC);
918 XFS_STATS_INC(xs_trans_sync);
919 } else {
920 XFS_STATS_INC(xs_trans_async);
921 }
922
923 return (error);
924}
925
926
927/*
928 * Total up the number of log iovecs needed to commit this
929 * transaction. The transaction itself needs one for the
930 * transaction header. Ask each dirty item in turn how many
931 * it needs to get the total.
932 */
933STATIC uint
934xfs_trans_count_vecs(
935 xfs_trans_t *tp)
936{
937 int nvecs;
938 xfs_log_item_desc_t *lidp;
939
940 nvecs = 1;
941 lidp = xfs_trans_first_item(tp);
942 ASSERT(lidp != NULL);
943
 944	/*
 945	 * In the non-debug case we need to bail out if we didn't find
 946	 * a log item here: return zero and let trans_commit deal with it.
 947	 */
948 if (lidp == NULL)
949 return 0;
950
951 while (lidp != NULL) {
952 /*
953 * Skip items which aren't dirty in this transaction.
954 */
955 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
956 lidp = xfs_trans_next_item(tp, lidp);
957 continue;
958 }
959 lidp->lid_size = IOP_SIZE(lidp->lid_item);
960 nvecs += lidp->lid_size;
961 lidp = xfs_trans_next_item(tp, lidp);
962 }
963
964 return nvecs;
965}
966
967/*
968 * Called from the trans_commit code when we notice that
969 * the filesystem is in the middle of a forced shutdown.
970 */
971STATIC void
972xfs_trans_uncommit(
973 xfs_trans_t *tp,
974 uint flags)
975{
976 xfs_log_item_desc_t *lidp;
977
978 for (lidp = xfs_trans_first_item(tp);
979 lidp != NULL;
980 lidp = xfs_trans_next_item(tp, lidp)) {
 981		/*
 982		 * Unpin only the dirty items.
 983		 */
984 if (lidp->lid_flags & XFS_LID_DIRTY)
985 IOP_UNPIN_REMOVE(lidp->lid_item, tp);
986 }
987
988 xfs_trans_unreserve_and_mod_sb(tp);
989 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
990
991 xfs_trans_free_items(tp, flags);
992 xfs_trans_free_busy(tp);
993 xfs_trans_free(tp);
994}
995
996/*
997 * Fill in the vector with pointers to data to be logged
998 * by this transaction. The transaction header takes
999 * the first vector, and then each dirty item takes the
1000 * number of vectors it indicated it needed in xfs_trans_count_vecs().
1001 *
1002 * As each item fills in the entries it needs, also pin the item
1003 * so that it cannot be flushed out until the log write completes.
1004 */
1005STATIC void
1006xfs_trans_fill_vecs(
1007 xfs_trans_t *tp,
1008 xfs_log_iovec_t *log_vector)
1009{
1010 xfs_log_item_desc_t *lidp;
1011 xfs_log_iovec_t *vecp;
1012 uint nitems;
1013
1014 /*
1015 * Skip over the entry for the transaction header, we'll
1016 * fill that in at the end.
1017 */
1018 vecp = log_vector + 1; /* pointer arithmetic */
1019
1020 nitems = 0;
1021 lidp = xfs_trans_first_item(tp);
1022 ASSERT(lidp != NULL);
1023 while (lidp != NULL) {
1024 /*
1025 * Skip items which aren't dirty in this transaction.
1026 */
1027 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1028 lidp = xfs_trans_next_item(tp, lidp);
1029 continue;
1030 }
1031		/*
1032		 * The item may be marked dirty but not log anything.
1033		 * This can be used to arrange for the item's committed
1034		 * routine to be called when the transaction commits.
1035		 */
1036 if (lidp->lid_size) {
1037 nitems++;
1038 }
1039 IOP_FORMAT(lidp->lid_item, vecp);
1040 vecp += lidp->lid_size; /* pointer arithmetic */
1041 IOP_PIN(lidp->lid_item);
1042 lidp = xfs_trans_next_item(tp, lidp);
1043 }
1044
1045 /*
1046 * Now that we've counted the number of items in this
1047 * transaction, fill in the transaction header.
1048 */
1049 tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
1050 tp->t_header.th_type = tp->t_type;
1051 tp->t_header.th_num_items = nitems;
1052 log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
1053 log_vector->i_len = sizeof(xfs_trans_header_t);
1054}
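
/*
 * Example (not part of the original source): a self-contained sketch of
 * the two-pass iovec pattern used by xfs_trans_count_vecs() and
 * xfs_trans_fill_vecs() above - first ask each item how many vectors it
 * needs, then allocate (or use an on-stack array in the common case)
 * and fill them in, skipping non-dirty items both times.  The
 * demo_item and demo_iovec names are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_iovec { void *addr; int len; };
struct demo_item  { int dirty; int nvecs; char data[16]; struct demo_item *next; };

#define DEMO_FAST_VECS	4

static int demo_count_vecs(struct demo_item *list)
{
	int n = 1;			/* one vector for the header */

	for (; list; list = list->next)
		if (list->dirty)
			n += list->nvecs;
	return n;
}

static void demo_fill_vecs(struct demo_item *list, struct demo_iovec *vec)
{
	struct demo_iovec *vp = vec + 1;	/* slot 0 is the header */

	for (; list; list = list->next) {
		if (!list->dirty)
			continue;
		vp->addr = list->data;
		vp->len = sizeof(list->data);
		vp += list->nvecs;
	}
}

int main(void)
{
	struct demo_item b = { 1, 1, "beta", NULL };
	struct demo_item a = { 0, 1, "alpha", &b };
	struct demo_iovec fast[DEMO_FAST_VECS], *vec = fast;
	int nvecs = demo_count_vecs(&a);

	if (nvecs > DEMO_FAST_VECS)	/* fall back to the heap, as above */
		vec = malloc(nvecs * sizeof(*vec));
	demo_fill_vecs(&a, vec);
	printf("nvecs=%d first=%s\n", nvecs, (char *)vec[1].addr);
	if (vec != fast)
		free(vec);
	return 0;
}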
1055
1056
1057/*
1058 * Unlock all of the transaction's items and free the transaction.
1059 * The transaction must not have modified any of its items, because
1060 * there is no way to restore them to their previous state.
1061 *
1062 * If the transaction has made a log reservation, make sure to release
1063 * it as well.
1064 */
1065void
1066xfs_trans_cancel(
1067 xfs_trans_t *tp,
1068 int flags)
1069{
1070 int log_flags;
1071#ifdef DEBUG
1072 xfs_log_item_chunk_t *licp;
1073 xfs_log_item_desc_t *lidp;
1074 xfs_log_item_t *lip;
1075 int i;
1076#endif
1077
1078 /*
1079 * See if the caller is being too lazy to figure out if
1080 * the transaction really needs an abort.
1081 */
1082 if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
1083 flags &= ~XFS_TRANS_ABORT;
1084 /*
1085 * See if the caller is relying on us to shut down the
1086 * filesystem. This happens in paths where we detect
1087 * corruption and decide to give up.
1088 */
1089 if ((tp->t_flags & XFS_TRANS_DIRTY) &&
1090 !XFS_FORCED_SHUTDOWN(tp->t_mountp))
1091 xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE);
1092#ifdef DEBUG
1093 if (!(flags & XFS_TRANS_ABORT)) {
1094 licp = &(tp->t_items);
1095 while (licp != NULL) {
1096 lidp = licp->lic_descs;
1097 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1098 if (XFS_LIC_ISFREE(licp, i)) {
1099 continue;
1100 }
1101
1102 lip = lidp->lid_item;
1103 if (!XFS_FORCED_SHUTDOWN(tp->t_mountp))
1104 ASSERT(!(lip->li_type == XFS_LI_EFD));
1105 }
1106 licp = licp->lic_next;
1107 }
1108 }
1109#endif
1110 xfs_trans_unreserve_and_mod_sb(tp);
1111 XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
1112
1113 if (tp->t_ticket) {
1114 if (flags & XFS_TRANS_RELEASE_LOG_RES) {
1115 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1116 log_flags = XFS_LOG_REL_PERM_RESERV;
1117 } else {
1118 log_flags = 0;
1119 }
1120 xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags);
1121 }
1122
1123 /* mark this thread as no longer being in a transaction */
1124 PFLAGS_RESTORE_FSTRANS(&tp->t_pflags);
1125
1126 xfs_trans_free_items(tp, flags);
1127 xfs_trans_free_busy(tp);
1128 xfs_trans_free(tp);
1129}
1130
1131
1132/*
1133 * Free the transaction structure. If there is more clean up
1134 * to do when the structure is freed, add it here.
1135 */
1136STATIC void
1137xfs_trans_free(
1138 xfs_trans_t *tp)
1139{
1140 atomic_dec(&tp->t_mountp->m_active_trans);
1141 XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
1142 kmem_zone_free(xfs_trans_zone, tp);
1143}
1144
1145
1146/*
1147 * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
1148 *
1149 * This is typically called by the LM when a transaction has been fully
1150 * committed to disk. It needs to unpin the items which have
1151 * been logged by the transaction and update their positions
1152 * in the AIL if necessary.
1153 * This also gets called when the transaction didn't get written out
1154 * because of an I/O error; in that case abortflag has XFS_LI_ABORTED set.
1155 *
1156 * Call xfs_trans_chunk_committed() to process the items in
1157 * each chunk.
1158 */
1159STATIC void
1160xfs_trans_committed(
1161 xfs_trans_t *tp,
1162 int abortflag)
1163{
1164 xfs_log_item_chunk_t *licp;
1165 xfs_log_item_chunk_t *next_licp;
1166 xfs_log_busy_chunk_t *lbcp;
1167 xfs_log_busy_slot_t *lbsp;
1168 int i;
1169
1170 /*
1171 * Call the transaction's completion callback if there
1172 * is one.
1173 */
1174 if (tp->t_callback != NULL) {
1175 tp->t_callback(tp, tp->t_callarg);
1176 }
1177
1178 /*
1179 * Special case the chunk embedded in the transaction.
1180 */
1181 licp = &(tp->t_items);
1182 if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
1183 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1184 }
1185
1186 /*
1187 * Process the items in each chunk in turn.
1188 */
1189 licp = licp->lic_next;
1190 while (licp != NULL) {
1191 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
1192 xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
1193 next_licp = licp->lic_next;
1194 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
1195 licp = next_licp;
1196 }
1197
1198 /*
1199 * Clear all the per-AG busy list items listed in this transaction
1200 */
1201 lbcp = &tp->t_busy;
1202 while (lbcp != NULL) {
1203 for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
1204 if (!XFS_LBC_ISFREE(lbcp, i)) {
1205 xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
1206 lbsp->lbc_idx);
1207 }
1208 }
1209 lbcp = lbcp->lbc_next;
1210 }
1211 xfs_trans_free_busy(tp);
1212
1213 /*
1214 * That's it for the transaction structure. Free it.
1215 */
1216 xfs_trans_free(tp);
1217}
1218
1219/*
1220 * This is called to perform the commit processing for each
1221 * item described by the given chunk.
1222 *
1223 * The commit processing consists of unlocking items which were
1224 * held locked with the SYNC_UNLOCK attribute, calling the committed
1225 * routine of each logged item, updating the item's position in the AIL
1226 * if necessary, and unpinning each item. If the committed routine
1227 * returns -1, then do nothing further with the item because it
1228 * may have been freed.
1229 *
1230 * Since items are unlocked when they are copied to the incore
1231 * log, it is possible for two transactions to be completing
1232 * and manipulating the same item simultaneously. The AIL lock
1233 * will protect the lsn field of each item. The value of this
1234 * field can never go backwards.
1235 *
1236 * We unpin the items after repositioning them in the AIL, because
1237 * otherwise they could be immediately flushed and we'd have to race
1238 * with the flusher trying to pull the item from the AIL as we add it.
1239 */
1240STATIC void
1241xfs_trans_chunk_committed(
1242 xfs_log_item_chunk_t *licp,
1243 xfs_lsn_t lsn,
1244 int aborted)
1245{
1246 xfs_log_item_desc_t *lidp;
1247 xfs_log_item_t *lip;
1248 xfs_lsn_t item_lsn;
1249 struct xfs_mount *mp;
1250 int i;
1251 SPLDECL(s);
1252
1253 lidp = licp->lic_descs;
1254 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1255 if (XFS_LIC_ISFREE(licp, i)) {
1256 continue;
1257 }
1258
1259 lip = lidp->lid_item;
1260 if (aborted)
1261 lip->li_flags |= XFS_LI_ABORTED;
1262
1263 /*
1264 * Send in the ABORTED flag to the COMMITTED routine
1265 * so that it knows whether the transaction was aborted
1266 * or not.
1267 */
1268 item_lsn = IOP_COMMITTED(lip, lsn);
1269
1270 /*
1271 * If the committed routine returns -1, make
1272 * no more references to the item.
1273 */
1274 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
1275 continue;
1276 }
1277
1278 /*
1279 * If the returned lsn is greater than what it
1280 * contained before, update the location of the
1281 * item in the AIL. If it is not, then do nothing.
1282 * Items can never move backwards in the AIL.
1283 *
1284 * While the new lsn should usually be greater, it
1285 * is possible that a later transaction completing
1286 * simultaneously with an earlier one using the
1287 * same item could complete first with a higher lsn.
1288 * This would cause the earlier transaction to fail
1289 * the test below.
1290 */
1291 mp = lip->li_mountp;
1292 AIL_LOCK(mp,s);
1293 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1294 /*
1295 * This will set the item's lsn to item_lsn
1296 * and update the position of the item in
1297 * the AIL.
1298 *
1299 * xfs_trans_update_ail() drops the AIL lock.
1300 */
1301 xfs_trans_update_ail(mp, lip, item_lsn, s);
1302 } else {
1303 AIL_UNLOCK(mp, s);
1304 }
1305
1306 /*
1307 * Now that we've repositioned the item in the AIL,
1308 * unpin it so it can be flushed. Pass information
1309 * about buffer stale state down from the log item
1310		 * flags; if anyone else stales the buffer we do not
1311 * want to pay any attention to it.
1312 */
1313 IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE);
1314 }
1315}
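
/*
 * Example (not part of the original source): the "LSN never moves
 * backwards" rule enforced above, reduced to a tiny compare-and-update
 * helper.  Two completions can race on the same item, so the update is
 * only applied when the new LSN is strictly greater; in the kernel this
 * comparison happens under the AIL lock.  demo_lsn_cmp is a hypothetical
 * stand-in for XFS_LSN_CMP over plain 64-bit values.
 */
#include <stdint.h>
#include <stdio.h>

static int demo_lsn_cmp(uint64_t a, uint64_t b)
{
	return (a > b) - (a < b);
}

static void demo_update_lsn(uint64_t *item_lsn, uint64_t new_lsn)
{
	if (demo_lsn_cmp(new_lsn, *item_lsn) > 0)
		*item_lsn = new_lsn;	/* move forward */
	/* else: a later transaction already advanced it; do nothing */
}

int main(void)
{
	uint64_t lsn = 0;

	demo_update_lsn(&lsn, 7);	/* advances to 7 */
	demo_update_lsn(&lsn, 5);	/* ignored: would move backwards */
	printf("%llu\n", (unsigned long long)lsn);	/* prints 7 */
	return 0;
}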
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000000..bd37ccb85e76
--- /dev/null
+++ b/fs/xfs/xfs_trans.h
@@ -0,0 +1,1042 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_H__
33#define __XFS_TRANS_H__
34
35/*
36 * This is the structure written in the log at the head of
37 * every transaction. It identifies the type and id of the
38 * transaction, and contains the number of items logged by
39 * the transaction so we know how many to expect during recovery.
40 *
 41 * Do not change the structure below without redoing the code in
42 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
43 */
44typedef struct xfs_trans_header {
45 uint th_magic; /* magic number */
46 uint th_type; /* transaction type */
47 __int32_t th_tid; /* transaction id (unused) */
48 uint th_num_items; /* num items logged by trans */
49} xfs_trans_header_t;
50
51#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
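
/*
 * Example (not part of the original source): the magic value above is
 * just the ASCII bytes 'T' 'R' 'A' 'N' packed into a 32-bit word, which
 * is what log recovery matches at the head of each logged transaction.
 * A quick self-contained way to see that:
 */
#include <stdio.h>

int main(void)
{
	unsigned int magic = 0x5452414e;	/* XFS_TRANS_HEADER_MAGIC */

	printf("%c%c%c%c\n",			/* prints "TRAN" */
	       (magic >> 24) & 0xff, (magic >> 16) & 0xff,
	       (magic >> 8) & 0xff, magic & 0xff);
	return 0;
}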
52
53/*
54 * Log item types.
55 */
56#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */
57#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */
58#define XFS_LI_EFI 0x1236
59#define XFS_LI_EFD 0x1237
60#define XFS_LI_IUNLINK 0x1238
61#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */
62#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */
63#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
64#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
65#define XFS_LI_DQUOT 0x123d
66#define XFS_LI_QUOTAOFF 0x123e
67
68/*
69 * Transaction types. Used to distinguish types of buffers.
70 */
71#define XFS_TRANS_SETATTR_NOT_SIZE 1
72#define XFS_TRANS_SETATTR_SIZE 2
73#define XFS_TRANS_INACTIVE 3
74#define XFS_TRANS_CREATE 4
75#define XFS_TRANS_CREATE_TRUNC 5
76#define XFS_TRANS_TRUNCATE_FILE 6
77#define XFS_TRANS_REMOVE 7
78#define XFS_TRANS_LINK 8
79#define XFS_TRANS_RENAME 9
80#define XFS_TRANS_MKDIR 10
81#define XFS_TRANS_RMDIR 11
82#define XFS_TRANS_SYMLINK 12
83#define XFS_TRANS_SET_DMATTRS 13
84#define XFS_TRANS_GROWFS 14
85#define XFS_TRANS_STRAT_WRITE 15
86#define XFS_TRANS_DIOSTRAT 16
87#define XFS_TRANS_WRITE_SYNC 17
88#define XFS_TRANS_WRITEID 18
89#define XFS_TRANS_ADDAFORK 19
90#define XFS_TRANS_ATTRINVAL 20
91#define XFS_TRANS_ATRUNCATE 21
92#define XFS_TRANS_ATTR_SET 22
93#define XFS_TRANS_ATTR_RM 23
94#define XFS_TRANS_ATTR_FLAG 24
95#define XFS_TRANS_CLEAR_AGI_BUCKET 25
96#define XFS_TRANS_QM_SBCHANGE 26
97/*
98 * Dummy entries since we use the transaction type to index into the
99 * trans_type[] in xlog_recover_print_trans_head()
100 */
101#define XFS_TRANS_DUMMY1 27
102#define XFS_TRANS_DUMMY2 28
103#define XFS_TRANS_QM_QUOTAOFF 29
104#define XFS_TRANS_QM_DQALLOC 30
105#define XFS_TRANS_QM_SETQLIM 31
106#define XFS_TRANS_QM_DQCLUSTER 32
107#define XFS_TRANS_QM_QINOCREATE 33
108#define XFS_TRANS_QM_QUOTAOFF_END 34
109#define XFS_TRANS_SB_UNIT 35
110#define XFS_TRANS_FSYNC_TS 36
111#define XFS_TRANS_GROWFSRT_ALLOC 37
112#define XFS_TRANS_GROWFSRT_ZERO 38
113#define XFS_TRANS_GROWFSRT_FREE 39
114#define XFS_TRANS_SWAPEXT 40
115/* new transaction types need to be reflected in xfs_logprint(8) */
116
117
118#ifdef __KERNEL__
119struct xfs_buf;
120struct xfs_buftarg;
121struct xfs_efd_log_item;
122struct xfs_efi_log_item;
123struct xfs_inode;
124struct xfs_item_ops;
125struct xfs_log_iovec;
126struct xfs_log_item;
127struct xfs_log_item_desc;
128struct xfs_mount;
129struct xfs_trans;
130struct xfs_dquot_acct;
131
132typedef struct xfs_ail_entry {
133 struct xfs_log_item *ail_forw; /* AIL forw pointer */
134 struct xfs_log_item *ail_back; /* AIL back pointer */
135} xfs_ail_entry_t;
136
137/*
138 * This structure is passed as a parameter to xfs_trans_push_ail()
 139 * and is used to track the LSN that the waiting processes are
 140 * waiting on to become unused.
141 */
142typedef struct xfs_ail_ticket {
 143	xfs_lsn_t	at_lsn;		/* lsn waited for */
144 struct xfs_ail_ticket *at_forw; /* wait list ptr */
145 struct xfs_ail_ticket *at_back; /* wait list ptr */
146 sv_t at_sema; /* wait sema */
147} xfs_ail_ticket_t;
148
149
150typedef struct xfs_log_item {
151 xfs_ail_entry_t li_ail; /* AIL pointers */
152 xfs_lsn_t li_lsn; /* last on-disk lsn */
153 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
154 struct xfs_mount *li_mountp; /* ptr to fs mount */
155 uint li_type; /* item type */
156 uint li_flags; /* misc flags */
157 struct xfs_log_item *li_bio_list; /* buffer item list */
158 void (*li_cb)(struct xfs_buf *,
159 struct xfs_log_item *);
160 /* buffer item iodone */
161 /* callback func */
162 struct xfs_item_ops *li_ops; /* function list */
163} xfs_log_item_t;
164
165#define XFS_LI_IN_AIL 0x1
166#define XFS_LI_ABORTED 0x2
167
168typedef struct xfs_item_ops {
169 uint (*iop_size)(xfs_log_item_t *);
170 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
171 void (*iop_pin)(xfs_log_item_t *);
172 void (*iop_unpin)(xfs_log_item_t *, int);
173 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
174 uint (*iop_trylock)(xfs_log_item_t *);
175 void (*iop_unlock)(xfs_log_item_t *);
176 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
177 void (*iop_push)(xfs_log_item_t *);
178 void (*iop_abort)(xfs_log_item_t *);
179 void (*iop_pushbuf)(xfs_log_item_t *);
180 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
181} xfs_item_ops_t;
182
183#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
184#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
185#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
186#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
187#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
188#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
189#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
190#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
191#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
192#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip)
193#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
194#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
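
/*
 * Example (not part of the original source): the IOP_*() macros above
 * are plain C "vtable" dispatch - each log item carries a pointer to an
 * ops structure and the macros indirect through it.  A minimal
 * self-contained version of the same idiom, with hypothetical names:
 */
#include <stdio.h>

struct demo_item;

struct demo_item_ops {
	unsigned int (*iop_size)(struct demo_item *);
};

struct demo_item {
	struct demo_item_ops *li_ops;
	unsigned int nbytes;
};

#define DEMO_IOP_SIZE(ip)	(*(ip)->li_ops->iop_size)(ip)

static unsigned int demo_buf_size(struct demo_item *ip)
{
	return ip->nbytes;
}

static struct demo_item_ops demo_buf_ops = { demo_buf_size };

int main(void)
{
	struct demo_item item = { &demo_buf_ops, 128 };

	printf("%u\n", DEMO_IOP_SIZE(&item));	/* dispatches to demo_buf_size */
	return 0;
}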
195
196/*
197 * Return values for the IOP_TRYLOCK() routines.
198 */
199#define XFS_ITEM_SUCCESS 0
200#define XFS_ITEM_PINNED 1
201#define XFS_ITEM_LOCKED 2
202#define XFS_ITEM_FLUSHING 3
203#define XFS_ITEM_PUSHBUF 4
204
205#endif /* __KERNEL__ */
206
207/*
208 * This structure is used to track log items associated with
209 * a transaction. It points to the log item and keeps some
210 * flags to track the state of the log item. It also tracks
211 * the amount of space needed to log the item it describes
212 * once we get to commit processing (see xfs_trans_commit()).
213 */
214typedef struct xfs_log_item_desc {
215 xfs_log_item_t *lid_item;
216 ushort lid_size;
217 unsigned char lid_flags;
218 unsigned char lid_index;
219} xfs_log_item_desc_t;
220
221#define XFS_LID_DIRTY 0x1
222#define XFS_LID_PINNED 0x2
223#define XFS_LID_BUF_STALE 0x8
224
225/*
226 * This structure is used to maintain a chunk list of log_item_desc
227 * structures. The free field is a bitmask indicating which descriptors
 228 * in this chunk's array are free. The unused field is the index of the
 229 * first slot that has never been used since this chunk was allocated.
230 */
231#define XFS_LIC_NUM_SLOTS 15
232typedef struct xfs_log_item_chunk {
233 struct xfs_log_item_chunk *lic_next;
234 ushort lic_free;
235 ushort lic_unused;
236 xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS];
237} xfs_log_item_chunk_t;
238
239#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1)
240#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1)
241
242
243/*
244 * Initialize the given chunk. Set the chunk's free descriptor mask
245 * to indicate that all descriptors are free. The caller gets to set
246 * lic_unused to the right value (0 matches all free). The
247 * lic_descs.lid_index values are set up as each desc is allocated.
248 */
249#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT)
250void xfs_lic_init(xfs_log_item_chunk_t *cp);
251#define XFS_LIC_INIT(cp) xfs_lic_init(cp)
252#else
253#define XFS_LIC_INIT(cp) ((cp)->lic_free = XFS_LIC_FREEMASK)
254#endif
255#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT_SLOT)
256void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot);
257#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot)
258#else
259#define XFS_LIC_INIT_SLOT(cp,slot) \
260 ((cp)->lic_descs[slot].lid_index = (unsigned char)(slot))
261#endif
262#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_VACANCY)
263int xfs_lic_vacancy(xfs_log_item_chunk_t *cp);
264#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp)
265#else
266#define XFS_LIC_VACANCY(cp) (((cp)->lic_free) & XFS_LIC_FREEMASK)
267#endif
268#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ALL_FREE)
269void xfs_lic_all_free(xfs_log_item_chunk_t *cp);
270#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp)
271#else
272#define XFS_LIC_ALL_FREE(cp) ((cp)->lic_free = XFS_LIC_FREEMASK)
273#endif
274#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ARE_ALL_FREE)
275int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp);
276#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp)
277#else
278#define XFS_LIC_ARE_ALL_FREE(cp) (((cp)->lic_free & XFS_LIC_FREEMASK) ==\
279 XFS_LIC_FREEMASK)
280#endif
281#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ISFREE)
282int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot);
283#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
284#else
285#define XFS_LIC_ISFREE(cp,slot) ((cp)->lic_free & (1 << (slot)))
286#endif
287#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_CLAIM)
288void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot);
289#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot)
290#else
291#define XFS_LIC_CLAIM(cp,slot) ((cp)->lic_free &= ~(1 << (slot)))
292#endif
293#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_RELSE)
294void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot);
295#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot)
296#else
297#define XFS_LIC_RELSE(cp,slot) ((cp)->lic_free |= 1 << (slot))
298#endif
299#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_SLOT)
300xfs_log_item_desc_t *xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot);
301#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot)
302#else
303#define XFS_LIC_SLOT(cp,slot) (&((cp)->lic_descs[slot]))
304#endif
305#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_SLOT)
306int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp);
307#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp)
308#else
309#define XFS_LIC_DESC_TO_SLOT(dp) ((uint)((dp)->lid_index))
310#endif
311/*
312 * Calculate the address of a chunk given a descriptor pointer:
 313 * dp - dp->lid_index gives the address of the start of the lic_descs array.
314 * From this we subtract the offset of the lic_descs field in a chunk.
315 * All of this yields the address of the chunk, which is
316 * cast to a chunk pointer.
317 */
318#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_CHUNK)
319xfs_log_item_chunk_t *xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp);
320#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp)
321#else
322#define XFS_LIC_DESC_TO_CHUNK(dp) ((xfs_log_item_chunk_t*) \
323 (((xfs_caddr_t)((dp) - (dp)->lid_index)) -\
324 (xfs_caddr_t)(((xfs_log_item_chunk_t*) \
325 0)->lic_descs)))
326#endif
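
/*
 * Example (not part of the original source): XFS_LIC_DESC_TO_CHUNK()
 * above recovers the enclosing chunk from a descriptor pointer by
 * stepping back lid_index slots and then subtracting the offset of the
 * descriptor array - the classic container_of idiom, spelled with a
 * null-pointer cast.  The same computation with offsetof(), using
 * hypothetical demo_desc/demo_chunk types:
 */
#include <stddef.h>
#include <stdio.h>

struct demo_desc  { unsigned char index; };
struct demo_chunk { struct demo_chunk *next; struct demo_desc descs[15]; };

static struct demo_chunk *demo_desc_to_chunk(struct demo_desc *dp)
{
	/* back up to descs[0], then to the start of the chunk */
	return (struct demo_chunk *)((char *)(dp - dp->index) -
				     offsetof(struct demo_chunk, descs));
}

int main(void)
{
	struct demo_chunk chunk;
	int i;

	for (i = 0; i < 15; i++)
		chunk.descs[i].index = (unsigned char)i;
	printf("%d\n", demo_desc_to_chunk(&chunk.descs[7]) == &chunk);
	return 0;
}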
327
328#ifdef __KERNEL__
329/*
330 * This structure is used to maintain a list of block ranges that have been
331 * freed in the transaction. The ranges are listed in the perag[] busy list
332 * between when they're freed and the transaction is committed to disk.
333 */
334
335typedef struct xfs_log_busy_slot {
336 xfs_agnumber_t lbc_ag;
337 ushort lbc_idx; /* index in perag.busy[] */
338} xfs_log_busy_slot_t;
339
340#define XFS_LBC_NUM_SLOTS 31
341typedef struct xfs_log_busy_chunk {
342 struct xfs_log_busy_chunk *lbc_next;
343 uint lbc_free; /* bitmask of free slots */
344 ushort lbc_unused; /* first unused */
345 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
346} xfs_log_busy_chunk_t;
347
348#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
349#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
350
351#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
352#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
353#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
354#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
355#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
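
/*
 * Example (not part of the original source): the XFS_LIC and XFS_LBC
 * macro families above manage a fixed-size slot array with a free-slot
 * bitmask (bit set = slot free).  A self-contained sketch of claiming
 * the first free slot and releasing it again; the demo_* names are
 * hypothetical.
 */
#include <stdio.h>

#define DEMO_NUM_SLOTS	15
#define DEMO_FREEMASK	((1 << DEMO_NUM_SLOTS) - 1)

static int demo_claim_slot(unsigned short *mask)
{
	int slot;

	for (slot = 0; slot < DEMO_NUM_SLOTS; slot++) {
		if (*mask & (1 << slot)) {		/* ISFREE */
			*mask &= ~(1 << slot);		/* CLAIM */
			return slot;
		}
	}
	return -1;					/* no vacancy */
}

static void demo_release_slot(unsigned short *mask, int slot)
{
	*mask |= 1 << slot;				/* RELSE */
}

int main(void)
{
	unsigned short mask = DEMO_FREEMASK;		/* INIT: all slots free */
	int slot = demo_claim_slot(&mask);

	printf("claimed %d, vacancy 0x%x\n", slot, mask & DEMO_FREEMASK);
	demo_release_slot(&mask, slot);
	printf("vacancy 0x%x\n", mask & DEMO_FREEMASK);
	return 0;
}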
356
357/*
358 * This is the type of function which can be given to xfs_trans_callback()
359 * to be called upon the transaction's commit to disk.
360 */
361typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
362
363/*
364 * This is the structure maintained for every active transaction.
365 */
366typedef struct xfs_trans {
367 unsigned int t_magic; /* magic number */
368 xfs_log_callback_t t_logcb; /* log callback struct */
369 struct xfs_trans *t_forw; /* async list pointers */
370 struct xfs_trans *t_back; /* async list pointers */
371 unsigned int t_type; /* transaction type */
372 unsigned int t_log_res; /* amt of log space resvd */
373 unsigned int t_log_count; /* count for perm log res */
374 unsigned int t_blk_res; /* # of blocks resvd */
375 unsigned int t_blk_res_used; /* # of resvd blocks used */
376 unsigned int t_rtx_res; /* # of rt extents resvd */
377 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
378 xfs_log_ticket_t t_ticket; /* log mgr ticket */
379 sema_t t_sema; /* sema for commit completion */
380 xfs_lsn_t t_lsn; /* log seq num of start of
381 * transaction. */
382 xfs_lsn_t t_commit_lsn; /* log seq num of end of
383 * transaction. */
384 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
385 struct xfs_dquot_acct *t_dqinfo; /* accting info for dquots */
386 xfs_trans_callback_t t_callback; /* transaction callback */
387 void *t_callarg; /* callback arg */
388 unsigned int t_flags; /* misc flags */
389 long t_icount_delta; /* superblock icount change */
390 long t_ifree_delta; /* superblock ifree change */
391 long t_fdblocks_delta; /* superblock fdblocks chg */
392 long t_res_fdblocks_delta; /* on-disk only chg */
393 long t_frextents_delta;/* superblock freextents chg*/
394 long t_res_frextents_delta; /* on-disk only chg */
395 long t_ag_freeblks_delta; /* debugging counter */
396 long t_ag_flist_delta; /* debugging counter */
397 long t_ag_btree_delta; /* debugging counter */
398 long t_dblocks_delta;/* superblock dblocks change */
399 long t_agcount_delta;/* superblock agcount change */
400 long t_imaxpct_delta;/* superblock imaxpct change */
401 long t_rextsize_delta;/* superblock rextsize chg */
402 long t_rbmblocks_delta;/* superblock rbmblocks chg */
403 long t_rblocks_delta;/* superblock rblocks change */
404 long t_rextents_delta;/* superblocks rextents chg */
405 long t_rextslog_delta;/* superblocks rextslog chg */
406 unsigned int t_items_free; /* log item descs free */
407 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
408 xfs_trans_header_t t_header; /* header for in-log trans */
409 unsigned int t_busy_free; /* busy descs free */
410 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
411 xfs_pflags_t t_pflags; /* saved pflags state */
412} xfs_trans_t;
413
414#endif /* __KERNEL__ */
415
416
417#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
418/*
419 * Values for t_flags.
420 */
421#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
422#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
423#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
424#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
425#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
426#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
427
428/*
429 * Values for call flags parameter.
430 */
431#define XFS_TRANS_NOSLEEP 0x1
432#define XFS_TRANS_WAIT 0x2
433#define XFS_TRANS_RELEASE_LOG_RES 0x4
434#define XFS_TRANS_ABORT 0x8
435
436/*
437 * Field values for xfs_trans_mod_sb.
438 */
439#define XFS_TRANS_SB_ICOUNT 0x00000001
440#define XFS_TRANS_SB_IFREE 0x00000002
441#define XFS_TRANS_SB_FDBLOCKS 0x00000004
442#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
443#define XFS_TRANS_SB_FREXTENTS 0x00000010
444#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
445#define XFS_TRANS_SB_DBLOCKS 0x00000040
446#define XFS_TRANS_SB_AGCOUNT 0x00000080
447#define XFS_TRANS_SB_IMAXPCT 0x00000100
448#define XFS_TRANS_SB_REXTSIZE 0x00000200
449#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
450#define XFS_TRANS_SB_RBLOCKS 0x00000800
451#define XFS_TRANS_SB_REXTENTS 0x00001000
452#define XFS_TRANS_SB_REXTSLOG 0x00002000
453
454
455/*
456 * Various log reservation values.
457 * These are based on the size of the file system block
458 * because that is what most transactions manipulate.
 459 * Each adds an additional 128 bytes per item logged to
460 * try to account for the overhead of the transaction mechanism.
461 *
462 * Note:
463 * Most of the reservations underestimate the number of allocation
464 * groups into which they could free extents in the xfs_bmap_finish()
465 * call. This is because the number in the worst case is quite high
466 * and quite unusual. In order to fix this we need to change
467 * xfs_bmap_finish() to free extents in only a single AG at a time.
468 * This will require changes to the EFI code as well, however, so that
469 * the EFI for the extents not freed is logged again in each transaction.
470 * See bug 261917.
471 */
472
473/*
474 * Per-extent log reservation for the allocation btree changes
475 * involved in freeing or allocating an extent.
476 * 2 trees * (2 blocks/level * max depth - 1) * block size
477 */
478#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
479 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
480#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
481 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
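
/*
 * Example (not part of the original source): plugging assumed numbers
 * into the per-extent reservation above.  With a hypothetical 4096-byte
 * filesystem block and XFS_AG_MAXLEVELS of 5, one extent (nx == 1)
 * reserves 2 trees * (2 * 5 - 1) = 18 blocks, i.e. 73728 bytes, and
 * XFS_ALLOCFREE_LOG_COUNT comes to 18 logged regions.
 */
#include <stdio.h>

int main(void)
{
	long blocksize = 4096;	/* assumed fs block size */
	long maxlevels = 5;	/* assumed XFS_AG_MAXLEVELS */
	long nx = 1;		/* one extent */

	long res   = nx * (2 * ((2 * maxlevels - 1) * blocksize));
	long count = nx * (2 * (2 * maxlevels - 1));

	printf("res=%ld bytes, count=%ld items\n", res, count);	/* 73728, 18 */
	return 0;
}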
482
483/*
484 * Per-directory log reservation for any directory change.
485 * dir blocks: (1 btree block per level + data block + free block) * dblock size
486 * bmap btree: (levels + 2) * max depth * block size
487 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
488 * size, so account for that in the DAENTER macros.
489 */
490#define XFS_DIROP_LOG_RES(mp) \
491 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
492 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
493#define XFS_DIROP_LOG_COUNT(mp) \
494 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
495 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
496
497/*
498 * In a write transaction we can allocate a maximum of 2
499 * extents. This gives:
500 * the inode getting the new extents: inode size
 501 * the inode's bmap btree: max depth * block size
502 * the agfs of the ags from which the extents are allocated: 2 * sector
503 * the superblock free block counter: sector size
504 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
505 * And the bmap_finish transaction can free bmap blocks in a join:
506 * the agfs of the ags containing the blocks: 2 * sector size
507 * the agfls of the ags containing the blocks: 2 * sector size
508 * the super block free block counter: sector size
509 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
510 */
511#define XFS_CALC_WRITE_LOG_RES(mp) \
512 (MAX( \
513 ((mp)->m_sb.sb_inodesize + \
514 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
515 (2 * (mp)->m_sb.sb_sectsize) + \
516 (mp)->m_sb.sb_sectsize + \
517 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
518 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
519 ((2 * (mp)->m_sb.sb_sectsize) + \
520 (2 * (mp)->m_sb.sb_sectsize) + \
521 (mp)->m_sb.sb_sectsize + \
522 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
523 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
524
525#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
526
527/*
528 * In truncating a file we free up to two extents at once. We can modify:
529 * the inode being truncated: inode size
 530 * the inode's bmap btree: (max depth + 1) * block size
531 * And the bmap_finish transaction can free the blocks and bmap blocks:
532 * the agf for each of the ags: 4 * sector size
533 * the agfl for each of the ags: 4 * sector size
534 * the super block to reflect the freed blocks: sector size
535 * worst case split in allocation btrees per extent assuming 4 extents:
536 * 4 exts * 2 trees * (2 * max depth - 1) * block size
537 * the inode btree: max depth * blocksize
538 * the allocation btrees: 2 trees * (max depth - 1) * block size
539 */
540#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
541 (MAX( \
542 ((mp)->m_sb.sb_inodesize + \
543 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
544 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
545 ((4 * (mp)->m_sb.sb_sectsize) + \
546 (4 * (mp)->m_sb.sb_sectsize) + \
547 (mp)->m_sb.sb_sectsize + \
548 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
549 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
550 (128 * 5) + \
551 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
552 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
553 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
554
555#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
556
557/*
 558 * In renaming a file we can modify:
559 * the four inodes involved: 4 * inode size
560 * the two directory btrees: 2 * (max depth + v2) * dir block size
561 * the two directory bmap btrees: 2 * max depth * block size
562 * And the bmap_finish transaction can free dir and bmap blocks (two sets
563 * of bmap blocks) giving:
564 * the agf for the ags in which the blocks live: 3 * sector size
565 * the agfl for the ags in which the blocks live: 3 * sector size
566 * the superblock for the free block count: sector size
567 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
568 */
569#define XFS_CALC_RENAME_LOG_RES(mp) \
570 (MAX( \
571 ((4 * (mp)->m_sb.sb_inodesize) + \
572 (2 * XFS_DIROP_LOG_RES(mp)) + \
573 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
574 ((3 * (mp)->m_sb.sb_sectsize) + \
575 (3 * (mp)->m_sb.sb_sectsize) + \
576 (mp)->m_sb.sb_sectsize + \
577 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
578 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
579
580#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
581
582/*
583 * For creating a link to an inode:
584 * the parent directory inode: inode size
585 * the linked inode: inode size
586 * the directory btree could split: (max depth + v2) * dir block size
587 * the directory bmap btree could join or split: (max depth + v2) * blocksize
588 * And the bmap_finish transaction can free some bmap blocks giving:
589 * the agf for the ag in which the blocks live: sector size
590 * the agfl for the ag in which the blocks live: sector size
591 * the superblock for the free block count: sector size
592 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
593 */
594#define XFS_CALC_LINK_LOG_RES(mp) \
595 (MAX( \
596 ((mp)->m_sb.sb_inodesize + \
597 (mp)->m_sb.sb_inodesize + \
598 XFS_DIROP_LOG_RES(mp) + \
599 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
600 ((mp)->m_sb.sb_sectsize + \
601 (mp)->m_sb.sb_sectsize + \
602 (mp)->m_sb.sb_sectsize + \
603 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
604 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
605
606#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
607
608/*
609 * For removing a directory entry we can modify:
610 * the parent directory inode: inode size
611 * the removed inode: inode size
612 * the directory btree could join: (max depth + v2) * dir block size
613 * the directory bmap btree could join or split: (max depth + v2) * blocksize
614 * And the bmap_finish transaction can free the dir and bmap blocks giving:
615 * the agf for the ag in which the blocks live: 2 * sector size
616 * the agfl for the ag in which the blocks live: 2 * sector size
617 * the superblock for the free block count: sector size
618 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
619 */
620#define XFS_CALC_REMOVE_LOG_RES(mp) \
621 (MAX( \
622 ((mp)->m_sb.sb_inodesize + \
623 (mp)->m_sb.sb_inodesize + \
624 XFS_DIROP_LOG_RES(mp) + \
625 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
626 ((2 * (mp)->m_sb.sb_sectsize) + \
627 (2 * (mp)->m_sb.sb_sectsize) + \
628 (mp)->m_sb.sb_sectsize + \
629 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
630 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
631
632#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
633
634/*
635 * For symlink we can modify:
636 * the parent directory inode: inode size
637 * the new inode: inode size
638 * the inode btree entry: 1 block
639 * the directory btree: (max depth + v2) * dir block size
 640 * the directory inode's bmap btree: (max depth + v2) * block size
641 * the blocks for the symlink: 1 KB
642 * Or in the first xact we allocate some inodes giving:
643 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
644 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
645 * the inode btree: max depth * blocksize
646 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
647 */
648#define XFS_CALC_SYMLINK_LOG_RES(mp) \
649 (MAX( \
650 ((mp)->m_sb.sb_inodesize + \
651 (mp)->m_sb.sb_inodesize + \
652 XFS_FSB_TO_B(mp, 1) + \
653 XFS_DIROP_LOG_RES(mp) + \
654 1024 + \
655 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
656 (2 * (mp)->m_sb.sb_sectsize + \
657 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
658 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
659 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
660 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
661 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
662
663#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
664
665/*
666 * For create we can modify:
667 * the parent directory inode: inode size
668 * the new inode: inode size
669 * the inode btree entry: block size
670 * the superblock for the nlink flag: sector size
671 * the directory btree: (max depth + v2) * dir block size
 672 * the directory inode's bmap btree: (max depth + v2) * block size
673 * Or in the first xact we allocate some inodes giving:
674 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
675 * the superblock for the nlink flag: sector size
676 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
677 * the inode btree: max depth * blocksize
678 * the allocation btrees: 2 trees * (max depth - 1) * block size
679 */
680#define XFS_CALC_CREATE_LOG_RES(mp) \
681 (MAX( \
682 ((mp)->m_sb.sb_inodesize + \
683 (mp)->m_sb.sb_inodesize + \
684 (mp)->m_sb.sb_sectsize + \
685 XFS_FSB_TO_B(mp, 1) + \
686 XFS_DIROP_LOG_RES(mp) + \
687 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
688 (3 * (mp)->m_sb.sb_sectsize + \
689 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
690 XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
691 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
692 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
693 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
694
695#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
696
697/*
698 * Making a new directory is the same as creating a new file.
699 */
700#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
701
702#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
703
704/*
705 * In freeing an inode we can modify:
706 * the inode being freed: inode size
707 * the super block free inode counter: sector size
708 * the agi hash list and counters: sector size
709 * the inode btree entry: block size
710 * the on disk inode before ours in the agi hash list: inode cluster size
711 * the inode btree: max depth * blocksize
712 * the allocation btrees: 2 trees * (max depth - 1) * block size
713 */
714#define XFS_CALC_IFREE_LOG_RES(mp) \
715 ((mp)->m_sb.sb_inodesize + \
716 (mp)->m_sb.sb_sectsize + \
717 (mp)->m_sb.sb_sectsize + \
718 XFS_FSB_TO_B((mp), 1) + \
719 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
720 (128 * 5) + \
721 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
722 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
723 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
724
725
726#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
727
728/*
 729 * When only changing the inode we log the inode and possibly the superblock.
730 * We also add a bit of slop for the transaction stuff.
731 */
732#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
733 (mp)->m_sb.sb_sectsize + 512)
734
735#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
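
/*
 * Example (not part of the original source): the ichange reservation
 * above with assumed geometry - a hypothetical 256-byte inode and a
 * 512-byte sector give 256 + 512 + 512 = 1280 bytes of log space.
 */
#include <stdio.h>

int main(void)
{
	long inodesize = 256;	/* assumed sb_inodesize */
	long sectsize  = 512;	/* assumed sb_sectsize */

	printf("%ld\n", inodesize + sectsize + 512);	/* prints 1280 */
	return 0;
}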
736
737/*
738 * Growing the data section of the filesystem.
739 * superblock
740 * agi and agf
741 * allocation btrees
742 */
743#define XFS_CALC_GROWDATA_LOG_RES(mp) \
744 ((mp)->m_sb.sb_sectsize * 3 + \
745 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
746 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
747
748#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
749
750/*
751 * Growing the rt section of the filesystem.
752 * In the first set of transactions (ALLOC) we allocate space to the
753 * bitmap or summary files.
754 * superblock: sector size
755 * agf of the ag from which the extent is allocated: sector size
756 * bmap btree for bitmap/summary inode: max depth * blocksize
757 * bitmap/summary inode: inode size
758 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
759 */
760#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
761 (2 * (mp)->m_sb.sb_sectsize + \
762 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
763 (mp)->m_sb.sb_inodesize + \
764 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
765 (128 * \
766 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
767 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
768
769#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
770
771/*
772 * Growing the rt section of the filesystem.
773 * In the second set of transactions (ZERO) we zero the new metadata blocks.
774 * one bitmap/summary block: blocksize
775 */
776#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
777 ((mp)->m_sb.sb_blocksize + 128)
778
779#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
780
781/*
782 * Growing the rt section of the filesystem.
783 * In the third set of transactions (FREE) we update metadata without
784 * allocating any new blocks.
785 * superblock: sector size
786 * bitmap inode: inode size
787 * summary inode: inode size
788 * one bitmap block: blocksize
789 * summary blocks: new summary size
790 */
791#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
792 ((mp)->m_sb.sb_sectsize + \
793 2 * (mp)->m_sb.sb_inodesize + \
794 (mp)->m_sb.sb_blocksize + \
795 (mp)->m_rsumsize + \
796 (128 * 5))
797
798#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
799
800/*
801 * Logging the inode modification timestamp on a synchronous write.
802 * inode
803 */
804#define XFS_CALC_SWRITE_LOG_RES(mp) \
805 ((mp)->m_sb.sb_inodesize + 128)
806
807#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
808
809/*
810 * Logging the inode timestamps on an fsync -- same as SWRITE
811 * as long as SWRITE logs the entire inode core
812 */
813#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
814
815/*
816 * Logging the inode mode bits when writing a setuid/setgid file
817 * inode
818 */
819#define XFS_CALC_WRITEID_LOG_RES(mp) \
820 ((mp)->m_sb.sb_inodesize + 128)
821
822#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
823
824/*
825 * Converting the inode from non-attributed to attributed.
826 * the inode being converted: inode size
827 * agf block and superblock (for block allocation)
828 * the new block (directory sized)
829 * bmap blocks for the new directory block
830 * allocation btrees
831 */
832#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
833 ((mp)->m_sb.sb_inodesize + \
834 (mp)->m_sb.sb_sectsize * 2 + \
835 (mp)->m_dirblksize + \
836 (XFS_DIR_IS_V1(mp) ? 0 : \
837 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
838 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
839 (128 * (4 + \
840 (XFS_DIR_IS_V1(mp) ? 0 : \
841 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
842 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
843
844#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
845
846/*
847 * Removing the attribute fork of a file
848 * the inode being truncated: inode size
 849 * the inode's bmap btree: max depth * block size
850 * And the bmap_finish transaction can free the blocks and bmap blocks:
851 * the agf for each of the ags: 4 * sector size
852 * the agfl for each of the ags: 4 * sector size
853 * the super block to reflect the freed blocks: sector size
854 * worst case split in allocation btrees per extent assuming 4 extents:
855 * 4 exts * 2 trees * (2 * max depth - 1) * block size
856 */
857#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
858 (MAX( \
859 ((mp)->m_sb.sb_inodesize + \
860 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
861 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
862 ((4 * (mp)->m_sb.sb_sectsize) + \
863 (4 * (mp)->m_sb.sb_sectsize) + \
864 (mp)->m_sb.sb_sectsize + \
865 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
866 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
867
868#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
869
870/*
871 * Setting an attribute.
872 * the inode getting the attribute
873 * the superblock for allocations
874 * the agfs extents are allocated from
875 * the attribute btree * max depth
876 * the inode allocation btree
877 * Since attribute transaction space is dependent on the size of the attribute,
878 * the calculation is done partially at mount time and partially at runtime.
879 */
880#define XFS_CALC_ATTRSET_LOG_RES(mp) \
881 ((mp)->m_sb.sb_inodesize + \
882 (mp)->m_sb.sb_sectsize + \
883 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
884 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
885
886#define XFS_ATTRSET_LOG_RES(mp, ext) \
887 ((mp)->m_reservations.tr_attrset + \
888 (ext * (mp)->m_sb.sb_sectsize) + \
889 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
890 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
891
892/*
893 * Removing an attribute.
894 * the inode: inode size
895 * the attribute btree could join: max depth * block size
896 * the inode bmap btree could join or split: max depth * block size
897 * And the bmap_finish transaction can free the attr blocks freed giving:
898 * the agf for the ag in which the blocks live: 2 * sector size
899 * the agfl for the ag in which the blocks live: 2 * sector size
900 * the superblock for the free block count: sector size
901 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
902 */
903#define XFS_CALC_ATTRRM_LOG_RES(mp) \
904 (MAX( \
905 ((mp)->m_sb.sb_inodesize + \
906 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
907 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
908 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
909 ((2 * (mp)->m_sb.sb_sectsize) + \
910 (2 * (mp)->m_sb.sb_sectsize) + \
911 (mp)->m_sb.sb_sectsize + \
912 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
913 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
914
915#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
916
917/*
918 * Clearing a bad agino number in an agi hash bucket.
919 */
920#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
921 ((mp)->m_sb.sb_sectsize + 128)
922
923#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
924
925
926/*
927 * Various log count values.
928 */
929#define XFS_DEFAULT_LOG_COUNT 1
930#define XFS_DEFAULT_PERM_LOG_COUNT 2
931#define XFS_ITRUNCATE_LOG_COUNT 2
932#define XFS_INACTIVE_LOG_COUNT 2
933#define XFS_CREATE_LOG_COUNT 2
934#define XFS_MKDIR_LOG_COUNT 3
935#define XFS_SYMLINK_LOG_COUNT 3
936#define XFS_REMOVE_LOG_COUNT 2
937#define XFS_LINK_LOG_COUNT 2
938#define XFS_RENAME_LOG_COUNT 2
939#define XFS_WRITE_LOG_COUNT 2
940#define XFS_ADDAFORK_LOG_COUNT 2
941#define XFS_ATTRINVAL_LOG_COUNT 1
942#define XFS_ATTRSET_LOG_COUNT 3
943#define XFS_ATTRRM_LOG_COUNT 3
944
945/*
946 * Here we centralize the specification of XFS meta-data buffer
 947 * reference count values. This determines how hard the buffer
948 * cache tries to hold onto the buffer.
949 */
950#define XFS_AGF_REF 4
951#define XFS_AGI_REF 4
952#define XFS_AGFL_REF 3
953#define XFS_INO_BTREE_REF 3
954#define XFS_ALLOC_BTREE_REF 2
955#define XFS_BMAP_BTREE_REF 2
956#define XFS_DIR_BTREE_REF 2
957#define XFS_ATTR_BTREE_REF 1
958#define XFS_INO_REF 1
959#define XFS_DQUOT_REF 1
960
961#ifdef __KERNEL__
962/*
963 * XFS transaction mechanism exported interfaces that are
964 * actually macros.
965 */
966#define xfs_trans_get_log_res(tp) ((tp)->t_log_res)
967#define xfs_trans_get_log_count(tp) ((tp)->t_log_count)
968#define xfs_trans_get_block_res(tp) ((tp)->t_blk_res)
969#define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC)
970
971#ifdef DEBUG
972#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d)
973#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d)
974#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d)
975#else
976#define xfs_trans_agblocks_delta(tp, d)
977#define xfs_trans_agflist_delta(tp, d)
978#define xfs_trans_agbtree_delta(tp, d)
979#endif
980
981/*
982 * XFS transaction mechanism exported interfaces.
983 */
984void xfs_trans_init(struct xfs_mount *);
985xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
986xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
987xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
988int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
989 uint, uint);
990void xfs_trans_callback(xfs_trans_t *,
991 void (*)(xfs_trans_t *, void *), void *);
992void xfs_trans_mod_sb(xfs_trans_t *, uint, long);
993struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t,
994 int, uint);
995int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *,
996 struct xfs_buftarg *, xfs_daddr_t, int, uint,
997 struct xfs_buf **);
998struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
999
1000void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
1001void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
1002void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
1003void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
1004void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
1006void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
1007void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
1008void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
1009int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
1010 xfs_ino_t , uint, uint, struct xfs_inode **);
1011void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
1012void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
1013void xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *);
1014void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
1015void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
1016struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
1017void xfs_efi_release(struct xfs_efi_log_item *, uint);
1018void xfs_trans_log_efi_extent(xfs_trans_t *,
1019 struct xfs_efi_log_item *,
1020 xfs_fsblock_t,
1021 xfs_extlen_t);
1022struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
1023 struct xfs_efi_log_item *,
1024 uint);
1025void xfs_trans_log_efd_extent(xfs_trans_t *,
1026 struct xfs_efd_log_item *,
1027 xfs_fsblock_t,
1028 xfs_extlen_t);
1029int xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
1030void xfs_trans_cancel(xfs_trans_t *, int);
1031void xfs_trans_ail_init(struct xfs_mount *);
1032xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
1033xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
1034void xfs_trans_unlocked_item(struct xfs_mount *,
1035 xfs_log_item_t *);
1036xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1037 xfs_agnumber_t ag,
1038 xfs_extlen_t idx);
1039
1040#endif /* __KERNEL__ */
1041
1042#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 000000000000..7bc5eab4c2c1
--- /dev/null
+++ b/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,596 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_trans_priv.h"
44#include "xfs_error.h"
45
46STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
47STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
48STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
49STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
50
51#ifdef DEBUG
52STATIC void xfs_ail_check(xfs_ail_entry_t *);
53#else
54#define xfs_ail_check(a)
55#endif /* DEBUG */
56
57
58/*
59 * This is called by the log manager code to determine the LSN
60 * of the tail of the log. This is exactly the LSN of the first
61 * item in the AIL. If the AIL is empty, then this function
62 * returns 0.
63 *
64 * We need the AIL lock in order to get a coherent read of the
65 * lsn of the last item in the AIL.
66 */
67xfs_lsn_t
68xfs_trans_tail_ail(
69 xfs_mount_t *mp)
70{
71 xfs_lsn_t lsn;
72 xfs_log_item_t *lip;
73 SPLDECL(s);
74
75 AIL_LOCK(mp,s);
76 lip = xfs_ail_min(&(mp->m_ail));
77 if (lip == NULL) {
78 lsn = (xfs_lsn_t)0;
79 } else {
80 lsn = lip->li_lsn;
81 }
82 AIL_UNLOCK(mp, s);
83
84 return lsn;
85}
86
87/*
88 * xfs_trans_push_ail
89 *
90 * This routine is called to move the tail of the AIL
91 * forward. It does this by trying to flush items in the AIL
92 * whose lsns are below the given threshold_lsn.
93 *
94 * The routine returns the lsn of the tail of the log.
95 */
96xfs_lsn_t
97xfs_trans_push_ail(
98 xfs_mount_t *mp,
99 xfs_lsn_t threshold_lsn)
100{
101 xfs_lsn_t lsn;
102 xfs_log_item_t *lip;
103 int gen;
104 int restarts;
105 int lock_result;
106 int flush_log;
107 SPLDECL(s);
108
109#define XFS_TRANS_PUSH_AIL_RESTARTS 10
110
111 AIL_LOCK(mp,s);
112 lip = xfs_trans_first_ail(mp, &gen);
113 if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) {
114 /*
115 * Just return if the AIL is empty.
116 */
117 AIL_UNLOCK(mp, s);
118 return (xfs_lsn_t)0;
119 }
120
121 XFS_STATS_INC(xs_push_ail);
122
123 /*
124 * While the item we are looking at is below the given threshold
125 * try to flush it out. Make sure to limit the number of times
126 * we allow xfs_trans_next_ail() to restart scanning from the
127 * beginning of the list. We'd like not to stop until we've at least
128 * tried to push on everything in the AIL with an LSN less than
129 * the given threshold. However, we may give up before that if
130 * we realize that we've been holding the AIL_LOCK for 'too long',
 131 * blocking interrupts. Currently, 'too long' is roughly 500us.
132 */
133 flush_log = 0;
134 restarts = 0;
135 while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
136 (XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
137 /*
138 * If we can lock the item without sleeping, unlock
139 * the AIL lock and flush the item. Then re-grab the
140 * AIL lock so we can look for the next item on the
141 * AIL. Since we unlock the AIL while we flush the
142 * item, the next routine may start over again at the
 143 * beginning of the list if anything has changed.
144 * That is what the generation count is for.
145 *
146 * If we can't lock the item, either its holder will flush
147 * it or it is already being flushed or it is being relogged.
 148 * In any of these cases it is being taken care of and we
149 * can just skip to the next item in the list.
150 */
151 lock_result = IOP_TRYLOCK(lip);
152 switch (lock_result) {
153 case XFS_ITEM_SUCCESS:
154 AIL_UNLOCK(mp, s);
155 XFS_STATS_INC(xs_push_ail_success);
156 IOP_PUSH(lip);
157 AIL_LOCK(mp,s);
158 break;
159
160 case XFS_ITEM_PUSHBUF:
161 AIL_UNLOCK(mp, s);
162 XFS_STATS_INC(xs_push_ail_pushbuf);
163#ifdef XFSRACEDEBUG
164 delay_for_intr();
165 delay(300);
166#endif
167 ASSERT(lip->li_ops->iop_pushbuf);
168 ASSERT(lip);
169 IOP_PUSHBUF(lip);
170 AIL_LOCK(mp,s);
171 break;
172
173 case XFS_ITEM_PINNED:
174 XFS_STATS_INC(xs_push_ail_pinned);
175 flush_log = 1;
176 break;
177
178 case XFS_ITEM_LOCKED:
179 XFS_STATS_INC(xs_push_ail_locked);
180 break;
181
182 case XFS_ITEM_FLUSHING:
183 XFS_STATS_INC(xs_push_ail_flushing);
184 break;
185
186 default:
187 ASSERT(0);
188 break;
189 }
190
191 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
192 if (lip == NULL) {
193 break;
194 }
195 if (XFS_FORCED_SHUTDOWN(mp)) {
196 /*
197 * Just return if we shut down during the last try.
198 */
199 AIL_UNLOCK(mp, s);
200 return (xfs_lsn_t)0;
201 }
202
203 }
204
205 if (flush_log) {
206 /*
207 * If something we need to push out was pinned, then
208 * push out the log so it will become unpinned and
209 * move forward in the AIL.
210 */
211 AIL_UNLOCK(mp, s);
212 XFS_STATS_INC(xs_push_ail_flush);
213 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
214 AIL_LOCK(mp, s);
215 }
216
217 lip = xfs_ail_min(&(mp->m_ail));
218 if (lip == NULL) {
219 lsn = (xfs_lsn_t)0;
220 } else {
221 lsn = lip->li_lsn;
222 }
223
224 AIL_UNLOCK(mp, s);
225 return lsn;
226} /* xfs_trans_push_ail */
227
228
229/*
230 * This is to be called when an item is unlocked that may have
231 * been in the AIL. It will wake up the first member of the AIL
232 * wait list if this item's unlocking might allow it to progress.
233 * If the item is in the AIL, then we need to get the AIL lock
234 * while doing our checking so we don't race with someone going
235 * to sleep waiting for this event in xfs_trans_push_ail().
236 */
237void
238xfs_trans_unlocked_item(
239 xfs_mount_t *mp,
240 xfs_log_item_t *lip)
241{
242 xfs_log_item_t *min_lip;
243
244 /*
245 * If we're forcibly shutting down, we may have
246 * unlocked log items arbitrarily. The last thing
247 * we want to do is to move the tail of the log
248 * over some potentially valid data.
249 */
250 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
251 XFS_FORCED_SHUTDOWN(mp)) {
252 return;
253 }
254
255 /*
256 * This is the one case where we can call into xfs_ail_min()
257 * without holding the AIL lock because we only care about the
258 * case where we are at the tail of the AIL. If the object isn't
259 * at the tail, it doesn't matter what result we get back. This
 260 * is slightly racy: since we were just unlocked, we could
261 * go to sleep between the call to xfs_ail_min and the call to
 262 * xfs_log_move_tail, have someone else lock us, commit us to disk,
263 * move us out of the tail of the AIL, and then we wake up. However,
264 * the call to xfs_log_move_tail() doesn't do anything if there's
265 * not enough free space to wake people up so we're safe calling it.
266 */
267 min_lip = xfs_ail_min(&mp->m_ail);
268
269 if (min_lip == lip)
270 xfs_log_move_tail(mp, 1);
271} /* xfs_trans_unlocked_item */
272
273
274/*
275 * Update the position of the item in the AIL with the new
276 * lsn. If it is not yet in the AIL, add it. Otherwise, move
277 * it to its new position by removing it and re-adding it.
278 *
 279 * Wake up anyone with an lsn less than the item's lsn. If the item
280 * we move in the AIL is the minimum one, update the tail lsn in the
281 * log manager.
282 *
283 * Increment the AIL's generation count to indicate that the tree
284 * has changed.
285 *
286 * This function must be called with the AIL lock held. The lock
287 * is dropped before returning, so the caller must pass in the
288 * cookie returned by AIL_LOCK.
289 */
290void
291xfs_trans_update_ail(
292 xfs_mount_t *mp,
293 xfs_log_item_t *lip,
294 xfs_lsn_t lsn,
295 unsigned long s)
296{
297 xfs_ail_entry_t *ailp;
298 xfs_log_item_t *dlip=NULL;
299 xfs_log_item_t *mlip; /* ptr to minimum lip */
300
301 ailp = &(mp->m_ail);
302 mlip = xfs_ail_min(ailp);
303
304 if (lip->li_flags & XFS_LI_IN_AIL) {
305 dlip = xfs_ail_delete(ailp, lip);
306 ASSERT(dlip == lip);
307 } else {
308 lip->li_flags |= XFS_LI_IN_AIL;
309 }
310
311 lip->li_lsn = lsn;
312
313 xfs_ail_insert(ailp, lip);
314 mp->m_ail_gen++;
315
316 if (mlip == dlip) {
317 mlip = xfs_ail_min(&(mp->m_ail));
318 AIL_UNLOCK(mp, s);
319 xfs_log_move_tail(mp, mlip->li_lsn);
320 } else {
321 AIL_UNLOCK(mp, s);
322 }
323
324
325} /* xfs_trans_update_ail */
326
327/*
328 * Delete the given item from the AIL. It must already be in
329 * the AIL.
330 *
 331 * Wake up anyone with an lsn less than the item's lsn. If the item
332 * we delete in the AIL is the minimum one, update the tail lsn in the
333 * log manager.
334 *
335 * Clear the IN_AIL flag from the item, reset its lsn to 0, and
336 * bump the AIL's generation count to indicate that the tree
337 * has changed.
338 *
339 * This function must be called with the AIL lock held. The lock
340 * is dropped before returning, so the caller must pass in the
341 * cookie returned by AIL_LOCK.
342 */
343void
344xfs_trans_delete_ail(
345 xfs_mount_t *mp,
346 xfs_log_item_t *lip,
347 unsigned long s)
348{
349 xfs_ail_entry_t *ailp;
350 xfs_log_item_t *dlip;
351 xfs_log_item_t *mlip;
352
353 if (lip->li_flags & XFS_LI_IN_AIL) {
354 ailp = &(mp->m_ail);
355 mlip = xfs_ail_min(ailp);
356 dlip = xfs_ail_delete(ailp, lip);
357 ASSERT(dlip == lip);
358
359
360 lip->li_flags &= ~XFS_LI_IN_AIL;
361 lip->li_lsn = 0;
362 mp->m_ail_gen++;
363
364 if (mlip == dlip) {
365 mlip = xfs_ail_min(&(mp->m_ail));
366 AIL_UNLOCK(mp, s);
367 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
368 } else {
369 AIL_UNLOCK(mp, s);
370 }
371 }
372 else {
373 /*
 374 * If the file system is not being shut down, we are in
375 * serious trouble if we get to this stage.
376 */
377 if (XFS_FORCED_SHUTDOWN(mp))
378 AIL_UNLOCK(mp, s);
379 else {
380 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
381 "xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
382 xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
383 AIL_UNLOCK(mp, s);
384 }
385 }
386}
387
388
389
390/*
391 * Return the item in the AIL with the smallest lsn.
392 * Return the current tree generation number for use
393 * in calls to xfs_trans_next_ail().
394 */
395xfs_log_item_t *
396xfs_trans_first_ail(
397 xfs_mount_t *mp,
398 int *gen)
399{
400 xfs_log_item_t *lip;
401
402 lip = xfs_ail_min(&(mp->m_ail));
403 *gen = (int)mp->m_ail_gen;
404
405 return (lip);
406}
407
408/*
409 * If the generation count of the tree has not changed since the
 410 * caller last took something from the AIL, then return the element
411 * in the tree which follows the one given. If the count has changed,
 412 * then return the minimum element of the AIL and bump the restarts counter
413 * if one is given.
414 */
415xfs_log_item_t *
416xfs_trans_next_ail(
417 xfs_mount_t *mp,
418 xfs_log_item_t *lip,
419 int *gen,
420 int *restarts)
421{
422 xfs_log_item_t *nlip;
423
424 ASSERT(mp && lip && gen);
425 if (mp->m_ail_gen == *gen) {
426 nlip = xfs_ail_next(&(mp->m_ail), lip);
427 } else {
428 nlip = xfs_ail_min(&(mp->m_ail));
429 *gen = (int)mp->m_ail_gen;
430 if (restarts != NULL) {
431 XFS_STATS_INC(xs_push_ail_restarts);
432 (*restarts)++;
433 }
434 }
435
436 return (nlip);
437}
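/*
 * A sketch of the restart-safe iteration pattern the two routines
 * above provide (xfs_trans_push_ail() above is the real user; names
 * here are illustrative):
 *
 *	lip = xfs_trans_first_ail(mp, &gen);
 *	while (lip != NULL) {
 *		... work on lip, possibly dropping the AIL lock ...
 *		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
 *	}
 */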
438
439
440/*
441 * The active item list (AIL) is a doubly linked list of log
442 * items sorted by ascending lsn. The base of the list is
443 * a forw/back pointer pair embedded in the xfs mount structure.
444 * The base is initialized with both pointers pointing to the
445 * base. This case always needs to be distinguished, because
446 * the base has no lsn to look at. We almost always insert
447 * at the end of the list, so on inserts we search from the
448 * end of the list to find where the new item belongs.
449 */
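/*
 * For reference, a sketch of the list base (assumed to match the
 * xfs_ail_entry_t definition in xfs_trans.h of this era):
 *
 *	typedef struct xfs_ail_entry {
 *		struct xfs_log_item	*ail_forw;	- first item
 *		struct xfs_log_item	*ail_back;	- last item
 *	} xfs_ail_entry_t;
 *
 * An empty AIL has both pointers aimed at the base itself, which is
 * exactly what xfs_trans_ail_init() below establishes.
 */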
450
451/*
452 * Initialize the doubly linked list to point only to itself.
453 */
454void
455xfs_trans_ail_init(
456 xfs_mount_t *mp)
457{
458 mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail);
459 mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail);
460}
461
462/*
463 * Insert the given log item into the AIL.
464 * We almost always insert at the end of the list, so on inserts
465 * we search from the end of the list to find where the
466 * new item belongs.
467 */
468STATIC void
469xfs_ail_insert(
470 xfs_ail_entry_t *base,
471 xfs_log_item_t *lip)
472/* ARGSUSED */
473{
474 xfs_log_item_t *next_lip;
475
476 /*
477 * If the list is empty, just insert the item.
478 */
479 if (base->ail_back == (xfs_log_item_t*)base) {
480 base->ail_forw = lip;
481 base->ail_back = lip;
482 lip->li_ail.ail_forw = (xfs_log_item_t*)base;
483 lip->li_ail.ail_back = (xfs_log_item_t*)base;
484 return;
485 }
486
487 next_lip = base->ail_back;
488 while ((next_lip != (xfs_log_item_t*)base) &&
489 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
490 next_lip = next_lip->li_ail.ail_back;
491 }
492 ASSERT((next_lip == (xfs_log_item_t*)base) ||
493 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
494 lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
495 lip->li_ail.ail_back = next_lip;
496 next_lip->li_ail.ail_forw = lip;
497 lip->li_ail.ail_forw->li_ail.ail_back = lip;
498
499 xfs_ail_check(base);
500 return;
501}
502
503/*
504 * Delete the given item from the AIL. Return a pointer to the item.
505 */
506/*ARGSUSED*/
507STATIC xfs_log_item_t *
508xfs_ail_delete(
509 xfs_ail_entry_t *base,
510 xfs_log_item_t *lip)
511/* ARGSUSED */
512{
513 lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
514 lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
515 lip->li_ail.ail_forw = NULL;
516 lip->li_ail.ail_back = NULL;
517
518 xfs_ail_check(base);
519 return lip;
520}
521
522/*
523 * Return a pointer to the first item in the AIL.
524 * If the AIL is empty, then return NULL.
525 */
526STATIC xfs_log_item_t *
527xfs_ail_min(
528 xfs_ail_entry_t *base)
529/* ARGSUSED */
530{
531 register xfs_log_item_t *forw = base->ail_forw;
532 if (forw == (xfs_log_item_t*)base) {
533 return NULL;
534 }
535 return forw;
536}
537
538/*
539 * Return a pointer to the item which follows
540 * the given item in the AIL. If the given item
541 * is the last item in the list, then return NULL.
542 */
543STATIC xfs_log_item_t *
544xfs_ail_next(
545 xfs_ail_entry_t *base,
546 xfs_log_item_t *lip)
547/* ARGSUSED */
548{
549 if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
550 return NULL;
551 }
552 return lip->li_ail.ail_forw;
553
554}
555
556#ifdef DEBUG
557/*
558 * Check that the list is sorted as it should be.
559 */
560STATIC void
561xfs_ail_check(
562 xfs_ail_entry_t *base)
563{
564 xfs_log_item_t *lip;
565 xfs_log_item_t *prev_lip;
566
567 lip = base->ail_forw;
568 if (lip == (xfs_log_item_t*)base) {
569 /*
570 * Make sure the pointers are correct when the list
571 * is empty.
572 */
573 ASSERT(base->ail_back == (xfs_log_item_t*)base);
574 return;
575 }
576
577 /*
578 * Walk the list checking forward and backward pointers,
579 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
580 * flag set.
581 */
582 prev_lip = (xfs_log_item_t*)base;
583 while (lip != (xfs_log_item_t*)base) {
584 if (prev_lip != (xfs_log_item_t*)base) {
585 ASSERT(prev_lip->li_ail.ail_forw == lip);
586 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
587 }
588 ASSERT(lip->li_ail.ail_back == prev_lip);
589 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
590 prev_lip = lip;
591 lip = lip->li_ail.ail_forw;
592 }
593 ASSERT(lip == (xfs_log_item_t*)base);
594 ASSERT(base->ail_back == prev_lip);
595}
596#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 000000000000..a9682b9510c1
--- /dev/null
+++ b/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,1093 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_buf_item.h"
40#include "xfs_sb.h"
41#include "xfs_ag.h"
42#include "xfs_dir.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_trans_priv.h"
46#include "xfs_error.h"
47#include "xfs_rw.h"
48
49
50STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
51 xfs_daddr_t, int);
52STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
53 xfs_daddr_t, int);
54
55
56/*
57 * Get and lock the buffer for the caller if it is not already
58 * locked within the given transaction. If it is already locked
59 * within the transaction, just increment its lock recursion count
60 * and return a pointer to it.
61 *
62 * Use the fast path function xfs_trans_buf_item_match() or the buffer
63 * cache routine incore_match() to find the buffer
64 * if it is already owned by this transaction.
65 *
66 * If we don't already own the buffer, use get_buf() to get it.
67 * If it doesn't yet have an associated xfs_buf_log_item structure,
68 * then allocate one and add the item to this transaction.
69 *
70 * If the transaction pointer is NULL, make this just a normal
71 * get_buf() call.
72 */
73xfs_buf_t *
74xfs_trans_get_buf(xfs_trans_t *tp,
75 xfs_buftarg_t *target_dev,
76 xfs_daddr_t blkno,
77 int len,
78 uint flags)
79{
80 xfs_buf_t *bp;
81 xfs_buf_log_item_t *bip;
82
83 if (flags == 0)
84 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
85
86 /*
87 * Default to a normal get_buf() call if the tp is NULL.
88 */
89 if (tp == NULL) {
90 bp = xfs_buf_get_flags(target_dev, blkno, len,
91 flags | BUF_BUSY);
92 return(bp);
93 }
94
95 /*
96 * If we find the buffer in the cache with this transaction
97 * pointer in its b_fsprivate2 field, then we know we already
98 * have it locked. In this case we just increment the lock
99 * recursion count and return the buffer to the caller.
100 */
101 if (tp->t_items.lic_next == NULL) {
102 bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
103 } else {
104 bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
105 }
106 if (bp != NULL) {
107 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
108 if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
109 xfs_buftrace("TRANS GET RECUR SHUT", bp);
110 XFS_BUF_SUPER_STALE(bp);
111 }
112 /*
113 * If the buffer is stale then it was binval'ed
114 * since last read. This doesn't matter since the
115 * caller isn't allowed to use the data anyway.
116 */
117 else if (XFS_BUF_ISSTALE(bp)) {
118 xfs_buftrace("TRANS GET RECUR STALE", bp);
119 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
120 }
121 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
122 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
123 ASSERT(bip != NULL);
124 ASSERT(atomic_read(&bip->bli_refcount) > 0);
125 bip->bli_recur++;
126 xfs_buftrace("TRANS GET RECUR", bp);
127 xfs_buf_item_trace("GET RECUR", bip);
128 return (bp);
129 }
130
131 /*
132 * We always specify the BUF_BUSY flag within a transaction so
133 * that get_buf does not try to push out a delayed write buffer
134 * which might cause another transaction to take place (if the
135 * buffer was delayed alloc). Such recursive transactions can
136 * easily deadlock with our current transaction as well as cause
137 * us to run out of stack space.
138 */
139 bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY);
140 if (bp == NULL) {
141 return NULL;
142 }
143
144 ASSERT(!XFS_BUF_GETERROR(bp));
145
146 /*
147 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
148 * it doesn't have one yet, then allocate one and initialize it.
149 * The checks to see if one is there are in xfs_buf_item_init().
150 */
151 xfs_buf_item_init(bp, tp->t_mountp);
152
153 /*
154 * Set the recursion count for the buffer within this transaction
155 * to 0.
156 */
157 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
158 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
159 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
160 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
161 bip->bli_recur = 0;
162
163 /*
164 * Take a reference for this transaction on the buf item.
165 */
166 atomic_inc(&bip->bli_refcount);
167
168 /*
169 * Get a log_item_desc to point at the new item.
170 */
171 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
172
173 /*
174 * Initialize b_fsprivate2 so we can find it with incore_match()
175 * above.
176 */
177 XFS_BUF_SET_FSPRIVATE2(bp, tp);
178
179 xfs_buftrace("TRANS GET", bp);
180 xfs_buf_item_trace("GET", bip);
181 return (bp);
182}
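/*
 * Illustrative call (a sketch; XFS_FSB_TO_DADDR()/XFS_FSB_TO_BB() are
 * the standard conversion macros, assumed available to callers):
 * getting a buffer for one newly allocated filesystem block with the
 * default lock/map flags:
 *
 *	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
 *			       XFS_FSB_TO_DADDR(mp, fsbno),
 *			       XFS_FSB_TO_BB(mp, 1), 0);
 */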
183
184/*
185 * Get and lock the superblock buffer of this file system for the
186 * given transaction.
187 *
188 * We don't need to use incore_match() here, because the superblock
189 * buffer is a private buffer which we keep a pointer to in the
190 * mount structure.
191 */
192xfs_buf_t *
193xfs_trans_getsb(xfs_trans_t *tp,
194 struct xfs_mount *mp,
195 int flags)
196{
197 xfs_buf_t *bp;
198 xfs_buf_log_item_t *bip;
199
200 /*
201 * Default to just trying to lock the superblock buffer
202 * if tp is NULL.
203 */
204 if (tp == NULL) {
205 return (xfs_getsb(mp, flags));
206 }
207
208 /*
209 * If the superblock buffer already has this transaction
210 * pointer in its b_fsprivate2 field, then we know we already
211 * have it locked. In this case we just increment the lock
212 * recursion count and return the buffer to the caller.
213 */
214 bp = mp->m_sb_bp;
215 if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) {
216 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
217 ASSERT(bip != NULL);
218 ASSERT(atomic_read(&bip->bli_refcount) > 0);
219 bip->bli_recur++;
220 xfs_buf_item_trace("GETSB RECUR", bip);
221 return (bp);
222 }
223
224 bp = xfs_getsb(mp, flags);
225 if (bp == NULL) {
226 return NULL;
227 }
228
229 /*
230 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
231 * it doesn't have one yet, then allocate one and initialize it.
232 * The checks to see if one is there are in xfs_buf_item_init().
233 */
234 xfs_buf_item_init(bp, mp);
235
236 /*
237 * Set the recursion count for the buffer within this transaction
238 * to 0.
239 */
240 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
241 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
242 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
243 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
244 bip->bli_recur = 0;
245
246 /*
247 * Take a reference for this transaction on the buf item.
248 */
249 atomic_inc(&bip->bli_refcount);
250
251 /*
252 * Get a log_item_desc to point at the new item.
253 */
254 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
255
256 /*
257 * Initialize b_fsprivate2 so we can find it with incore_match()
258 * above.
259 */
260 XFS_BUF_SET_FSPRIVATE2(bp, tp);
261
262 xfs_buf_item_trace("GETSB", bip);
263 return (bp);
264}
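/*
 * Illustrative use (a sketch, not code from this file): a transaction
 * applying superblock deltas locks the superblock through this
 * routine and then logs the modified byte range:
 *
 *	bp = xfs_trans_getsb(tp, mp, 0);
 *	... apply the deltas to the on-disk superblock image ...
 *	xfs_trans_log_buf(tp, bp, first_byte, last_byte);
 */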
265
266#ifdef DEBUG
267xfs_buftarg_t *xfs_error_target;
268int xfs_do_error;
269int xfs_req_num;
270int xfs_error_mod = 33;
271#endif
272
273/*
274 * Get and lock the buffer for the caller if it is not already
275 * locked within the given transaction. If it has not yet been
276 * read in, read it from disk. If it is already locked
277 * within the transaction and already read in, just increment its
278 * lock recursion count and return a pointer to it.
279 *
280 * Use the fast path function xfs_trans_buf_item_match() or the buffer
281 * cache routine incore_match() to find the buffer
282 * if it is already owned by this transaction.
283 *
284 * If we don't already own the buffer, use read_buf() to get it.
285 * If it doesn't yet have an associated xfs_buf_log_item structure,
286 * then allocate one and add the item to this transaction.
287 *
288 * If the transaction pointer is NULL, make this just a normal
289 * read_buf() call.
290 */
291int
292xfs_trans_read_buf(
293 xfs_mount_t *mp,
294 xfs_trans_t *tp,
295 xfs_buftarg_t *target,
296 xfs_daddr_t blkno,
297 int len,
298 uint flags,
299 xfs_buf_t **bpp)
300{
301 xfs_buf_t *bp;
302 xfs_buf_log_item_t *bip;
303 int error;
304
305 if (flags == 0)
306 flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
307
308 /*
309 * Default to a normal get_buf() call if the tp is NULL.
310 */
311 if (tp == NULL) {
312 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
313 if (!bp)
314 return XFS_ERROR(ENOMEM);
315
316 if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
317 xfs_ioerror_alert("xfs_trans_read_buf", mp,
318 bp, blkno);
319 error = XFS_BUF_GETERROR(bp);
320 xfs_buf_relse(bp);
321 return error;
322 }
323#ifdef DEBUG
324 if (xfs_do_error && (bp != NULL)) {
325 if (xfs_error_target == target) {
326 if (((xfs_req_num++) % xfs_error_mod) == 0) {
327 xfs_buf_relse(bp);
328 printk("Returning error!\n");
329 return XFS_ERROR(EIO);
330 }
331 }
332 }
333#endif
334 if (XFS_FORCED_SHUTDOWN(mp))
335 goto shutdown_abort;
336 *bpp = bp;
337 return 0;
338 }
339
340 /*
341 * If we find the buffer in the cache with this transaction
342 * pointer in its b_fsprivate2 field, then we know we already
343 * have it locked. If it is already read in we just increment
344 * the lock recursion count and return the buffer to the caller.
345 * If the buffer is not yet read in, then we read it in, increment
346 * the lock recursion count, and return it to the caller.
347 */
348 if (tp->t_items.lic_next == NULL) {
349 bp = xfs_trans_buf_item_match(tp, target, blkno, len);
350 } else {
351 bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
352 }
353 if (bp != NULL) {
354 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
355 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
356 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
357 ASSERT((XFS_BUF_ISERROR(bp)) == 0);
358 if (!(XFS_BUF_ISDONE(bp))) {
359 xfs_buftrace("READ_BUF_INCORE !DONE", bp);
360 ASSERT(!XFS_BUF_ISASYNC(bp));
361 XFS_BUF_READ(bp);
362 xfsbdstrat(tp->t_mountp, bp);
363 xfs_iowait(bp);
364 if (XFS_BUF_GETERROR(bp) != 0) {
365 xfs_ioerror_alert("xfs_trans_read_buf", mp,
366 bp, blkno);
367 error = XFS_BUF_GETERROR(bp);
368 xfs_buf_relse(bp);
369 /*
370 * We can gracefully recover from most
371 * read errors. Ones we can't are those
372 * that happen after the transaction's
373 * already dirty.
374 */
375 if (tp->t_flags & XFS_TRANS_DIRTY)
376 xfs_force_shutdown(tp->t_mountp,
377 XFS_METADATA_IO_ERROR);
378 return error;
379 }
380 }
381 /*
382 * We never locked this buf ourselves, so we shouldn't
383 * brelse it either. Just get out.
384 */
385 if (XFS_FORCED_SHUTDOWN(mp)) {
386 xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp);
387 *bpp = NULL;
388 return XFS_ERROR(EIO);
389 }
390
391
392 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
393 bip->bli_recur++;
394
395 ASSERT(atomic_read(&bip->bli_refcount) > 0);
396 xfs_buf_item_trace("READ RECUR", bip);
397 *bpp = bp;
398 return 0;
399 }
400
401 /*
402 * We always specify the BUF_BUSY flag within a transaction so
403 * that get_buf does not try to push out a delayed write buffer
404 * which might cause another transaction to take place (if the
405 * buffer was delayed alloc). Such recursive transactions can
406 * easily deadlock with our current transaction as well as cause
407 * us to run out of stack space.
408 */
409 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
410 if (bp == NULL) {
411 *bpp = NULL;
412 return 0;
413 }
414 if (XFS_BUF_GETERROR(bp) != 0) {
415 XFS_BUF_SUPER_STALE(bp);
416 xfs_buftrace("READ ERROR", bp);
417 error = XFS_BUF_GETERROR(bp);
418
419 xfs_ioerror_alert("xfs_trans_read_buf", mp,
420 bp, blkno);
421 if (tp->t_flags & XFS_TRANS_DIRTY)
422 xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
423 xfs_buf_relse(bp);
424 return error;
425 }
426#ifdef DEBUG
427 if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
428 if (xfs_error_target == target) {
429 if (((xfs_req_num++) % xfs_error_mod) == 0) {
430 xfs_force_shutdown(tp->t_mountp,
431 XFS_METADATA_IO_ERROR);
432 xfs_buf_relse(bp);
433 printk("Returning error in trans!\n");
434 return XFS_ERROR(EIO);
435 }
436 }
437 }
438#endif
439 if (XFS_FORCED_SHUTDOWN(mp))
440 goto shutdown_abort;
441
442 /*
443 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
444 * it doesn't have one yet, then allocate one and initialize it.
445 * The checks to see if one is there are in xfs_buf_item_init().
446 */
447 xfs_buf_item_init(bp, tp->t_mountp);
448
449 /*
450 * Set the recursion count for the buffer within this transaction
451 * to 0.
452 */
453 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
454 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
455 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
456 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
457 bip->bli_recur = 0;
458
459 /*
460 * Take a reference for this transaction on the buf item.
461 */
462 atomic_inc(&bip->bli_refcount);
463
464 /*
465 * Get a log_item_desc to point at the new item.
466 */
467 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
468
469 /*
470 * Initialize b_fsprivate2 so we can find it with incore_match()
471 * above.
472 */
473 XFS_BUF_SET_FSPRIVATE2(bp, tp);
474
475 xfs_buftrace("TRANS READ", bp);
476 xfs_buf_item_trace("READ", bip);
477 *bpp = bp;
478 return 0;
479
480shutdown_abort:
481 /*
 482 * the theory here is that the buffer is good but we're
483 * bailing out because the filesystem is being forcibly
484 * shut down. So we should leave the b_flags alone since
485 * the buffer's not staled and just get out.
486 */
487#if defined(DEBUG)
488 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
489 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp);
490#endif
491 ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
492 (XFS_B_STALE|XFS_B_DELWRI));
493
494 xfs_buftrace("READ_BUF XFSSHUTDN", bp);
495 xfs_buf_relse(bp);
496 *bpp = NULL;
497 return XFS_ERROR(EIO);
498}
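/*
 * Illustrative call (a sketch; XFS_FSB_TO_DADDR()/XFS_FSB_TO_BB() are
 * the standard conversion macros): reading a single filesystem block
 * within a transaction:
 *
 *	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 *				   XFS_FSB_TO_DADDR(mp, fsbno),
 *				   XFS_FSB_TO_BB(mp, 1), 0, &bp);
 *	if (error)
 *		return error;
 */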
499
500
501/*
502 * Release the buffer bp which was previously acquired with one of the
503 * xfs_trans_... buffer allocation routines if the buffer has not
504 * been modified within this transaction. If the buffer is modified
505 * within this transaction, do decrement the recursion count but do
506 * not release the buffer even if the count goes to 0. If the buffer is not
507 * modified within the transaction, decrement the recursion count and
508 * release the buffer if the recursion count goes to 0.
509 *
510 * If the buffer is to be released and it was not modified before
511 * this transaction began, then free the buf_log_item associated with it.
512 *
513 * If the transaction pointer is NULL, make this just a normal
514 * brelse() call.
515 */
516void
517xfs_trans_brelse(xfs_trans_t *tp,
518 xfs_buf_t *bp)
519{
520 xfs_buf_log_item_t *bip;
521 xfs_log_item_t *lip;
522 xfs_log_item_desc_t *lidp;
523
524 /*
525 * Default to a normal brelse() call if the tp is NULL.
526 */
527 if (tp == NULL) {
528 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
529 /*
530 * If there's a buf log item attached to the buffer,
531 * then let the AIL know that the buffer is being
532 * unlocked.
533 */
534 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
535 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
536 if (lip->li_type == XFS_LI_BUF) {
537 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
538 xfs_trans_unlocked_item(
539 bip->bli_item.li_mountp,
540 lip);
541 }
542 }
543 xfs_buf_relse(bp);
544 return;
545 }
546
547 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
548 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
549 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
550 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
551 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
552 ASSERT(atomic_read(&bip->bli_refcount) > 0);
553
554 /*
555 * Find the item descriptor pointing to this buffer's
556 * log item. It must be there.
557 */
558 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
559 ASSERT(lidp != NULL);
560
561 /*
562 * If the release is just for a recursive lock,
563 * then decrement the count and return.
564 */
565 if (bip->bli_recur > 0) {
566 bip->bli_recur--;
567 xfs_buf_item_trace("RELSE RECUR", bip);
568 return;
569 }
570
571 /*
572 * If the buffer is dirty within this transaction, we can't
573 * release it until we commit.
574 */
575 if (lidp->lid_flags & XFS_LID_DIRTY) {
576 xfs_buf_item_trace("RELSE DIRTY", bip);
577 return;
578 }
579
580 /*
581 * If the buffer has been invalidated, then we can't release
582 * it until the transaction commits to disk unless it is re-dirtied
583 * as part of this transaction. This prevents us from pulling
584 * the item from the AIL before we should.
585 */
586 if (bip->bli_flags & XFS_BLI_STALE) {
587 xfs_buf_item_trace("RELSE STALE", bip);
588 return;
589 }
590
591 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
592 xfs_buf_item_trace("RELSE", bip);
593
594 /*
595 * Free up the log item descriptor tracking the released item.
596 */
597 xfs_trans_free_item(tp, lidp);
598
599 /*
600 * Clear the hold flag in the buf log item if it is set.
601 * We wouldn't want the next user of the buffer to
602 * get confused.
603 */
604 if (bip->bli_flags & XFS_BLI_HOLD) {
605 bip->bli_flags &= ~XFS_BLI_HOLD;
606 }
607
608 /*
609 * Drop our reference to the buf log item.
610 */
611 atomic_dec(&bip->bli_refcount);
612
613 /*
614 * If the buf item is not tracking data in the log, then
615 * we must free it before releasing the buffer back to the
616 * free pool. Before releasing the buffer to the free pool,
617 * clear the transaction pointer in b_fsprivate2 to dissolve
618 * its relation to this transaction.
619 */
620 if (!xfs_buf_item_dirty(bip)) {
621/***
622 ASSERT(bp->b_pincount == 0);
623***/
624 ASSERT(atomic_read(&bip->bli_refcount) == 0);
625 ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
626 ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
627 xfs_buf_item_relse(bp);
628 bip = NULL;
629 }
630 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
631
632 /*
633 * If we've still got a buf log item on the buffer, then
634 * tell the AIL that the buffer is being unlocked.
635 */
636 if (bip != NULL) {
637 xfs_trans_unlocked_item(bip->bli_item.li_mountp,
638 (xfs_log_item_t*)bip);
639 }
640
641 xfs_buf_relse(bp);
642 return;
643}
644
645/*
646 * Add the locked buffer to the transaction.
647 * The buffer must be locked, and it cannot be associated with any
648 * transaction.
649 *
650 * If the buffer does not yet have a buf log item associated with it,
651 * then allocate one for it. Then add the buf item to the transaction.
652 */
653void
654xfs_trans_bjoin(xfs_trans_t *tp,
655 xfs_buf_t *bp)
656{
657 xfs_buf_log_item_t *bip;
658
659 ASSERT(XFS_BUF_ISBUSY(bp));
660 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
661
662 /*
663 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
664 * it doesn't have one yet, then allocate one and initialize it.
665 * The checks to see if one is there are in xfs_buf_item_init().
666 */
667 xfs_buf_item_init(bp, tp->t_mountp);
668 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
669 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
670 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
671 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
672
673 /*
674 * Take a reference for this transaction on the buf item.
675 */
676 atomic_inc(&bip->bli_refcount);
677
678 /*
679 * Get a log_item_desc to point at the new item.
680 */
681 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
682
683 /*
684 * Initialize b_fsprivate2 so we can find it with incore_match()
685 * in xfs_trans_get_buf() and friends above.
686 */
687 XFS_BUF_SET_FSPRIVATE2(bp, tp);
688
689 xfs_buf_item_trace("BJOIN", bip);
690}
691
692/*
693 * Mark the buffer as not needing to be unlocked when the buf item's
694 * IOP_UNLOCK() routine is called. The buffer must already be locked
695 * and associated with the given transaction.
696 */
697/* ARGSUSED */
698void
699xfs_trans_bhold(xfs_trans_t *tp,
700 xfs_buf_t *bp)
701{
702 xfs_buf_log_item_t *bip;
703
704 ASSERT(XFS_BUF_ISBUSY(bp));
705 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
706 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
707
708 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
709 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
710 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
711 ASSERT(atomic_read(&bip->bli_refcount) > 0);
712 bip->bli_flags |= XFS_BLI_HOLD;
713 xfs_buf_item_trace("BHOLD", bip);
714}
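/*
 * Illustrative pairing (a sketch, not code from this file): a caller
 * that spans two transactions holds the buffer across the first
 * commit, then joins it to the follow-up transaction:
 *
 *	xfs_trans_bhold(tp, bp);
 *	error = xfs_trans_commit(tp, 0, NULL);
 *	... allocate and reserve the follow-up transaction ntp ...
 *	xfs_trans_bjoin(ntp, bp);
 */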
715
716/*
717 * This is called to mark bytes first through last inclusive of the given
718 * buffer as needing to be logged when the transaction is committed.
719 * The buffer must already be associated with the given transaction.
720 *
721 * First and last are numbers relative to the beginning of this buffer,
722 * so the first byte in the buffer is numbered 0 regardless of the
723 * value of b_blkno.
724 */
725void
726xfs_trans_log_buf(xfs_trans_t *tp,
727 xfs_buf_t *bp,
728 uint first,
729 uint last)
730{
731 xfs_buf_log_item_t *bip;
732 xfs_log_item_desc_t *lidp;
733
734 ASSERT(XFS_BUF_ISBUSY(bp));
735 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
736 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
737 ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
738 ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) ||
739 (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks));
740
741 /*
742 * Mark the buffer as needing to be written out eventually,
743 * and set its iodone function to remove the buffer's buf log
744 * item from the AIL and free it when the buffer is flushed
745 * to disk. See xfs_buf_attach_iodone() for more details
746 * on li_cb and xfs_buf_iodone_callbacks().
747 * If we end up aborting this transaction, we trap this buffer
748 * inside the b_bdstrat callback so that this won't get written to
749 * disk.
750 */
751 XFS_BUF_DELAYWRITE(bp);
752 XFS_BUF_DONE(bp);
753
754 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
755 ASSERT(atomic_read(&bip->bli_refcount) > 0);
756 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
757 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
758
759 /*
760 * If we invalidated the buffer within this transaction, then
761 * cancel the invalidation now that we're dirtying the buffer
762 * again. There are no races with the code in xfs_buf_item_unpin(),
763 * because we have a reference to the buffer this entire time.
764 */
765 if (bip->bli_flags & XFS_BLI_STALE) {
766 xfs_buf_item_trace("BLOG UNSTALE", bip);
767 bip->bli_flags &= ~XFS_BLI_STALE;
768 ASSERT(XFS_BUF_ISSTALE(bp));
769 XFS_BUF_UNSTALE(bp);
770 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
771 }
772
773 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
774 ASSERT(lidp != NULL);
775
776 tp->t_flags |= XFS_TRANS_DIRTY;
777 lidp->lid_flags |= XFS_LID_DIRTY;
778 lidp->lid_flags &= ~XFS_LID_BUF_STALE;
779 bip->bli_flags |= XFS_BLI_LOGGED;
780 xfs_buf_item_log(bip, first, last);
781 xfs_buf_item_trace("BLOG", bip);
782}
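/*
 * Illustrative call (a sketch): having modified the first 128 bytes
 * of a buffer owned by this transaction, a caller logs that range
 * with byte offsets relative to the start of the buffer:
 *
 *	xfs_trans_log_buf(tp, bp, 0, 127);
 */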
783
784
785/*
 786 * This is called to invalidate a buffer that is being used within
787 * a transaction. Typically this is because the blocks in the
788 * buffer are being freed, so we need to prevent it from being
789 * written out when we're done. Allowing it to be written again
790 * might overwrite data in the free blocks if they are reallocated
791 * to a file.
792 *
793 * We prevent the buffer from being written out by clearing the
794 * B_DELWRI flag. We can't always
795 * get rid of the buf log item at this point, though, because
796 * the buffer may still be pinned by another transaction. If that
797 * is the case, then we'll wait until the buffer is committed to
798 * disk for the last time (we can tell by the ref count) and
799 * free it in xfs_buf_item_unpin(). Until it is cleaned up we
800 * will keep the buffer locked so that the buffer and buf log item
801 * are not reused.
802 */
803void
804xfs_trans_binval(
805 xfs_trans_t *tp,
806 xfs_buf_t *bp)
807{
808 xfs_log_item_desc_t *lidp;
809 xfs_buf_log_item_t *bip;
810
811 ASSERT(XFS_BUF_ISBUSY(bp));
812 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
813 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
814
815 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
816 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
817 ASSERT(lidp != NULL);
818 ASSERT(atomic_read(&bip->bli_refcount) > 0);
819
820 if (bip->bli_flags & XFS_BLI_STALE) {
821 /*
822 * If the buffer is already invalidated, then
823 * just return.
824 */
825 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
826 ASSERT(XFS_BUF_ISSTALE(bp));
827 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
828 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
829 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
830 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
831 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
832 xfs_buftrace("XFS_BINVAL RECUR", bp);
833 xfs_buf_item_trace("BINVAL RECUR", bip);
834 return;
835 }
836
837 /*
838 * Clear the dirty bit in the buffer and set the STALE flag
839 * in the buf log item. The STALE flag will be used in
840 * xfs_buf_item_unpin() to determine if it should clean up
841 * when the last reference to the buf item is given up.
842 * We set the XFS_BLI_CANCEL flag in the buf log format structure
843 * and log the buf item. This will be used at recovery time
844 * to determine that copies of the buffer in the log before
845 * this should not be replayed.
846 * We mark the item descriptor and the transaction dirty so
847 * that we'll hold the buffer until after the commit.
848 *
849 * Since we're invalidating the buffer, we also clear the state
850 * about which parts of the buffer have been logged. We also
851 * clear the flag indicating that this is an inode buffer since
852 * the data in the buffer will no longer be valid.
853 *
854 * We set the stale bit in the buffer as well since we're getting
855 * rid of it.
856 */
857 XFS_BUF_UNDELAYWRITE(bp);
858 XFS_BUF_STALE(bp);
859 bip->bli_flags |= XFS_BLI_STALE;
860 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
861 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
862 bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
863 memset((char *)(bip->bli_format.blf_data_map), 0,
864 (bip->bli_format.blf_map_size * sizeof(uint)));
865 lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE;
866 tp->t_flags |= XFS_TRANS_DIRTY;
867 xfs_buftrace("XFS_BINVAL", bp);
868 xfs_buf_item_trace("BINVAL", bip);
869}
870
871/*
872 * This call is used to indicate that the buffer contains on-disk
873 * inodes which must be handled specially during recovery. They
874 * require special handling because only the di_next_unlinked from
875 * the inodes in the buffer should be recovered. The rest of the
876 * data in the buffer is logged via the inodes themselves.
877 *
878 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
879 * format structure so that we'll know what to do at recovery time.
880 */
881/* ARGSUSED */
882void
883xfs_trans_inode_buf(
884 xfs_trans_t *tp,
885 xfs_buf_t *bp)
886{
887 xfs_buf_log_item_t *bip;
888
889 ASSERT(XFS_BUF_ISBUSY(bp));
890 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
891 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
892
893 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
894 ASSERT(atomic_read(&bip->bli_refcount) > 0);
895
896 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
897}
898
899/*
900 * This call is used to indicate that the buffer is going to
901 * be staled and was an inode buffer. This means it gets
902 * special processing during unpin - where any inodes
 903 * associated with the buffer should be removed from the AIL.
 904 * There is also special processing during recovery:
905 * any replay of the inodes in the buffer needs to be
906 * prevented as the buffer may have been reused.
907 */
908void
909xfs_trans_stale_inode_buf(
910 xfs_trans_t *tp,
911 xfs_buf_t *bp)
912{
913 xfs_buf_log_item_t *bip;
914
915 ASSERT(XFS_BUF_ISBUSY(bp));
916 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
917 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
918
919 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
920 ASSERT(atomic_read(&bip->bli_refcount) > 0);
921
922 bip->bli_flags |= XFS_BLI_STALE_INODE;
923 bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))
924 xfs_buf_iodone;
925}
926
927
928
929/*
930 * Mark the buffer as being one which contains newly allocated
931 * inodes. We need to make sure that even if this buffer is
932 * relogged as an 'inode buf' we still recover all of the inode
933 * images in the face of a crash. This works in coordination with
934 * xfs_buf_item_committed() to ensure that the buffer remains in the
935 * AIL at its original location even after it has been relogged.
936 */
937/* ARGSUSED */
938void
939xfs_trans_inode_alloc_buf(
940 xfs_trans_t *tp,
941 xfs_buf_t *bp)
942{
943 xfs_buf_log_item_t *bip;
944
945 ASSERT(XFS_BUF_ISBUSY(bp));
946 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
947 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
948
949 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
950 ASSERT(atomic_read(&bip->bli_refcount) > 0);
951
952 bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
953}
954
955
956/*
957 * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
958 * dquots. However, unlike in inode buffer recovery, dquot buffers get
959 * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
960 * The only thing that makes dquot buffers different from regular
961 * buffers is that we must not replay dquot bufs when recovering
962 * if a _corresponding_ quotaoff has happened. We also have to distinguish
963 * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
964 * can be turned off independently.
965 */
966/* ARGSUSED */
967void
968xfs_trans_dquot_buf(
969 xfs_trans_t *tp,
970 xfs_buf_t *bp,
971 uint type)
972{
973 xfs_buf_log_item_t *bip;
974
975 ASSERT(XFS_BUF_ISBUSY(bp));
976 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
977 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
978 ASSERT(type == XFS_BLI_UDQUOT_BUF ||
979 type == XFS_BLI_GDQUOT_BUF);
980
981 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
982 ASSERT(atomic_read(&bip->bli_refcount) > 0);
983
984 bip->bli_format.blf_flags |= type;
985}
986
987/*
988 * Check to see if a buffer matching the given parameters is already
989 * a part of the given transaction. Only check the first, embedded
990 * chunk, since we don't want to spend all day scanning large transactions.
991 */
992STATIC xfs_buf_t *
993xfs_trans_buf_item_match(
994 xfs_trans_t *tp,
995 xfs_buftarg_t *target,
996 xfs_daddr_t blkno,
997 int len)
998{
999 xfs_log_item_chunk_t *licp;
1000 xfs_log_item_desc_t *lidp;
1001 xfs_buf_log_item_t *blip;
1002 xfs_buf_t *bp;
1003 int i;
1004
1005 bp = NULL;
1006 len = BBTOB(len);
1007 licp = &tp->t_items;
1008 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
1009 for (i = 0; i < licp->lic_unused; i++) {
1010 /*
1011 * Skip unoccupied slots.
1012 */
1013 if (XFS_LIC_ISFREE(licp, i)) {
1014 continue;
1015 }
1016
1017 lidp = XFS_LIC_SLOT(licp, i);
1018 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1019 if (blip->bli_item.li_type != XFS_LI_BUF) {
1020 continue;
1021 }
1022
1023 bp = blip->bli_buf;
1024 if ((XFS_BUF_TARGET(bp) == target) &&
1025 (XFS_BUF_ADDR(bp) == blkno) &&
1026 (XFS_BUF_COUNT(bp) == len)) {
1027 /*
1028 * We found it. Break out and
1029 * return the pointer to the buffer.
1030 */
1031 break;
1032 } else {
1033 bp = NULL;
1034 }
1035 }
1036 }
1037 return bp;
1038}
1039
1040/*
1041 * Check to see if a buffer matching the given parameters is already
1042 * a part of the given transaction. Check all the chunks, we
1043 * want to be thorough.
1044 */
1045STATIC xfs_buf_t *
1046xfs_trans_buf_item_match_all(
1047 xfs_trans_t *tp,
1048 xfs_buftarg_t *target,
1049 xfs_daddr_t blkno,
1050 int len)
1051{
1052 xfs_log_item_chunk_t *licp;
1053 xfs_log_item_desc_t *lidp;
1054 xfs_buf_log_item_t *blip;
1055 xfs_buf_t *bp;
1056 int i;
1057
1058 bp = NULL;
1059 len = BBTOB(len);
1060 for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
1061 if (XFS_LIC_ARE_ALL_FREE(licp)) {
1062 ASSERT(licp == &tp->t_items);
1063 ASSERT(licp->lic_next == NULL);
1064 return NULL;
1065 }
1066 for (i = 0; i < licp->lic_unused; i++) {
1067 /*
1068 * Skip unoccupied slots.
1069 */
1070 if (XFS_LIC_ISFREE(licp, i)) {
1071 continue;
1072 }
1073
1074 lidp = XFS_LIC_SLOT(licp, i);
1075 blip = (xfs_buf_log_item_t *)lidp->lid_item;
1076 if (blip->bli_item.li_type != XFS_LI_BUF) {
1077 continue;
1078 }
1079
1080 bp = blip->bli_buf;
1081 if ((XFS_BUF_TARGET(bp) == target) &&
1082 (XFS_BUF_ADDR(bp) == blkno) &&
1083 (XFS_BUF_COUNT(bp) == len)) {
1084 /*
1085 * We found it. Break out and
1086 * return the pointer to the buffer.
1087 */
1088 return bp;
1089 }
1090 }
1091 }
1092 return NULL;
1093}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 000000000000..93259a15f983
--- /dev/null
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,156 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dmapi.h"
42#include "xfs_mount.h"
43#include "xfs_trans_priv.h"
44#include "xfs_extfree_item.h"
45
46/*
47 * This routine is called to allocate an "extent free intention"
48 * log item that will hold nextents worth of extents. The
49 * caller must use all nextents extents, because we are not
50 * flexible about this at all.
51 */
52xfs_efi_log_item_t *
53xfs_trans_get_efi(xfs_trans_t *tp,
54 uint nextents)
55{
56 xfs_efi_log_item_t *efip;
57
58 ASSERT(tp != NULL);
59 ASSERT(nextents > 0);
60
61 efip = xfs_efi_init(tp->t_mountp, nextents);
62 ASSERT(efip != NULL);
63
64 /*
65 * Get a log_item_desc to point at the new item.
66 */
67 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
68
69 return (efip);
70}
71
72/*
73 * This routine is called to indicate that the described
74 * extent is to be logged as needing to be freed. It should
75 * be called once for each extent to be freed.
76 */
77void
78xfs_trans_log_efi_extent(xfs_trans_t *tp,
79 xfs_efi_log_item_t *efip,
80 xfs_fsblock_t start_block,
81 xfs_extlen_t ext_len)
82{
83 xfs_log_item_desc_t *lidp;
84 uint next_extent;
85 xfs_extent_t *extp;
86
87 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
88 ASSERT(lidp != NULL);
89
90 tp->t_flags |= XFS_TRANS_DIRTY;
91 lidp->lid_flags |= XFS_LID_DIRTY;
92
93 next_extent = efip->efi_next_extent;
94 ASSERT(next_extent < efip->efi_format.efi_nextents);
95 extp = &(efip->efi_format.efi_extents[next_extent]);
96 extp->ext_start = start_block;
97 extp->ext_len = ext_len;
98 efip->efi_next_extent++;
99}
100
101
102/*
103 * This routine is called to allocate an "extent free done"
104 * log item that will hold nextents worth of extents. The
105 * caller must use all nextents extents, because we are not
106 * flexible about this at all.
107 */
108xfs_efd_log_item_t *
109xfs_trans_get_efd(xfs_trans_t *tp,
110 xfs_efi_log_item_t *efip,
111 uint nextents)
112{
113 xfs_efd_log_item_t *efdp;
114
115 ASSERT(tp != NULL);
116 ASSERT(nextents > 0);
117
118 efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
119 ASSERT(efdp != NULL);
120
121 /*
122 * Get a log_item_desc to point at the new item.
123 */
124 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
125
126 return (efdp);
127}
128
129/*
130 * This routine is called to indicate that the described
131 * extent is to be logged as having been freed. It should
132 * be called once for each extent freed.
133 */
134void
135xfs_trans_log_efd_extent(xfs_trans_t *tp,
136 xfs_efd_log_item_t *efdp,
137 xfs_fsblock_t start_block,
138 xfs_extlen_t ext_len)
139{
140 xfs_log_item_desc_t *lidp;
141 uint next_extent;
142 xfs_extent_t *extp;
143
144 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
145 ASSERT(lidp != NULL);
146
147 tp->t_flags |= XFS_TRANS_DIRTY;
148 lidp->lid_flags |= XFS_LID_DIRTY;
149
150 next_extent = efdp->efd_next_extent;
151 ASSERT(next_extent < efdp->efd_format.efd_nextents);
152 extp = &(efdp->efd_format.efd_extents[next_extent]);
153 extp->ext_start = start_block;
154 extp->ext_len = ext_len;
155 efdp->efd_next_extent++;
156}
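
/*
 * Illustrative sketch, not part of the original file: how the EFI/EFD
 * pair above is meant to be used.  The "intent" item is logged before
 * extents are freed and the matching "done" item after, so that log
 * recovery can redo frees that were intended but never completed.  In
 * practice the two items usually land in separate transactions of a
 * chain; this sketch compresses both steps into one for brevity, and
 * the extent values are hypothetical.
 */
static void
example_free_one_extent(
	xfs_trans_t	*tp,
	xfs_fsblock_t	bno,
	xfs_extlen_t	len)
{
	xfs_efi_log_item_t	*efip;
	xfs_efd_log_item_t	*efdp;

	/* Log the intention to free exactly one extent. */
	efip = xfs_trans_get_efi(tp, 1);
	xfs_trans_log_efi_extent(tp, efip, bno, len);

	/* ... the actual free of (bno, len) would happen here ... */

	/* Log that the free has been done. */
	efdp = xfs_trans_get_efd(tp, efip, 1);
	xfs_trans_log_efd_extent(tp, efdp, bno, len);
}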
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 000000000000..e2c3706f453d
--- /dev/null
+++ b/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,342 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_trans_priv.h"
46#include "xfs_alloc_btree.h"
47#include "xfs_bmap_btree.h"
48#include "xfs_ialloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_attr_sf.h"
52#include "xfs_dir_sf.h"
53#include "xfs_dir2_sf.h"
54#include "xfs_dinode.h"
55#include "xfs_inode_item.h"
56#include "xfs_inode.h"
57
58#ifdef XFS_TRANS_DEBUG
59STATIC void
60xfs_trans_inode_broot_debug(
61 xfs_inode_t *ip);
62#else
63#define xfs_trans_inode_broot_debug(ip)
64#endif
65
66
67/*
68 * Get and lock the inode for the caller if it is not already
69 * locked within the given transaction. If it is already locked
70 * within the transaction, just increment its lock recursion count
71 * and return a pointer to it.
72 *
73 * For an inode to be locked in a transaction, the inode lock, as
74 * opposed to the io lock, must be taken exclusively. This ensures
75 * that the inode can be involved in only 1 transaction at a time.
76 * Lock recursion is handled on the io lock, but only for lock modes
77 * of equal or lesser strength. That is, you can recur on the io lock
78 * held EXCL with a SHARED request but not vice versa. Also, if
79 * the inode is already a part of the transaction then you cannot
80 * go from not holding the io lock to having it EXCL or SHARED.
81 *
82 * Use the inode cache routine xfs_inode_incore() to find the inode
83 * if it is already owned by this transaction.
84 *
85 * If we don't already own the inode, use xfs_iget() to get it.
86 * Since the inode log item structure is embedded in the incore
87 * inode structure and is initialized when the inode is brought
88 * into memory, there is nothing to do with it here.
89 *
90 * If the given transaction pointer is NULL, just call xfs_iget().
91 * This simplifies code which must handle both cases.
92 */
93int
94xfs_trans_iget(
95 xfs_mount_t *mp,
96 xfs_trans_t *tp,
97 xfs_ino_t ino,
98 uint flags,
99 uint lock_flags,
100 xfs_inode_t **ipp)
101{
102 int error;
103 xfs_inode_t *ip;
104 xfs_inode_log_item_t *iip;
105
106 /*
107 * If the transaction pointer is NULL, just call the normal
108 * xfs_iget().
109 */
110 if (tp == NULL)
111 return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
112
113 /*
114 * If we find the inode in core with this transaction
115 * pointer in its i_transp field, then we know we already
116 * have it locked. In this case we just increment the lock
117 * recursion count and return the inode to the caller.
118 * Assert that the inode is already locked in the mode requested
119 * by the caller. We cannot do lock promotions yet, so
120 * die if someone gets this wrong.
121 */
122 if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
123 /*
124 * Make sure that the inode lock is held EXCL and
125 * that the io lock is never upgraded when the inode
126 * is already a part of the transaction.
127 */
128 ASSERT(ip->i_itemp != NULL);
129 ASSERT(lock_flags & XFS_ILOCK_EXCL);
130 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
131 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
132 ismrlocked(&ip->i_iolock, MR_UPDATE));
133 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
134 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
135 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
136 ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS)));
137 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
138 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
139
140 if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
141 ip->i_itemp->ili_iolock_recur++;
142 }
143 if (lock_flags & XFS_ILOCK_EXCL) {
144 ip->i_itemp->ili_ilock_recur++;
145 }
146 *ipp = ip;
147 return 0;
148 }
149
150 ASSERT(lock_flags & XFS_ILOCK_EXCL);
151 error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
152 if (error) {
153 return error;
154 }
155 ASSERT(ip != NULL);
156
157 /*
158 * Get a log_item_desc to point at the new item.
159 */
160 if (ip->i_itemp == NULL)
161 xfs_inode_item_init(ip, mp);
162 iip = ip->i_itemp;
163 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
164
165 xfs_trans_inode_broot_debug(ip);
166
167 /*
168 * If the IO lock has been acquired, mark that in
169 * the inode log item so we'll know to unlock it
170 * when the transaction commits.
171 */
172 ASSERT(iip->ili_flags == 0);
173 if (lock_flags & XFS_IOLOCK_EXCL) {
174 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
175 } else if (lock_flags & XFS_IOLOCK_SHARED) {
176 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
177 }
178
179 /*
180 * Initialize i_transp so we can find it with xfs_inode_incore()
181 * above.
182 */
183 ip->i_transp = tp;
184
185 *ipp = ip;
186 return 0;
187}
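
/*
 * Illustrative sketch, not part of the original file: the recursion
 * behaviour of xfs_trans_iget().  Looking up the same inode a second
 * time within one transaction does not re-lock it; it returns the
 * same incore inode and bumps ili_ilock_recur, as asserted above.
 * The inode number is a hypothetical placeholder.
 */
static int
example_iget_twice(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino)
{
	xfs_inode_t	*ip1;
	xfs_inode_t	*ip2;
	int		error;

	error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip1);
	if (error)
		return error;

	/* Same inode, same transaction: no deadlock, same pointer back. */
	error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip2);
	if (!error)
		ASSERT(ip2 == ip1);
	return error;
}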
188
189/*
190 * Add the locked inode to the transaction.
191 * The inode must be locked, and it cannot be associated with any
192 * transaction. The caller must specify the locks already held
193 * on the inode.
194 */
195void
196xfs_trans_ijoin(
197 xfs_trans_t *tp,
198 xfs_inode_t *ip,
199 uint lock_flags)
200{
201 xfs_inode_log_item_t *iip;
202
203 ASSERT(ip->i_transp == NULL);
204 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
205 ASSERT(lock_flags & XFS_ILOCK_EXCL);
206 if (ip->i_itemp == NULL)
207 xfs_inode_item_init(ip, ip->i_mount);
208 iip = ip->i_itemp;
209 ASSERT(iip->ili_flags == 0);
210 ASSERT(iip->ili_ilock_recur == 0);
211 ASSERT(iip->ili_iolock_recur == 0);
212
213 /*
214 * Get a log_item_desc to point at the new item.
215 */
216 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
217
218 xfs_trans_inode_broot_debug(ip);
219
220 /*
221 * If the IO lock is already held, mark that in the inode log item.
222 */
223 if (lock_flags & XFS_IOLOCK_EXCL) {
224 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
225 } else if (lock_flags & XFS_IOLOCK_SHARED) {
226 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
227 }
228
229 /*
230 * Initialize i_transp so we can find it with xfs_inode_incore()
231 * in xfs_trans_iget() above.
232 */
233 ip->i_transp = tp;
234}
235
236
237
238/*
239 * Mark the inode as not needing to be unlocked when the inode item's
240 * IOP_UNLOCK() routine is called. The inode must already be locked
241 * and associated with the given transaction.
242 */
243/*ARGSUSED*/
244void
245xfs_trans_ihold(
246 xfs_trans_t *tp,
247 xfs_inode_t *ip)
248{
249 ASSERT(ip->i_transp == tp);
250 ASSERT(ip->i_itemp != NULL);
251 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
252
253 ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
254}
255
256/*
257 * Cancel the previous inode hold request made on this inode
258 * for this transaction.
259 */
260/*ARGSUSED*/
261void
262xfs_trans_ihold_release(
263 xfs_trans_t *tp,
264 xfs_inode_t *ip)
265{
266 ASSERT(ip->i_transp == tp);
267 ASSERT(ip->i_itemp != NULL);
268 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
269 ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
270
271 ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD;
272}
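
/*
 * Illustrative sketch, not part of the original file: the usual
 * ijoin/ihold pairing.  Joining hands the locked inode to the
 * transaction, which would otherwise unlock it at commit time;
 * holding keeps the lock with the caller across the commit.
 * xfs_truncate_file() later in this patch uses this exact pattern.
 */
static void
example_join_and_hold(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ilock(ip, XFS_ILOCK_EXCL);		 /* caller takes the lock */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* transaction tracks it */
	xfs_trans_ihold(tp, ip);		 /* commit won't unlock it */
}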
273
274
275/*
276 * This is called to mark the fields indicated in fieldmask as needing
277 * to be logged when the transaction is committed. The inode must
278 * already be associated with the given transaction.
279 *
280 * The values for fieldmask are defined in xfs_inode_item.h. We always
281 * log all of the core inode if any of it has changed, and we always log
282 * all of the inline data/extents/b-tree root if any of them has changed.
283 */
284void
285xfs_trans_log_inode(
286 xfs_trans_t *tp,
287 xfs_inode_t *ip,
288 uint flags)
289{
290 xfs_log_item_desc_t *lidp;
291
292 ASSERT(ip->i_transp == tp);
293 ASSERT(ip->i_itemp != NULL);
294 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
295
296 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
297 ASSERT(lidp != NULL);
298
299 tp->t_flags |= XFS_TRANS_DIRTY;
300 lidp->lid_flags |= XFS_LID_DIRTY;
301
302 /*
303 * Always OR in the bits from the ili_last_fields field.
304 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
305 * routines in the eventual clearing of the ilf_fields bits.
306 * See the big comment in xfs_iflush() for an explanation of
	 307	 * this coordination mechanism.
308 */
309 flags |= ip->i_itemp->ili_last_fields;
310 ip->i_itemp->ili_format.ilf_fields |= flags;
311}
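
/*
 * Illustrative sketch, not part of the original file: the common
 * logging idiom for core inode fields.  Any change to the dinode
 * core is followed by xfs_trans_log_inode() with XFS_ILOG_CORE;
 * xfs_droplink() later in this patch does exactly this.
 */
static void
example_touch_inode(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);	/* dirty core timestamps */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}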
312
313#ifdef XFS_TRANS_DEBUG
314/*
315 * Keep track of the state of the inode btree root to make sure we
316 * log it properly.
317 */
318STATIC void
319xfs_trans_inode_broot_debug(
320 xfs_inode_t *ip)
321{
322 xfs_inode_log_item_t *iip;
323
324 ASSERT(ip->i_itemp != NULL);
325 iip = ip->i_itemp;
326 if (iip->ili_root_size != 0) {
327 ASSERT(iip->ili_orig_root != NULL);
328 kmem_free(iip->ili_orig_root, iip->ili_root_size);
329 iip->ili_root_size = 0;
330 iip->ili_orig_root = NULL;
331 }
332 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
333 ASSERT((ip->i_df.if_broot != NULL) &&
334 (ip->i_df.if_broot_bytes > 0));
335 iip->ili_root_size = ip->i_df.if_broot_bytes;
336 iip->ili_orig_root =
337 (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
338 memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot),
339 iip->ili_root_size);
340 }
341}
342#endif
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
new file mode 100644
index 000000000000..1b8a756d80ed
--- /dev/null
+++ b/fs/xfs/xfs_trans_item.c
@@ -0,0 +1,553 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39
40STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
41 int, int, xfs_lsn_t);
42
43/*
44 * This is called to add the given log item to the transaction's
45 * list of log items. It must find a free log item descriptor
46 * or allocate a new one and add the item to that descriptor.
	  47	 * The function returns a pointer to the item descriptor used to point
48 * to the new item. The log item will now point to its new descriptor
49 * with its li_desc field.
50 */
51xfs_log_item_desc_t *
52xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
53{
54 xfs_log_item_desc_t *lidp;
55 xfs_log_item_chunk_t *licp;
	  56		int			i = 0;
57
58 /*
59 * If there are no free descriptors, allocate a new chunk
60 * of them and put it at the front of the chunk list.
61 */
62 if (tp->t_items_free == 0) {
63 licp = (xfs_log_item_chunk_t*)
64 kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
65 ASSERT(licp != NULL);
66 /*
67 * Initialize the chunk, and then
68 * claim the first slot in the newly allocated chunk.
69 */
70 XFS_LIC_INIT(licp);
71 XFS_LIC_CLAIM(licp, 0);
72 licp->lic_unused = 1;
73 XFS_LIC_INIT_SLOT(licp, 0);
74 lidp = XFS_LIC_SLOT(licp, 0);
75
76 /*
77 * Link in the new chunk and update the free count.
78 */
79 licp->lic_next = tp->t_items.lic_next;
80 tp->t_items.lic_next = licp;
81 tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
82
83 /*
84 * Initialize the descriptor and the generic portion
85 * of the log item.
86 *
87 * Point the new slot at this item and return it.
88 * Also point the log item at its currently active
89 * descriptor and set the item's mount pointer.
90 */
91 lidp->lid_item = lip;
92 lidp->lid_flags = 0;
93 lidp->lid_size = 0;
94 lip->li_desc = lidp;
95 lip->li_mountp = tp->t_mountp;
96 return (lidp);
97 }
98
99 /*
100 * Find the free descriptor. It is somewhere in the chunklist
101 * of descriptors.
102 */
103 licp = &tp->t_items;
104 while (licp != NULL) {
105 if (XFS_LIC_VACANCY(licp)) {
106 if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
107 i = licp->lic_unused;
108 ASSERT(XFS_LIC_ISFREE(licp, i));
109 break;
110 }
111 for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
112 if (XFS_LIC_ISFREE(licp, i))
113 break;
114 }
115 ASSERT(i <= XFS_LIC_MAX_SLOT);
116 break;
117 }
118 licp = licp->lic_next;
119 }
120 ASSERT(licp != NULL);
121 /*
122 * If we find a free descriptor, claim it,
123 * initialize it, and return it.
124 */
125 XFS_LIC_CLAIM(licp, i);
126 if (licp->lic_unused <= i) {
127 licp->lic_unused = i + 1;
128 XFS_LIC_INIT_SLOT(licp, i);
129 }
130 lidp = XFS_LIC_SLOT(licp, i);
131 tp->t_items_free--;
132 lidp->lid_item = lip;
133 lidp->lid_flags = 0;
134 lidp->lid_size = 0;
135 lip->li_desc = lidp;
136 lip->li_mountp = tp->t_mountp;
137 return (lidp);
138}
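
/*
 * Illustrative, self-contained miniature, not part of the original
 * file, of the slot bookkeeping used above.  The real XFS_LIC_*
 * macros live in xfs_trans.h; this standalone userspace sketch only
 * assumes they behave like a per-chunk free bitmask plus the
 * "lic_unused" high-water mark visible in the code.
 */
#include <assert.h>

#define MINI_NSLOTS	15	/* stand-in for XFS_LIC_NUM_SLOTS */

struct mini_chunk {
	unsigned int	free_mask;	/* bit set => slot is free */
	int		unused;		/* first never-used slot */
};

static void mini_init(struct mini_chunk *c)
{
	c->free_mask = (1u << MINI_NSLOTS) - 1;	/* all slots free */
	c->unused = 0;
}

static int mini_claim(struct mini_chunk *c)	/* returns slot or -1 */
{
	int i;

	for (i = 0; i < MINI_NSLOTS; i++) {
		if (c->free_mask & (1u << i)) {
			c->free_mask &= ~(1u << i);
			if (c->unused <= i)	/* advance high-water mark */
				c->unused = i + 1;
			return i;
		}
	}
	return -1;			/* chunk full: allocate a new one */
}

static void mini_release(struct mini_chunk *c, int slot)
{
	c->free_mask |= 1u << slot;	/* mark the slot free again */
}

int main(void)
{
	struct mini_chunk c;
	int a, b;

	mini_init(&c);
	a = mini_claim(&c);		/* slot 0 */
	b = mini_claim(&c);		/* slot 1 */
	mini_release(&c, a);
	assert(mini_claim(&c) == a);	/* a freed slot is reused first */
	assert(c.unused == b + 1);	/* high-water mark is unchanged */
	return 0;
}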
139
140/*
141 * Free the given descriptor.
142 *
143 * This requires setting the bit in the chunk's free mask corresponding
144 * to the given slot.
145 */
146void
147xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
148{
149 uint slot;
150 xfs_log_item_chunk_t *licp;
151 xfs_log_item_chunk_t **licpp;
152
153 slot = XFS_LIC_DESC_TO_SLOT(lidp);
154 licp = XFS_LIC_DESC_TO_CHUNK(lidp);
155 XFS_LIC_RELSE(licp, slot);
156 lidp->lid_item->li_desc = NULL;
157 tp->t_items_free++;
158
159 /*
160 * If there are no more used items in the chunk and this is not
161 * the chunk embedded in the transaction structure, then free
162 * the chunk. First pull it from the chunk list and then
163 * free it back to the heap. We didn't bother with a doubly
164 * linked list here because the lists should be very short
165 * and this is not a performance path. It's better to save
166 * the memory of the extra pointer.
167 *
168 * Also decrement the transaction structure's count of free items
169 * by the number in a chunk since we are freeing an empty chunk.
170 */
171 if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
172 licpp = &(tp->t_items.lic_next);
173 while (*licpp != licp) {
174 ASSERT(*licpp != NULL);
175 licpp = &((*licpp)->lic_next);
176 }
177 *licpp = licp->lic_next;
178 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
179 tp->t_items_free -= XFS_LIC_NUM_SLOTS;
180 }
181}
182
183/*
184 * This is called to find the descriptor corresponding to the given
185 * log item. It returns a pointer to the descriptor.
186 * The log item MUST have a corresponding descriptor in the given
	 187	 * transaction. This routine does not return NULL; it panics.
188 *
189 * The descriptor pointer is kept in the log item's li_desc field.
190 * Just return it.
191 */
192/*ARGSUSED*/
193xfs_log_item_desc_t *
194xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
195{
196 ASSERT(lip->li_desc != NULL);
197
198 return (lip->li_desc);
199}
200
201
202/*
203 * Return a pointer to the first descriptor in the chunk list.
	 204	 * An empty list is a bug: this asserts in debug builds, else warns and returns NULL.
205 *
206 * The first descriptor must be in either the first or second chunk.
207 * This is because the only chunk allowed to be empty is the first.
208 * All others are freed when they become empty.
209 *
210 * At some point this and xfs_trans_next_item() should be optimized
211 * to quickly look at the mask to determine if there is anything to
212 * look at.
213 */
214xfs_log_item_desc_t *
215xfs_trans_first_item(xfs_trans_t *tp)
216{
217 xfs_log_item_chunk_t *licp;
218 int i;
219
220 licp = &tp->t_items;
221 /*
222 * If it's not in the first chunk, skip to the second.
223 */
224 if (XFS_LIC_ARE_ALL_FREE(licp)) {
225 licp = licp->lic_next;
226 }
227
228 /*
229 * Return the first non-free descriptor in the chunk.
230 */
231 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
232 for (i = 0; i < licp->lic_unused; i++) {
233 if (XFS_LIC_ISFREE(licp, i)) {
234 continue;
235 }
236
237 return (XFS_LIC_SLOT(licp, i));
238 }
239 cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
240 return(NULL);
241}
242
243
244/*
245 * Given a descriptor, return the next descriptor in the chunk list.
246 * This returns NULL if there are no more used descriptors in the list.
247 *
248 * We do this by first locating the chunk in which the descriptor resides,
249 * and then scanning forward in the chunk and the list for the next
250 * used descriptor.
251 */
252/*ARGSUSED*/
253xfs_log_item_desc_t *
254xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
255{
256 xfs_log_item_chunk_t *licp;
257 int i;
258
259 licp = XFS_LIC_DESC_TO_CHUNK(lidp);
260
261 /*
262 * First search the rest of the chunk. The for loop keeps us
263 * from referencing things beyond the end of the chunk.
264 */
265 for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
266 if (XFS_LIC_ISFREE(licp, i)) {
267 continue;
268 }
269
270 return (XFS_LIC_SLOT(licp, i));
271 }
272
273 /*
274 * Now search the next chunk. It must be there, because the
275 * next chunk would have been freed if it were empty.
276 * If there is no next chunk, return NULL.
277 */
278 if (licp->lic_next == NULL) {
279 return (NULL);
280 }
281
282 licp = licp->lic_next;
283 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
284 for (i = 0; i < licp->lic_unused; i++) {
285 if (XFS_LIC_ISFREE(licp, i)) {
286 continue;
287 }
288
289 return (XFS_LIC_SLOT(licp, i));
290 }
291 ASSERT(0);
292 /* NOTREACHED */
	 293	return NULL;	/* keep gcc quiet */
294}
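
/*
 * Illustrative sketch, not part of the original file: the iteration
 * idiom these two routines support.  Callers only walk transactions
 * known to hold at least one item, since xfs_trans_first_item()
 * complains on an empty list.
 */
static void
example_walk_items(xfs_trans_t *tp)
{
	xfs_log_item_desc_t	*lidp;

	for (lidp = xfs_trans_first_item(tp);
	     lidp != NULL;
	     lidp = xfs_trans_next_item(tp, lidp)) {
		/* operate on lidp->lid_item here */
	}
}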
295
296/*
297 * This is called to unlock all of the items of a transaction and to free
298 * all the descriptors of that transaction.
299 *
300 * It walks the list of descriptors and unlocks each item. It frees
301 * each chunk except that embedded in the transaction as it goes along.
302 */
303void
304xfs_trans_free_items(
305 xfs_trans_t *tp,
306 int flags)
307{
308 xfs_log_item_chunk_t *licp;
309 xfs_log_item_chunk_t *next_licp;
310 int abort;
311
312 abort = flags & XFS_TRANS_ABORT;
313 licp = &tp->t_items;
314 /*
315 * Special case the embedded chunk so we don't free it below.
316 */
317 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
318 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
319 XFS_LIC_ALL_FREE(licp);
320 licp->lic_unused = 0;
321 }
322 licp = licp->lic_next;
323
324 /*
325 * Unlock each item in each chunk and free the chunks.
326 */
327 while (licp != NULL) {
328 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
329 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
330 next_licp = licp->lic_next;
331 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
332 licp = next_licp;
333 }
334
335 /*
336 * Reset the transaction structure's free item count.
337 */
338 tp->t_items_free = XFS_LIC_NUM_SLOTS;
339 tp->t_items.lic_next = NULL;
340}
341
342
343
344/*
345 * This is called to unlock the items associated with a transaction.
346 * Items which were not logged should be freed.
347 * Those which were logged must still be tracked so they can be unpinned
348 * when the transaction commits.
349 */
350void
351xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
352{
353 xfs_log_item_chunk_t *licp;
354 xfs_log_item_chunk_t *next_licp;
355 xfs_log_item_chunk_t **licpp;
356 int freed;
357
358 freed = 0;
359 licp = &tp->t_items;
360
361 /*
	 362	 * Special case the embedded chunk so we don't free it below.
363 */
364 if (!XFS_LIC_ARE_ALL_FREE(licp)) {
365 freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
366 }
367 licpp = &(tp->t_items.lic_next);
368 licp = licp->lic_next;
369
370 /*
371 * Unlock each item in each chunk, free non-dirty descriptors,
372 * and free empty chunks.
373 */
374 while (licp != NULL) {
375 ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
376 freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
377 next_licp = licp->lic_next;
378 if (XFS_LIC_ARE_ALL_FREE(licp)) {
379 *licpp = next_licp;
380 kmem_free(licp, sizeof(xfs_log_item_chunk_t));
381 freed -= XFS_LIC_NUM_SLOTS;
382 } else {
383 licpp = &(licp->lic_next);
384 }
385 ASSERT(*licpp == next_licp);
386 licp = next_licp;
387 }
388
389 /*
390 * Fix the free descriptor count in the transaction.
391 */
392 tp->t_items_free += freed;
393}
394
395/*
396 * Unlock each item pointed to by a descriptor in the given chunk.
397 * Stamp the commit lsn into each item if necessary.
398 * Free descriptors pointing to items which are not dirty if freeing_chunk
399 * is zero. If freeing_chunk is non-zero, then we need to unlock all
400 * items in the chunk.
401 *
402 * Return the number of descriptors freed.
403 */
404STATIC int
405xfs_trans_unlock_chunk(
406 xfs_log_item_chunk_t *licp,
407 int freeing_chunk,
408 int abort,
409 xfs_lsn_t commit_lsn)
410{
411 xfs_log_item_desc_t *lidp;
412 xfs_log_item_t *lip;
413 int i;
414 int freed;
415
416 freed = 0;
417 lidp = licp->lic_descs;
418 for (i = 0; i < licp->lic_unused; i++, lidp++) {
419 if (XFS_LIC_ISFREE(licp, i)) {
420 continue;
421 }
422 lip = lidp->lid_item;
423 lip->li_desc = NULL;
424
425 if (commit_lsn != NULLCOMMITLSN)
426 IOP_COMMITTING(lip, commit_lsn);
427 if (abort)
428 lip->li_flags |= XFS_LI_ABORTED;
429 IOP_UNLOCK(lip);
430
431 /*
432 * Free the descriptor if the item is not dirty
433 * within this transaction and the caller is not
434 * going to just free the entire thing regardless.
435 */
436 if (!(freeing_chunk) &&
437 (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
438 XFS_LIC_RELSE(licp, i);
439 freed++;
440 }
441 }
442
443 return (freed);
444}
445
446
447/*
448 * This is called to add the given busy item to the transaction's
449 * list of busy items. It must find a free busy item descriptor
450 * or allocate a new one and add the item to that descriptor.
	 451	 * list of busy items. The function returns a pointer to the busy
	 452	 * slot used to record the new busy entry, claiming a free slot in
	 453	 * an existing chunk or allocating a new chunk when none is free.
454 */
455xfs_log_busy_slot_t *
456xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
457{
458 xfs_log_busy_chunk_t *lbcp;
459 xfs_log_busy_slot_t *lbsp;
	 460		int			i = 0;
461
462 /*
463 * If there are no free descriptors, allocate a new chunk
464 * of them and put it at the front of the chunk list.
465 */
466 if (tp->t_busy_free == 0) {
467 lbcp = (xfs_log_busy_chunk_t*)
468 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
469 ASSERT(lbcp != NULL);
470 /*
471 * Initialize the chunk, and then
472 * claim the first slot in the newly allocated chunk.
473 */
474 XFS_LBC_INIT(lbcp);
475 XFS_LBC_CLAIM(lbcp, 0);
476 lbcp->lbc_unused = 1;
477 lbsp = XFS_LBC_SLOT(lbcp, 0);
478
479 /*
480 * Link in the new chunk and update the free count.
481 */
482 lbcp->lbc_next = tp->t_busy.lbc_next;
483 tp->t_busy.lbc_next = lbcp;
	 484			tp->t_busy_free = XFS_LBC_NUM_SLOTS - 1;
485
486 /*
487 * Initialize the descriptor and the generic portion
488 * of the log item.
489 *
490 * Point the new slot at this item and return it.
491 * Also point the log item at its currently active
492 * descriptor and set the item's mount pointer.
493 */
494 lbsp->lbc_ag = ag;
495 lbsp->lbc_idx = idx;
496 return (lbsp);
497 }
498
499 /*
500 * Find the free descriptor. It is somewhere in the chunklist
501 * of descriptors.
502 */
503 lbcp = &tp->t_busy;
504 while (lbcp != NULL) {
505 if (XFS_LBC_VACANCY(lbcp)) {
506 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
507 i = lbcp->lbc_unused;
508 break;
509 } else {
510 /* out-of-order vacancy */
511 printk("OOO vacancy lbcp 0x%p\n", lbcp);
512 ASSERT(0);
513 }
514 }
515 lbcp = lbcp->lbc_next;
516 }
517 ASSERT(lbcp != NULL);
518 /*
519 * If we find a free descriptor, claim it,
520 * initialize it, and return it.
521 */
522 XFS_LBC_CLAIM(lbcp, i);
523 if (lbcp->lbc_unused <= i) {
524 lbcp->lbc_unused = i + 1;
525 }
526 lbsp = XFS_LBC_SLOT(lbcp, i);
527 tp->t_busy_free--;
528 lbsp->lbc_ag = ag;
529 lbsp->lbc_idx = idx;
530 return (lbsp);
531}
532
533
534/*
535 * xfs_trans_free_busy
536 * Free all of the busy lists from a transaction
537 */
538void
539xfs_trans_free_busy(xfs_trans_t *tp)
540{
541 xfs_log_busy_chunk_t *lbcp;
542 xfs_log_busy_chunk_t *lbcq;
543
544 lbcp = tp->t_busy.lbc_next;
545 while (lbcp != NULL) {
546 lbcq = lbcp->lbc_next;
547 kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
548 lbcp = lbcq;
549 }
550
551 XFS_LBC_INIT(&tp->t_busy);
552 tp->t_busy.lbc_unused = 0;
553}
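
/*
 * Illustrative sketch, not part of the original file: how the busy
 * list above is meant to be used.  When an extent is freed, its AG
 * number and index are recorded so the space is not handed out again
 * before the freeing transaction is safely on disk; the whole list
 * is torn down once the transaction completes.  The values here are
 * hypothetical.
 */
static void
example_mark_busy(
	xfs_trans_t	*tp,
	xfs_agnumber_t	agno,
	xfs_extlen_t	idx)
{
	(void) xfs_trans_add_busy(tp, agno, idx);

	/* ... and after commit processing: xfs_trans_free_busy(tp); */
}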
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000000..d4dae7d06afc
--- /dev/null
+++ b/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,73 @@
1/*
2 * Copyright (c) 2000, 2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_PRIV_H__
33#define __XFS_TRANS_PRIV_H__
34
35struct xfs_log_item;
36struct xfs_log_item_desc;
37struct xfs_mount;
38struct xfs_trans;
39
40/*
41 * From xfs_trans_item.c
42 */
43struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *,
44 struct xfs_log_item *);
45void xfs_trans_free_item(struct xfs_trans *,
46 struct xfs_log_item_desc *);
47struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
48 struct xfs_log_item *);
49struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
50struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
51 struct xfs_log_item_desc *);
52void xfs_trans_free_items(struct xfs_trans *, int);
53void xfs_trans_unlock_items(struct xfs_trans *,
54 xfs_lsn_t);
55void xfs_trans_free_busy(xfs_trans_t *tp);
56xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
57 xfs_agnumber_t ag,
58 xfs_extlen_t idx);
59
60/*
61 * From xfs_trans_ail.c
62 */
63void xfs_trans_update_ail(struct xfs_mount *,
64 struct xfs_log_item *, xfs_lsn_t,
65 unsigned long);
66void xfs_trans_delete_ail(struct xfs_mount *,
67 struct xfs_log_item *, unsigned long);
68struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
69struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
70 struct xfs_log_item *, int *, int *);
71
72
73#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
new file mode 100644
index 000000000000..e91d173f4ed3
--- /dev/null
+++ b/fs/xfs/xfs_trans_space.h
@@ -0,0 +1,105 @@
1/*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TRANS_SPACE_H__
33#define __XFS_TRANS_SPACE_H__
34
35/*
36 * Components of space reservations.
37 */
38#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \
39 (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
40#define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1)
41#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
42 (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
43 XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
44 XFS_EXTENTADD_SPACE_RES(mp,w))
45#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
46#define XFS_DAENTER_DBS(mp,w) \
47 (XFS_DA_NODE_MAXDEPTH + \
48 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
49#define XFS_DAENTER_BLOCKS(mp,w) \
50 (XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
51#define XFS_DAENTER_BMAP1B(mp,w) \
52 XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
53#define XFS_DAENTER_BMAPS(mp,w) \
54 (XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
55#define XFS_DAENTER_SPACE_RES(mp,w) \
56 (XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
57#define XFS_DAREMOVE_SPACE_RES(mp,w) XFS_DAENTER_BMAPS(mp,w)
58#define XFS_DIRENTER_MAX_SPLIT(mp,nl) \
59 (((mp)->m_sb.sb_blocksize == 512 && \
60 XFS_DIR_IS_V1(mp) && \
61 (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
62#define XFS_DIRENTER_SPACE_RES(mp,nl) \
63 (XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
64 XFS_DIRENTER_MAX_SPLIT(mp,nl))
65#define XFS_DIRREMOVE_SPACE_RES(mp) \
66 XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
67#define XFS_IALLOC_SPACE_RES(mp) \
68 (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
69
70/*
71 * Space reservation values for various transactions.
72 */
73#define XFS_ADDAFORK_SPACE_RES(mp) \
74 ((mp)->m_dirblkfsbs + \
75 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
76#define XFS_ATTRRM_SPACE_RES(mp) \
77 XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
78/* This macro is not used - see inline code in xfs_attr_set */
79#define XFS_ATTRSET_SPACE_RES(mp, v) \
80 (XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
81#define XFS_CREATE_SPACE_RES(mp,nl) \
82 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
83#define XFS_DIOSTRAT_SPACE_RES(mp, v) \
84 (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
85#define XFS_GROWFS_SPACE_RES(mp) \
86 (2 * XFS_AG_MAXLEVELS(mp))
87#define XFS_GROWFSRT_SPACE_RES(mp,b) \
88 ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
89#define XFS_LINK_SPACE_RES(mp,nl) \
90 XFS_DIRENTER_SPACE_RES(mp,nl)
91#define XFS_MKDIR_SPACE_RES(mp,nl) \
92 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
93#define XFS_QM_DQALLOC_SPACE_RES(mp) \
94 (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
95 XFS_DQUOT_CLUSTER_SIZE_FSB)
96#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
97 XFS_IALLOC_SPACE_RES(mp)
98#define XFS_REMOVE_SPACE_RES(mp) \
99 XFS_DIRREMOVE_SPACE_RES(mp)
100#define XFS_RENAME_SPACE_RES(mp,nl) \
101 (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
102#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
103 (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
104
105#endif /* __XFS_TRANS_SPACE_H__ */
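
/*
 * Illustrative sketch, not part of the original header: how these
 * macros are meant to be consumed.  The worst-case block reservation
 * is computed up front and passed to xfs_trans_reserve() before the
 * transaction dirties anything.  The zero log reservation arguments
 * are hypothetical placeholders; real callers pass XFS_*_LOG_RES()
 * values, as xfs_truncate_file() in this patch does.
 */
static int
example_reserve_for_create(
	xfs_trans_t	*tp,
	xfs_mount_t	*mp,
	int		namelen)
{
	uint	blocks = XFS_CREATE_SPACE_RES(mp, namelen);

	return xfs_trans_reserve(tp, blocks, 0, 0, 0, 0);
}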
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
new file mode 100644
index 000000000000..04609d27ea51
--- /dev/null
+++ b/fs/xfs/xfs_types.h
@@ -0,0 +1,182 @@
1/*
2 * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_TYPES_H__
33#define __XFS_TYPES_H__
34
35#ifdef __KERNEL__
36
37/*
38 * POSIX Extensions
39 */
40typedef unsigned char uchar_t;
41typedef unsigned short ushort_t;
42typedef unsigned int uint_t;
43typedef unsigned long ulong_t;
44
45/*
46 * Additional type declarations for XFS
47 */
48typedef signed char __int8_t;
49typedef unsigned char __uint8_t;
50typedef signed short int __int16_t;
51typedef unsigned short int __uint16_t;
52typedef signed int __int32_t;
53typedef unsigned int __uint32_t;
54typedef signed long long int __int64_t;
55typedef unsigned long long int __uint64_t;
56
	  57typedef enum { B_FALSE, B_TRUE } boolean_t;
58typedef __int64_t prid_t; /* project ID */
59typedef __uint32_t inst_t; /* an instruction */
60
61typedef __s64 xfs_off_t; /* <file offset> type */
62typedef __u64 xfs_ino_t; /* <inode> type */
63typedef __s64 xfs_daddr_t; /* <disk address> type */
64typedef char * xfs_caddr_t; /* <core address> type */
65typedef __u32 xfs_dev_t;
66
67/* __psint_t is the same size as a pointer */
68#if (BITS_PER_LONG == 32)
69typedef __int32_t __psint_t;
70typedef __uint32_t __psunsigned_t;
71#elif (BITS_PER_LONG == 64)
72typedef __int64_t __psint_t;
73typedef __uint64_t __psunsigned_t;
74#else
75#error BITS_PER_LONG must be 32 or 64
76#endif
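
/*
 * Illustrative sketch, not part of the original header: the point of
 * __psint_t/__psunsigned_t is that a pointer value round-trips
 * through them without truncation on both 32-bit and 64-bit builds.
 */
static inline void *
example_psint_roundtrip(void *p)
{
	__psint_t	v = (__psint_t)p;	/* same width as a pointer */

	return (void *)v;			/* value is preserved */
}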
77
78#endif /* __KERNEL__ */
79
80typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
81typedef __uint32_t xfs_extlen_t; /* extent length in blocks */
82typedef __uint32_t xfs_agnumber_t; /* allocation group number */
83typedef __int32_t xfs_extnum_t; /* # of extents in a file */
84typedef __int16_t xfs_aextnum_t; /* # extents in an attribute fork */
85typedef __int64_t xfs_fsize_t; /* bytes in a file */
86typedef __uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
87
88typedef __int32_t xfs_suminfo_t; /* type of bitmap summary info */
89typedef __int32_t xfs_rtword_t; /* word type for bitmap manipulations */
90
91typedef __int64_t xfs_lsn_t; /* log sequence number */
92typedef __int32_t xfs_tid_t; /* transaction identifier */
93
94typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
95typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
96
97typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
98
99/*
100 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
101 * Disk based types:
102 */
103typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
104typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
105typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
106typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
107typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
108
109/*
110 * Memory based types are conditional.
111 */
112#if XFS_BIG_BLKNOS
113typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
114typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
115typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
116typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
117#else
118typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
119typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
120typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
121typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
122#endif
123typedef __uint64_t xfs_fileoff_t; /* block number in a file */
124typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
125typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
126
127typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */
128
129/*
130 * Null values for the types.
131 */
132#define NULLDFSBNO ((xfs_dfsbno_t)-1)
133#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
134#define NULLDRTBNO ((xfs_drtbno_t)-1)
135#define NULLDFILOFF ((xfs_dfiloff_t)-1)
136
137#define NULLFSBLOCK ((xfs_fsblock_t)-1)
138#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
139#define NULLRTBLOCK ((xfs_rtblock_t)-1)
140#define NULLFILEOFF ((xfs_fileoff_t)-1)
141
142#define NULLAGBLOCK ((xfs_agblock_t)-1)
143#define NULLAGNUMBER ((xfs_agnumber_t)-1)
144#define NULLEXTNUM ((xfs_extnum_t)-1)
145
146#define NULLCOMMITLSN ((xfs_lsn_t)-1)
147
148/*
149 * Max values for extlen, extnum, aextnum.
150 */
151#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
152#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
153#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
154
155/*
156 * MAXNAMELEN is the length (including the terminating null) of
157 * the longest permissible file (component) name.
158 */
159#define MAXNAMELEN 256
160
161typedef struct xfs_dirent { /* data from readdir() */
162 xfs_ino_t d_ino; /* inode number of entry */
163 xfs_off_t d_off; /* offset of disk directory entry */
164 unsigned short d_reclen; /* length of this record */
165 char d_name[1]; /* name of file */
166} xfs_dirent_t;
167
168#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0)
169#define DIRENTSIZE(namelen) \
170 ((DIRENTBASESIZE + (namelen) + \
171 sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1))
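
/*
 * Worked example, not part of the original header, assuming the
 * usual layout with 8-byte xfs_ino_t/xfs_off_t and a 2-byte
 * d_reclen, so DIRENTBASESIZE is 18:
 *
 *	DIRENTSIZE(5) = (18 + 5 + 8) & ~7 = 31 & ~7 = 24
 *
 * Adding sizeof(xfs_off_t) before masking both covers the name's
 * NUL terminator and rounds the record up to 8-byte alignment.
 */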
172
173typedef enum {
174 XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
175} xfs_lookup_t;
176
177typedef enum {
178 XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
179 XFS_BTNUM_MAX
180} xfs_btnum_t;
181
182#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
new file mode 100644
index 000000000000..816b945fa0ea
--- /dev/null
+++ b/fs/xfs/xfs_utils.c
@@ -0,0 +1,488 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_dir.h"
41#include "xfs_dir2.h"
42#include "xfs_dmapi.h"
43#include "xfs_mount.h"
44#include "xfs_bmap_btree.h"
45#include "xfs_attr_sf.h"
46#include "xfs_dir_sf.h"
47#include "xfs_dir2_sf.h"
48#include "xfs_dinode.h"
49#include "xfs_inode_item.h"
50#include "xfs_inode.h"
51#include "xfs_bmap.h"
52#include "xfs_error.h"
53#include "xfs_quota.h"
54#include "xfs_rw.h"
55#include "xfs_itable.h"
56#include "xfs_utils.h"
57
58/*
	  59 * xfs_get_dir_entry is used to get a reference to an inode given
	  60 * the name of the file.  It takes a hold on the vnode behind the
	  61 * name and returns the corresponding incore inode.  The child
	  62 * inode is not locked; that is left to a later call to
	  63 * xfs_lock_dir_and_entry.
64 */
65int
66xfs_get_dir_entry(
67 vname_t *dentry,
68 xfs_inode_t **ipp)
69{
70 vnode_t *vp;
71 bhv_desc_t *bdp;
72
73 vp = VNAME_TO_VNODE(dentry);
74 bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
75 if (!bdp) {
76 *ipp = NULL;
77 return XFS_ERROR(ENOENT);
78 }
79 VN_HOLD(vp);
80 *ipp = XFS_BHVTOI(bdp);
81 return 0;
82}
83
84int
85xfs_dir_lookup_int(
86 bhv_desc_t *dir_bdp,
87 uint lock_mode,
88 vname_t *dentry,
89 xfs_ino_t *inum,
90 xfs_inode_t **ipp)
91{
92 vnode_t *dir_vp;
93 xfs_inode_t *dp;
94 int error;
95
96 dir_vp = BHV_TO_VNODE(dir_bdp);
97 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
98
99 dp = XFS_BHVTOI(dir_bdp);
100
101 error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp,
102 VNAME(dentry), VNAMELEN(dentry), inum);
103 if (!error) {
104 /*
105 * Unlock the directory. We do this because we can't
106 * hold the directory lock while doing the vn_get()
107 * in xfs_iget(). Doing so could cause us to hold
108 * a lock while waiting for the inode to finish
109 * being inactive while it's waiting for a log
110 * reservation in the inactive routine.
111 */
112 xfs_iunlock(dp, lock_mode);
113 error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0);
114 xfs_ilock(dp, lock_mode);
115
116 if (error) {
117 *ipp = NULL;
118 } else if ((*ipp)->i_d.di_mode == 0) {
119 /*
120 * The inode has been freed. Something is
121 * wrong so just get out of here.
122 */
123 xfs_iunlock(dp, lock_mode);
124 xfs_iput_new(*ipp, 0);
125 *ipp = NULL;
126 xfs_ilock(dp, lock_mode);
127 error = XFS_ERROR(ENOENT);
128 }
129 }
130 return error;
131}
132
133/*
	 134 * Allocates a new inode from disk and returns a pointer to the
135 * incore copy. This routine will internally commit the current
136 * transaction and allocate a new one if the Space Manager needed
137 * to do an allocation to replenish the inode free-list.
138 *
139 * This routine is designed to be called from xfs_create and
140 * xfs_create_dir.
141 *
142 */
143int
144xfs_dir_ialloc(
145 xfs_trans_t **tpp, /* input: current transaction;
146 output: may be a new transaction. */
	 147	xfs_inode_t	*dp,		/* directory within which to
	 148					   allocate the inode. */
149 mode_t mode,
150 nlink_t nlink,
151 xfs_dev_t rdev,
152 cred_t *credp,
153 prid_t prid, /* project id */
154 int okalloc, /* ok to allocate new space */
155 xfs_inode_t **ipp, /* pointer to inode; it will be
156 locked. */
157 int *committed)
158
159{
160 xfs_trans_t *tp;
161 xfs_trans_t *ntp;
162 xfs_inode_t *ip;
163 xfs_buf_t *ialloc_context = NULL;
164 boolean_t call_again = B_FALSE;
165 int code;
166 uint log_res;
167 uint log_count;
168 void *dqinfo;
169 uint tflags;
170
171 tp = *tpp;
172 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
173
174 /*
175 * xfs_ialloc will return a pointer to an incore inode if
176 * the Space Manager has an available inode on the free
177 * list. Otherwise, it will do an allocation and replenish
178 * the freelist. Since we can only do one allocation per
179 * transaction without deadlocks, we will need to commit the
180 * current transaction and start a new one. We will then
181 * need to call xfs_ialloc again to get the inode.
182 *
183 * If xfs_ialloc did an allocation to replenish the freelist,
184 * it returns the bp containing the head of the freelist as
185 * ialloc_context. We will hold a lock on it across the
186 * transaction commit so that no other process can steal
187 * the inode(s) that we've just allocated.
188 */
189 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
190 &ialloc_context, &call_again, &ip);
191
192 /*
193 * Return an error if we were unable to allocate a new inode.
194 * This should only happen if we run out of space on disk or
195 * encounter a disk error.
196 */
197 if (code) {
198 *ipp = NULL;
199 return code;
200 }
201 if (!call_again && (ip == NULL)) {
202 *ipp = NULL;
203 return XFS_ERROR(ENOSPC);
204 }
205
206 /*
207 * If call_again is set, then we were unable to get an
208 * inode in one operation. We need to commit the current
209 * transaction and call xfs_ialloc() again. It is guaranteed
210 * to succeed the second time.
211 */
212 if (call_again) {
213
214 /*
215 * Normally, xfs_trans_commit releases all the locks.
216 * We call bhold to hang on to the ialloc_context across
217 * the commit. Holding this buffer prevents any other
218 * processes from doing any allocations in this
219 * allocation group.
220 */
221 xfs_trans_bhold(tp, ialloc_context);
222 /*
	 223		 * Save the log reservation and the log count so we
	 224		 * can use them in the next transaction.
225 */
226 log_res = xfs_trans_get_log_res(tp);
227 log_count = xfs_trans_get_log_count(tp);
228
229 /*
230 * We want the quota changes to be associated with the next
231 * transaction, NOT this one. So, detach the dqinfo from this
232 * and attach it to the next transaction.
233 */
234 dqinfo = NULL;
235 tflags = 0;
236 if (tp->t_dqinfo) {
237 dqinfo = (void *)tp->t_dqinfo;
238 tp->t_dqinfo = NULL;
239 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
240 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
241 }
242
243 ntp = xfs_trans_dup(tp);
244 code = xfs_trans_commit(tp, 0, NULL);
245 tp = ntp;
246 if (committed != NULL) {
247 *committed = 1;
248 }
249 /*
250 * If we get an error during the commit processing,
251 * release the buffer that is still held and return
252 * to the caller.
253 */
254 if (code) {
255 xfs_buf_relse(ialloc_context);
256 if (dqinfo) {
257 tp->t_dqinfo = dqinfo;
258 XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
259 }
260 *tpp = ntp;
261 *ipp = NULL;
262 return code;
263 }
264 code = xfs_trans_reserve(tp, 0, log_res, 0,
265 XFS_TRANS_PERM_LOG_RES, log_count);
266 /*
267 * Re-attach the quota info that we detached from prev trx.
268 */
269 if (dqinfo) {
270 tp->t_dqinfo = dqinfo;
271 tp->t_flags |= tflags;
272 }
273
274 if (code) {
275 xfs_buf_relse(ialloc_context);
276 *tpp = ntp;
277 *ipp = NULL;
278 return code;
279 }
280 xfs_trans_bjoin(tp, ialloc_context);
281
282 /*
283 * Call ialloc again. Since we've locked out all
284 * other allocations in this allocation group,
285 * this call should always succeed.
286 */
287 code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
288 okalloc, &ialloc_context, &call_again, &ip);
289
290 /*
291 * If we get an error at this point, return to the caller
292 * so that the current transaction can be aborted.
293 */
294 if (code) {
295 *tpp = tp;
296 *ipp = NULL;
297 return code;
298 }
299 ASSERT ((!call_again) && (ip != NULL));
300
301 } else {
302 if (committed != NULL) {
303 *committed = 0;
304 }
305 }
306
307 *ipp = ip;
308 *tpp = tp;
309
310 return 0;
311}
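
/*
 * Illustrative sketch, not part of the original file, of the
 * commit-and-continue pattern used by xfs_dir_ialloc() above: hold a
 * buffer across the commit, duplicate the transaction, commit the
 * old one, re-reserve in the new one, and rejoin the buffer.  The
 * zero reservation values are hypothetical placeholders.
 */
static int
example_roll_trans(
	xfs_trans_t	**tpp,
	xfs_buf_t	*bp)
{
	xfs_trans_t	*ntp;
	int		code;

	xfs_trans_bhold(*tpp, bp);	/* keep bp locked past commit */
	ntp = xfs_trans_dup(*tpp);
	code = xfs_trans_commit(*tpp, 0, NULL);
	*tpp = ntp;
	if (code) {
		xfs_buf_relse(bp);	/* nobody will unlock it for us */
		return code;
	}
	code = xfs_trans_reserve(ntp, 0, 0, 0, XFS_TRANS_PERM_LOG_RES, 0);
	if (code) {
		xfs_buf_relse(bp);
		return code;
	}
	xfs_trans_bjoin(ntp, bp);	/* re-associate bp with ntp */
	return 0;
}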
312
313/*
314 * Decrement the link count on an inode & log the change.
315 * If this causes the link count to go to zero, initiate the
316 * logging activity required to truncate a file.
317 */
318int /* error */
319xfs_droplink(
320 xfs_trans_t *tp,
321 xfs_inode_t *ip)
322{
323 int error;
324
325 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
326
327 ASSERT (ip->i_d.di_nlink > 0);
328 ip->i_d.di_nlink--;
329 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
330
331 error = 0;
332 if (ip->i_d.di_nlink == 0) {
333 /*
334 * We're dropping the last link to this file.
335 * Move the on-disk inode to the AGI unlinked list.
336 * From xfs_inactive() we will pull the inode from
337 * the list and free it.
338 */
339 error = xfs_iunlink(tp, ip);
340 }
341 return error;
342}
343
344/*
345 * This gets called when the inode's version needs to be changed from 1 to 2.
346 * Currently this happens when the nlink field overflows the old 16-bit value
347 * or when chproj is called to change the project for the first time.
348 * As a side effect the superblock version will also get rev'd
349 * to contain the NLINK bit.
350 */
351void
352xfs_bump_ino_vers2(
353 xfs_trans_t *tp,
354 xfs_inode_t *ip)
355{
356 xfs_mount_t *mp;
357 unsigned long s;
358
359 ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
360 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
361
362 ip->i_d.di_version = XFS_DINODE_VERSION_2;
363 ip->i_d.di_onlink = 0;
364 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
365 mp = tp->t_mountp;
366 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
367 s = XFS_SB_LOCK(mp);
368 if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
369 XFS_SB_VERSION_ADDNLINK(&mp->m_sb);
370 XFS_SB_UNLOCK(mp, s);
371 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
372 } else {
373 XFS_SB_UNLOCK(mp, s);
374 }
375 }
376 /* Caller must log the inode */
377}
378
379/*
380 * Increment the link count on an inode & log the change.
381 */
382int
383xfs_bumplink(
384 xfs_trans_t *tp,
385 xfs_inode_t *ip)
386{
387 if (ip->i_d.di_nlink >= XFS_MAXLINK)
388 return XFS_ERROR(EMLINK);
389 xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
390
391 ASSERT(ip->i_d.di_nlink > 0);
392 ip->i_d.di_nlink++;
393 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
394 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
395 /*
396 * The inode has increased its number of links beyond
397 * what can fit in an old format inode. It now needs
398 * to be converted to a version 2 inode with a 32 bit
399 * link count. If this is the first inode in the file
400 * system to do this, then we need to bump the superblock
401 * version number as well.
402 */
403 xfs_bump_ino_vers2(tp, ip);
404 }
405
406 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
407 return 0;
408}
409
410/*
411 * Try to truncate the given file to 0 length. Currently called
412 * only out of xfs_remove when it has to truncate a file to free
413 * up space for the remove to proceed.
414 */
415int
416xfs_truncate_file(
417 xfs_mount_t *mp,
418 xfs_inode_t *ip)
419{
420 xfs_trans_t *tp;
421 int error;
422
423#ifdef QUOTADEBUG
424 /*
425 * This is called to truncate the quotainodes too.
426 */
427 if (XFS_IS_UQUOTA_ON(mp)) {
428 if (ip->i_ino != mp->m_sb.sb_uquotino)
429 ASSERT(ip->i_udquot);
430 }
431 if (XFS_IS_GQUOTA_ON(mp)) {
432 if (ip->i_ino != mp->m_sb.sb_gquotino)
433 ASSERT(ip->i_gdquot);
434 }
435#endif
436 /*
437 * Make the call to xfs_itruncate_start before starting the
438 * transaction, because we cannot make the call while we're
439 * in a transaction.
440 */
441 xfs_ilock(ip, XFS_IOLOCK_EXCL);
442 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
443
444 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
445 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
446 XFS_TRANS_PERM_LOG_RES,
447 XFS_ITRUNCATE_LOG_COUNT))) {
448 xfs_trans_cancel(tp, 0);
449 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
450 return error;
451 }
452
453 /*
454 * Follow the normal truncate locking protocol. Since we
	 455	 * hold the inode in the transaction, we know that its number
456 * of references will stay constant.
457 */
458 xfs_ilock(ip, XFS_ILOCK_EXCL);
459 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
460 xfs_trans_ihold(tp, ip);
461 /*
	 462	 * Signal a sync transaction.  The only case where sync isn't
	 463	 * needed is if we're truncating an already unlinked file
	 464	 * on a wsync fs.  In that case, we know the blocks can't
	 465	 * reappear in the file because the links to the file are
466 * permanently toast. Currently, we're always going to
467 * want a sync transaction because this code is being
468 * called from places where nlink is guaranteed to be 1
469 * but I'm leaving the tests in to protect against future
470 * changes -- rcc.
471 */
472 error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
473 XFS_DATA_FORK,
474 ((ip->i_d.di_nlink != 0 ||
475 !(mp->m_flags & XFS_MOUNT_WSYNC))
476 ? 1 : 0));
477 if (error) {
478 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
479 XFS_TRANS_ABORT);
480 } else {
481 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
482 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
483 NULL);
484 }
485 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
486
487 return error;
488}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
new file mode 100644
index 000000000000..e1ed6a588000
--- /dev/null
+++ b/fs/xfs/xfs_utils.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32#ifndef __XFS_UTILS_H__
33#define __XFS_UTILS_H__
34
35#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
36#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
37#define ITRACE(ip) vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
38 (inst_t *)__return_address)
39
40extern int xfs_rename (bhv_desc_t *, vname_t *, vnode_t *, vname_t *, cred_t *);
41extern int xfs_get_dir_entry (vname_t *, xfs_inode_t **);
42extern int xfs_dir_lookup_int (bhv_desc_t *, uint, vname_t *, xfs_ino_t *,
43 xfs_inode_t **);
44extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *);
45extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, nlink_t,
46 xfs_dev_t, cred_t *, prid_t, int,
47 xfs_inode_t **, int *);
48extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *);
49extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *);
50extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *);
51
52#endif /* __XFS_UTILS_H__ */
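/*
 * Note on the helpers above (a hedged sketch, not part of the original
 * header): IHOLD()/IRELE() map inode reference counting onto the vnode
 * layer via XFS_ITOV(), so every IHOLD() must be balanced by an IRELE():
 *
 *	IHOLD(ip);
 *	(use ip across an operation that may block)
 *	IRELE(ip);
 */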
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
new file mode 100644
index 000000000000..00aae9c6a904
--- /dev/null
+++ b/fs/xfs/xfs_vfsops.c
@@ -0,0 +1,1941 @@
1/*
2 * XFS filesystem operations.
3 *
4 * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 * Further, this software is distributed without any warranty that it is
15 * free of the rightful claim of any third person regarding infringement
16 * or the like. Any license provided herein, whether implied or
17 * otherwise, applies only to this software file. Patent licenses, if
18 * any, provided herein do not apply to combinations of this program with
19 * other software, or any other product whatsoever.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write the Free Software Foundation, Inc., 59
23 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
24 *
25 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
26 * Mountain View, CA 94043, or:
27 *
28 * http://www.sgi.com
29 *
30 * For further information regarding this notice, see:
31 *
32 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
33 */
34
35#include "xfs.h"
36#include "xfs_macros.h"
37#include "xfs_types.h"
38#include "xfs_inum.h"
39#include "xfs_log.h"
40#include "xfs_trans.h"
41#include "xfs_sb.h"
42#include "xfs_dir.h"
43#include "xfs_dir2.h"
44#include "xfs_dmapi.h"
45#include "xfs_mount.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_alloc_btree.h"
49#include "xfs_btree.h"
50#include "xfs_alloc.h"
51#include "xfs_ialloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_ag.h"
59#include "xfs_error.h"
60#include "xfs_bmap.h"
61#include "xfs_da_btree.h"
62#include "xfs_rw.h"
63#include "xfs_refcache.h"
64#include "xfs_buf_item.h"
65#include "xfs_extfree_item.h"
66#include "xfs_quota.h"
67#include "xfs_dir2_trace.h"
68#include "xfs_acl.h"
69#include "xfs_attr.h"
70#include "xfs_clnt.h"
71#include "xfs_log_priv.h"
72
73STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
74
75int
76xfs_init(void)
77{
78 extern kmem_zone_t *xfs_bmap_free_item_zone;
79 extern kmem_zone_t *xfs_btree_cur_zone;
80 extern kmem_zone_t *xfs_trans_zone;
81 extern kmem_zone_t *xfs_buf_item_zone;
82 extern kmem_zone_t *xfs_dabuf_zone;
83#ifdef XFS_DABUF_DEBUG
84 extern lock_t xfs_dabuf_global_lock;
85 spinlock_init(&xfs_dabuf_global_lock, "xfsda");
86#endif
87
88 /*
89 * Initialize all of the zone allocators we use.
90 */
91 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
92 "xfs_bmap_free_item");
93 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
94 "xfs_btree_cur");
95 xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode");
96 xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
97 xfs_da_state_zone =
98 kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
99 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
100
101 /*
102	 * The size of the zone-allocated buf log item is the maximum
103 * size possible under XFS. This wastes a little bit of memory,
104 * but it is much faster.
105 */
106 xfs_buf_item_zone =
107 kmem_zone_init((sizeof(xfs_buf_log_item_t) +
108 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
109 NBWORD) * sizeof(int))),
110 "xfs_buf_item");
111 xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
112 ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
113 "xfs_efd_item");
114 xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
115 ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
116 "xfs_efi_item");
117 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
118 xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili");
119 xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t),
120 "xfs_chashlist");
121 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
122
123 /*
124 * Allocate global trace buffers.
125 */
126#ifdef XFS_ALLOC_TRACE
127 xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
128#endif
129#ifdef XFS_BMAP_TRACE
130 xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
131#endif
132#ifdef XFS_BMBT_TRACE
133 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
134#endif
135#ifdef XFS_DIR_TRACE
136 xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
137#endif
138#ifdef XFS_ATTR_TRACE
139 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
140#endif
141#ifdef XFS_DIR2_TRACE
142 xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
143#endif
144
145 xfs_dir_startup();
146
147#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
148 xfs_error_test_init();
149#endif /* DEBUG || INDUCE_IO_ERROR */
150
151 xfs_init_procfs();
152 xfs_sysctl_register();
153 return 0;
154}
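/*
 * Sizing note for xfs_buf_item_zone above (illustrative arithmetic; the
 * concrete constants here are assumptions, not taken from this file):
 * with XFS_MAX_BLOCKSIZE = 65536, XFS_BLI_CHUNK = 128 and NBWORD = 32,
 * the trailing dirty bitmap adds ((65536 / 128) / 32) * sizeof(int) =
 * 16 * 4 = 64 bytes to sizeof(xfs_buf_log_item_t), so one zone object
 * can track the largest possible buffer without reallocation.
 */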
155
156void
157xfs_cleanup(void)
158{
159 extern kmem_zone_t *xfs_bmap_free_item_zone;
160 extern kmem_zone_t *xfs_btree_cur_zone;
161 extern kmem_zone_t *xfs_inode_zone;
162 extern kmem_zone_t *xfs_trans_zone;
163 extern kmem_zone_t *xfs_da_state_zone;
164 extern kmem_zone_t *xfs_dabuf_zone;
165 extern kmem_zone_t *xfs_efd_zone;
166 extern kmem_zone_t *xfs_efi_zone;
167 extern kmem_zone_t *xfs_buf_item_zone;
168 extern kmem_zone_t *xfs_chashlist_zone;
169
170 xfs_cleanup_procfs();
171 xfs_sysctl_unregister();
172 xfs_refcache_destroy();
173 xfs_acl_zone_destroy(xfs_acl_zone);
174
175#ifdef XFS_DIR2_TRACE
176 ktrace_free(xfs_dir2_trace_buf);
177#endif
178#ifdef XFS_ATTR_TRACE
179 ktrace_free(xfs_attr_trace_buf);
180#endif
181#ifdef XFS_DIR_TRACE
182 ktrace_free(xfs_dir_trace_buf);
183#endif
184#ifdef XFS_BMBT_TRACE
185 ktrace_free(xfs_bmbt_trace_buf);
186#endif
187#ifdef XFS_BMAP_TRACE
188 ktrace_free(xfs_bmap_trace_buf);
189#endif
190#ifdef XFS_ALLOC_TRACE
191 ktrace_free(xfs_alloc_trace_buf);
192#endif
193
194 kmem_cache_destroy(xfs_bmap_free_item_zone);
195 kmem_cache_destroy(xfs_btree_cur_zone);
196 kmem_cache_destroy(xfs_inode_zone);
197 kmem_cache_destroy(xfs_trans_zone);
198 kmem_cache_destroy(xfs_da_state_zone);
199 kmem_cache_destroy(xfs_dabuf_zone);
200 kmem_cache_destroy(xfs_buf_item_zone);
201 kmem_cache_destroy(xfs_efd_zone);
202 kmem_cache_destroy(xfs_efi_zone);
203 kmem_cache_destroy(xfs_ifork_zone);
204 kmem_cache_destroy(xfs_ili_zone);
205 kmem_cache_destroy(xfs_chashlist_zone);
206}
207
208/*
209 * xfs_start_flags
210 *
211 * This function fills in xfs_mount_t fields based on mount args.
212 * Note: the superblock has _not_ yet been read in.
213 */
214STATIC int
215xfs_start_flags(
216 struct vfs *vfs,
217 struct xfs_mount_args *ap,
218 struct xfs_mount *mp)
219{
220 /* Values are in BBs */
221 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
222 /*
223 * At this point the superblock has not been read
224 * in, therefore we do not know the block size.
225 * Before the mount call ends we will convert
226 * these to FSBs.
227 */
228 mp->m_dalign = ap->sunit;
229 mp->m_swidth = ap->swidth;
230 }
231
232 if (ap->logbufs != -1 &&
233#if defined(DEBUG) || defined(XLOG_NOLOG)
234 ap->logbufs != 0 &&
235#endif
236 (ap->logbufs < XLOG_MIN_ICLOGS ||
237 ap->logbufs > XLOG_MAX_ICLOGS)) {
238 cmn_err(CE_WARN,
239 "XFS: invalid logbufs value: %d [not %d-%d]",
240 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
241 return XFS_ERROR(EINVAL);
242 }
243 mp->m_logbufs = ap->logbufs;
244 if (ap->logbufsize != -1 &&
245 ap->logbufsize != 16 * 1024 &&
246 ap->logbufsize != 32 * 1024 &&
247 ap->logbufsize != 64 * 1024 &&
248 ap->logbufsize != 128 * 1024 &&
249 ap->logbufsize != 256 * 1024) {
250 cmn_err(CE_WARN,
251 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
252 ap->logbufsize);
253 return XFS_ERROR(EINVAL);
254 }
255 mp->m_ihsize = ap->ihashsize;
256 mp->m_logbsize = ap->logbufsize;
257 mp->m_fsname_len = strlen(ap->fsname) + 1;
258 mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
259 strcpy(mp->m_fsname, ap->fsname);
260
261 if (ap->flags & XFSMNT_WSYNC)
262 mp->m_flags |= XFS_MOUNT_WSYNC;
263#if XFS_BIG_INUMS
264 if (ap->flags & XFSMNT_INO64) {
265 mp->m_flags |= XFS_MOUNT_INO64;
266 mp->m_inoadd = XFS_INO64_OFFSET;
267 }
268#endif
269 if (ap->flags & XFSMNT_NOATIME)
270 mp->m_flags |= XFS_MOUNT_NOATIME;
271
272 if (ap->flags & XFSMNT_RETERR)
273 mp->m_flags |= XFS_MOUNT_RETERR;
274
275 if (ap->flags & XFSMNT_NOALIGN)
276 mp->m_flags |= XFS_MOUNT_NOALIGN;
277
278 if (ap->flags & XFSMNT_SWALLOC)
279 mp->m_flags |= XFS_MOUNT_SWALLOC;
280
281 if (ap->flags & XFSMNT_OSYNCISOSYNC)
282 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
283
284 if (ap->flags & XFSMNT_32BITINODES)
285 mp->m_flags |= (XFS_MOUNT_32BITINODES | XFS_MOUNT_32BITINOOPT);
286
287 if (ap->flags & XFSMNT_IOSIZE) {
288 if (ap->iosizelog > XFS_MAX_IO_LOG ||
289 ap->iosizelog < XFS_MIN_IO_LOG) {
290 cmn_err(CE_WARN,
291 "XFS: invalid log iosize: %d [not %d-%d]",
292 ap->iosizelog, XFS_MIN_IO_LOG,
293 XFS_MAX_IO_LOG);
294 return XFS_ERROR(EINVAL);
295 }
296
297 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
298 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
299 }
300
301 if (ap->flags & XFSMNT_IHASHSIZE)
302 mp->m_flags |= XFS_MOUNT_IHASHSIZE;
303
304 if (ap->flags & XFSMNT_IDELETE)
305 mp->m_flags |= XFS_MOUNT_IDELETE;
306
307 if (ap->flags & XFSMNT_DIRSYNC)
308 mp->m_flags |= XFS_MOUNT_DIRSYNC;
309
310 /*
311 * no recovery flag requires a read-only mount
312 */
313 if (ap->flags & XFSMNT_NORECOVERY) {
314 if (!(vfs->vfs_flag & VFS_RDONLY)) {
315 cmn_err(CE_WARN,
316 "XFS: tried to mount a FS read-write without recovery!");
317 return XFS_ERROR(EINVAL);
318 }
319 mp->m_flags |= XFS_MOUNT_NORECOVERY;
320 }
321
322 if (ap->flags & XFSMNT_NOUUID)
323 mp->m_flags |= XFS_MOUNT_NOUUID;
324 if (ap->flags & XFSMNT_NOLOGFLUSH)
325 mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
326
327 return 0;
328}
329
330/*
331 * This function fills in xfs_mount_t fields based on mount args.
332 * Note: the superblock _has_ now been read in.
333 */
334STATIC int
335xfs_finish_flags(
336 struct vfs *vfs,
337 struct xfs_mount_args *ap,
338 struct xfs_mount *mp)
339{
340 int ronly = (vfs->vfs_flag & VFS_RDONLY);
341
342	/* Fail a mount where the logbuf is smaller than the log stripe */
343 if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
344 if ((ap->logbufsize == -1) &&
345 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
346 mp->m_logbsize = mp->m_sb.sb_logsunit;
347 } else if (ap->logbufsize < mp->m_sb.sb_logsunit) {
348 cmn_err(CE_WARN,
349 "XFS: logbuf size must be greater than or equal to log stripe size");
350 return XFS_ERROR(EINVAL);
351 }
352 } else {
353 /* Fail a mount if the logbuf is larger than 32K */
354 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
355 cmn_err(CE_WARN,
356 "XFS: logbuf size for version 1 logs must be 16K or 32K");
357 return XFS_ERROR(EINVAL);
358 }
359 }
360
361 /*
362 * prohibit r/w mounts of read-only filesystems
363 */
364 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
365 cmn_err(CE_WARN,
366 "XFS: cannot mount a read-only filesystem as read-write");
367 return XFS_ERROR(EROFS);
368 }
369
370 /*
371 * disallow mount attempts with (IRIX) project quota enabled
372 */
373 if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
374 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) {
375 cmn_err(CE_WARN,
376 "XFS: cannot mount a filesystem with IRIX project quota enabled");
377 return XFS_ERROR(ENOSYS);
378 }
379
380 /*
381 * check for shared mount.
382 */
383 if (ap->flags & XFSMNT_SHARED) {
384 if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb))
385 return XFS_ERROR(EINVAL);
386
387 /*
388 * For IRIX 6.5, shared mounts must have the shared
389 * version bit set, have the persistent readonly
390 * field set, must be version 0 and can only be mounted
391 * read-only.
392 */
393 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
394 (mp->m_sb.sb_shared_vn != 0))
395 return XFS_ERROR(EINVAL);
396
397 mp->m_flags |= XFS_MOUNT_SHARED;
398
399 /*
400 * Shared XFS V0 can't deal with DMI. Return EINVAL.
401 */
402 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
403 return XFS_ERROR(EINVAL);
404 }
405
406 return 0;
407}
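/*
 * Worked example for the v2-log check above (numbers are hypothetical):
 * on a filesystem with sb_logsunit = 64k, a mount passing logbufsize=32k
 * is rejected, while a mount that left logbufsize at the -1 default gets
 * m_logbsize silently bumped to the 64k log stripe unit.
 */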
408
409/*
410 * xfs_mount
411 *
412 * The file system configurations are:
413 * (1) device (partition) with data and internal log
414 * (2) logical volume with data and log subvolumes.
415 * (3) logical volume with data, log, and realtime subvolumes.
416 *
417 * We only have to handle opening the log and realtime volumes here if
418 * they are present. The data subvolume has already been opened by
419 * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev.
420 */
421STATIC int
422xfs_mount(
423 struct bhv_desc *bhvp,
424 struct xfs_mount_args *args,
425 cred_t *credp)
426{
427 struct vfs *vfsp = bhvtovfs(bhvp);
428 struct bhv_desc *p;
429 struct xfs_mount *mp = XFS_BHVTOM(bhvp);
430 struct block_device *ddev, *logdev, *rtdev;
431 int flags = 0, error;
432
433 ddev = vfsp->vfs_super->s_bdev;
434 logdev = rtdev = NULL;
435
436 /*
437 * Setup xfs_mount function vectors from available behaviors
438 */
439 p = vfs_bhv_lookup(vfsp, VFS_POSITION_DM);
440 mp->m_dm_ops = p ? *(xfs_dmops_t *) vfs_bhv_custom(p) : xfs_dmcore_stub;
441 p = vfs_bhv_lookup(vfsp, VFS_POSITION_QM);
442 mp->m_qm_ops = p ? *(xfs_qmops_t *) vfs_bhv_custom(p) : xfs_qmcore_stub;
443 p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO);
444 mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs;
445
446 /*
447 * Open real time and log devices - order is important.
448 */
449 if (args->logname[0]) {
450 error = xfs_blkdev_get(mp, args->logname, &logdev);
451 if (error)
452 return error;
453 }
454 if (args->rtname[0]) {
455 error = xfs_blkdev_get(mp, args->rtname, &rtdev);
456 if (error) {
457 xfs_blkdev_put(logdev);
458 return error;
459 }
460
461 if (rtdev == ddev || rtdev == logdev) {
462 cmn_err(CE_WARN,
463 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
464 xfs_blkdev_put(logdev);
465 xfs_blkdev_put(rtdev);
466 return EINVAL;
467 }
468 }
469
470 /*
471 * Setup xfs_mount buffer target pointers
472 */
473 error = ENOMEM;
474 mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
475 if (!mp->m_ddev_targp) {
476 xfs_blkdev_put(logdev);
477 xfs_blkdev_put(rtdev);
478 return error;
479 }
480 if (rtdev) {
481 mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
482 if (!mp->m_rtdev_targp)
483 goto error0;
484 }
485 mp->m_logdev_targp = (logdev && logdev != ddev) ?
486 xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp;
487 if (!mp->m_logdev_targp)
488 goto error0;
489
490 /*
491 * Setup flags based on mount(2) options and then the superblock
492 */
493 error = xfs_start_flags(vfsp, args, mp);
494 if (error)
495 goto error1;
496 error = xfs_readsb(mp);
497 if (error)
498 goto error1;
499 error = xfs_finish_flags(vfsp, args, mp);
500 if (error)
501 goto error2;
502
503 /*
504 * Setup xfs_mount buffer target pointers based on superblock
505 */
506 error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
507 mp->m_sb.sb_sectsize);
508 if (!error && logdev && logdev != ddev) {
509 unsigned int log_sector_size = BBSIZE;
510
511 if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb))
512 log_sector_size = mp->m_sb.sb_logsectsize;
513 error = xfs_setsize_buftarg(mp->m_logdev_targp,
514 mp->m_sb.sb_blocksize,
515 log_sector_size);
516 }
517 if (!error && rtdev)
518 error = xfs_setsize_buftarg(mp->m_rtdev_targp,
519 mp->m_sb.sb_blocksize,
520 mp->m_sb.sb_sectsize);
521 if (error)
522 goto error2;
523
524 error = XFS_IOINIT(vfsp, args, flags);
525 if (!error)
526 return 0;
527error2:
528 if (mp->m_sb_bp)
529 xfs_freesb(mp);
530error1:
531 xfs_binval(mp->m_ddev_targp);
532 if (logdev && logdev != ddev)
533 xfs_binval(mp->m_logdev_targp);
534 if (rtdev)
535 xfs_binval(mp->m_rtdev_targp);
536error0:
537 xfs_unmountfs_close(mp, credp);
538 return error;
539}
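/*
 * Note on the unwind order above (a sketch of the intent, not a spec):
 * the error labels are arranged so later failures fall through all of
 * the earlier cleanup -- error2 frees the in-core superblock, error1
 * invalidates buffers cached against the data, log and realtime
 * targets, and error0 closes the underlying devices via
 * xfs_unmountfs_close().
 */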
540
541STATIC int
542xfs_unmount(
543 bhv_desc_t *bdp,
544 int flags,
545 cred_t *credp)
546{
547 struct vfs *vfsp = bhvtovfs(bdp);
548 xfs_mount_t *mp = XFS_BHVTOM(bdp);
549 xfs_inode_t *rip;
550 vnode_t *rvp;
551 int unmount_event_wanted = 0;
552 int unmount_event_flags = 0;
553 int xfs_unmountfs_needed = 0;
554 int error;
555
556 rip = mp->m_rootip;
557 rvp = XFS_ITOV(rip);
558
559 if (vfsp->vfs_flag & VFS_DMI) {
560 error = XFS_SEND_PREUNMOUNT(mp, vfsp,
561 rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL,
562 NULL, NULL, 0, 0,
563 (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
564 0:DM_FLAGS_UNWANTED);
565 if (error)
566 return XFS_ERROR(error);
567 unmount_event_wanted = 1;
568 unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
569 0 : DM_FLAGS_UNWANTED;
570 }
571
572 /*
573 * First blow any referenced inode from this file system
574 * out of the reference cache, and delete the timer.
575 */
576 xfs_refcache_purge_mp(mp);
577
578 XFS_bflush(mp->m_ddev_targp);
579 error = xfs_unmount_flush(mp, 0);
580 if (error)
581 goto out;
582
583 ASSERT(vn_count(rvp) == 1);
584
585 /*
586 * Drop the reference count
587 */
588 VN_RELE(rvp);
589
590 /*
591 * If we're forcing a shutdown, typically because of a media error,
592 * we want to make sure we invalidate dirty pages that belong to
593 * referenced vnodes as well.
594 */
595 if (XFS_FORCED_SHUTDOWN(mp)) {
596 error = xfs_sync(&mp->m_bhv,
597 (SYNC_WAIT | SYNC_CLOSE), credp);
598 ASSERT(error != EFSCORRUPTED);
599 }
600 xfs_unmountfs_needed = 1;
601
602out:
603 /* Send DMAPI event, if required.
604 * Then do xfs_unmountfs() if needed.
605 * Then return error (or zero).
606 */
607 if (unmount_event_wanted) {
608 /* Note: mp structure must still exist for
609 * XFS_SEND_UNMOUNT() call.
610 */
611 XFS_SEND_UNMOUNT(mp, vfsp, error == 0 ? rvp : NULL,
612 DM_RIGHT_NULL, 0, error, unmount_event_flags);
613 }
614 if (xfs_unmountfs_needed) {
615 /*
616 * Call common unmount function to flush to disk
617 * and free the super block buffer & mount structures.
618 */
619 xfs_unmountfs(mp, credp);
620 }
621
622 return XFS_ERROR(error);
623}
624
625#define REMOUNT_READONLY_FLAGS (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
626
627STATIC int
628xfs_mntupdate(
629 bhv_desc_t *bdp,
630 int *flags,
631 struct xfs_mount_args *args)
632{
633 struct vfs *vfsp = bhvtovfs(bdp);
634 xfs_mount_t *mp = XFS_BHVTOM(bdp);
635 int pincount, error;
636 int count = 0;
637
638 if (args->flags & XFSMNT_NOATIME)
639 mp->m_flags |= XFS_MOUNT_NOATIME;
640 else
641 mp->m_flags &= ~XFS_MOUNT_NOATIME;
642
643 if (!(vfsp->vfs_flag & VFS_RDONLY)) {
644 VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error);
645 }
646
647 if (*flags & MS_RDONLY) {
648 xfs_refcache_purge_mp(mp);
649 xfs_flush_buftarg(mp->m_ddev_targp, 0);
650 xfs_finish_reclaim_all(mp, 0);
651
652		/* This loop must run at least twice.
653		 * The first pass flushes most metadata,
654		 * but that in turn generates more metadata
655		 * (typically directory updates), which must
656		 * then be flushed and logged before we can
657		 * write the unmount record.
658		 */
659 do {
660 VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error);
661 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
662 if (!pincount) {
663 delay(50);
664 count++;
665 }
666 } while (count < 2);
667
668 /* Ok now write out an unmount record */
669 xfs_log_unmount_write(mp);
670 xfs_unmountfs_writesb(mp);
671 vfsp->vfs_flag |= VFS_RDONLY;
672 } else {
673 vfsp->vfs_flag &= ~VFS_RDONLY;
674 }
675
676 return 0;
677}
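/*
 * Usage sketch (assumed admin command and device names, not from this
 * file): a request such as
 *
 *	mount -o remount,ro /dev/sdb1 /mnt
 *
 * reaches xfs_mntupdate() with MS_RDONLY set in *flags, triggering the
 * flush loop and the unmount record write above so that a later crash
 * needs no log recovery on the now read-only filesystem.
 */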
678
679/*
680 * xfs_unmount_flush implements a set of flush operations on special
681 * inodes, which are needed as a separate set of operations so that
682 * they can be called as part of the relocation process.
683 */
684int
685xfs_unmount_flush(
686 xfs_mount_t *mp, /* Mount structure we are getting
687 rid of. */
688 int relocation) /* Called from vfs relocation. */
689{
690 xfs_inode_t *rip = mp->m_rootip;
691 xfs_inode_t *rbmip;
692 xfs_inode_t *rsumip = NULL;
693 vnode_t *rvp = XFS_ITOV(rip);
694 int error;
695
696 xfs_ilock(rip, XFS_ILOCK_EXCL);
697 xfs_iflock(rip);
698
699 /*
700 * Flush out the real time inodes.
701 */
702 if ((rbmip = mp->m_rbmip) != NULL) {
703 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
704 xfs_iflock(rbmip);
705 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
706 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
707
708 if (error == EFSCORRUPTED)
709 goto fscorrupt_out;
710
711 ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
712
713 rsumip = mp->m_rsumip;
714 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
715 xfs_iflock(rsumip);
716 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
717 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
718
719 if (error == EFSCORRUPTED)
720 goto fscorrupt_out;
721
722 ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
723 }
724
725 /*
726 * Synchronously flush root inode to disk
727 */
728 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
729 if (error == EFSCORRUPTED)
730 goto fscorrupt_out2;
731
732 if (vn_count(rvp) != 1 && !relocation) {
733 xfs_iunlock(rip, XFS_ILOCK_EXCL);
734 return XFS_ERROR(EBUSY);
735 }
736
737 /*
738	 * Release the dquots that the root inode, rbmino and rsumino
739	 * might be holding, and flush and purge the quota inodes.
740 */
741 error = XFS_QM_UNMOUNT(mp);
742 if (error == EFSCORRUPTED)
743 goto fscorrupt_out2;
744
745 if (rbmip) {
746 VN_RELE(XFS_ITOV(rbmip));
747 VN_RELE(XFS_ITOV(rsumip));
748 }
749
750 xfs_iunlock(rip, XFS_ILOCK_EXCL);
751 return 0;
752
753fscorrupt_out:
754 xfs_ifunlock(rip);
755
756fscorrupt_out2:
757 xfs_iunlock(rip, XFS_ILOCK_EXCL);
758
759 return XFS_ERROR(EFSCORRUPTED);
760}
761
762/*
763 * xfs_root extracts the root vnode from a vfs.
764 *
765 * vfsp -- the vfs struct for the desired file system
766 * vpp -- address of the caller's vnode pointer which should be
767 * set to the desired fs root vnode
768 */
769STATIC int
770xfs_root(
771 bhv_desc_t *bdp,
772 vnode_t **vpp)
773{
774 vnode_t *vp;
775
776 vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
777 VN_HOLD(vp);
778 *vpp = vp;
779 return 0;
780}
781
782/*
783 * xfs_statvfs
784 *
785 * Fill in the statvfs structure for the given file system. We use
786 * the superblock lock in the mount structure to ensure a consistent
787 * snapshot of the counters returned.
788 */
789STATIC int
790xfs_statvfs(
791 bhv_desc_t *bdp,
792 xfs_statfs_t *statp,
793 vnode_t *vp)
794{
795 __uint64_t fakeinos;
796 xfs_extlen_t lsize;
797 xfs_mount_t *mp;
798 xfs_sb_t *sbp;
799 unsigned long s;
800 u64 id;
801
802 mp = XFS_BHVTOM(bdp);
803 sbp = &(mp->m_sb);
804
805 statp->f_type = XFS_SB_MAGIC;
806
807 s = XFS_SB_LOCK(mp);
808 statp->f_bsize = sbp->sb_blocksize;
809 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
810 statp->f_blocks = sbp->sb_dblocks - lsize;
811 statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
812 fakeinos = statp->f_bfree << sbp->sb_inopblog;
813#if XFS_BIG_INUMS
814 fakeinos += mp->m_inoadd;
815#endif
816 statp->f_files =
817 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
818 if (mp->m_maxicount)
819#if XFS_BIG_INUMS
820 if (!mp->m_inoadd)
821#endif
822 statp->f_files = min_t(typeof(statp->f_files),
823 statp->f_files,
824 mp->m_maxicount);
825 statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
826 XFS_SB_UNLOCK(mp, s);
827
828 id = huge_encode_dev(mp->m_dev);
829 statp->f_fsid.val[0] = (u32)id;
830 statp->f_fsid.val[1] = (u32)(id >> 32);
831 statp->f_namelen = MAXNAMELEN - 1;
832
833 return 0;
834}
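/*
 * Worked example for the f_files computation above (all numbers are
 * hypothetical): with sb_inopblog = 5 (32 inodes per block) and
 * f_bfree = 1000 free blocks, fakeinos = 1000 << 5 = 32000, i.e. the
 * inodes that could still be created from free space; this is added to
 * sb_icount and then clamped by XFS_MAXINUMBER and m_maxicount.
 */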
835
836
837/*
838 * xfs_sync flushes any pending I/O to file system vfsp.
839 *
840 * This routine is called by vfs_sync() to make sure that things make it
841 * out to disk eventually, on sync() system calls to flush out everything,
842 * and when the file system is unmounted. For the vfs_sync() case, all
843 * we really need to do is sync out the log to make all of our meta-data
844 * updates permanent (except for timestamps). For calls from pflushd(),
845 * dirty pages are kept moving by calling pdflush() on the inodes
846 * containing them. We also flush the inodes that we can lock without
847 * sleeping and the superblock if we can lock it without sleeping from
848 * vfs_sync() so that items at the tail of the log are always moving out.
849 *
850 * Flags:
851 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
852 * to sleep if we can help it. All we really need
853 * to do is ensure that the log is synced at least
854 * periodically. We also push the inodes and
855 * superblock if we can lock them without sleeping
856 * and they are not pinned.
857 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
858 * set, then we really want to lock each inode and flush
859 * it.
860 * SYNC_WAIT - All the flushes that take place in this call should
861 * be synchronous.
862 * SYNC_DELWRI - This tells us to push dirty pages associated with
863 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
864 * determine if they should be flushed sync, async, or
865 * delwri.
866 * SYNC_CLOSE - This flag is passed when the system is being
867 * unmounted. We should sync and invalidate everything.
868 * SYNC_FSDATA - This indicates that the caller would like to make
869 * sure the superblock is safe on disk. We can ensure
870 * this by simply making sure the log gets flushed
871 * if SYNC_BDFLUSH is set, and by actually writing it
872 * out otherwise.
873 *
874 */
875/*ARGSUSED*/
876STATIC int
877xfs_sync(
878 bhv_desc_t *bdp,
879 int flags,
880 cred_t *credp)
881{
882 xfs_mount_t *mp;
883
884 mp = XFS_BHVTOM(bdp);
885 return (xfs_syncsub(mp, flags, 0, NULL));
886}
887
888/*
889 * xfs sync routine for internal use
890 *
891 * This routine supports all of the flags defined for the generic VFS_SYNC
892 * interface as explained above under xfs_sync. In the interests of not
893 * changing interfaces within the 6.5 family, additional internally-
894 * required functions are specified within a separate xflags parameter,
895 * only available by calling this routine.
896 *
897 */
898STATIC int
899xfs_sync_inodes(
900 xfs_mount_t *mp,
901 int flags,
902 int xflags,
903 int *bypassed)
904{
905 xfs_inode_t *ip = NULL;
906 xfs_inode_t *ip_next;
907 xfs_buf_t *bp;
908 vnode_t *vp = NULL;
909 vmap_t vmap;
910 int error;
911 int last_error;
912 uint64_t fflag;
913 uint lock_flags;
914 uint base_lock_flags;
915 boolean_t mount_locked;
916 boolean_t vnode_refed;
917 int preempt;
918 xfs_dinode_t *dip;
919 xfs_iptr_t *ipointer;
920#ifdef DEBUG
921 boolean_t ipointer_in = B_FALSE;
922
923#define IPOINTER_SET ipointer_in = B_TRUE
924#define IPOINTER_CLR ipointer_in = B_FALSE
925#else
926#define IPOINTER_SET
927#define IPOINTER_CLR
928#endif
929
930
931/* Insert a marker record into the inode list after inode ip. The list
932 * must be locked when this is called. After the call the list will no
933 * longer be locked.
934 */
935#define IPOINTER_INSERT(ip, mp) { \
936 ASSERT(ipointer_in == B_FALSE); \
937 ipointer->ip_mnext = ip->i_mnext; \
938 ipointer->ip_mprev = ip; \
939 ip->i_mnext = (xfs_inode_t *)ipointer; \
940 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
941 preempt = 0; \
942 XFS_MOUNT_IUNLOCK(mp); \
943 mount_locked = B_FALSE; \
944 IPOINTER_SET; \
945 }
946
947/* Remove the marker from the inode list. If the marker was the only item
948 * in the list then there are no remaining inodes and we should zero out
949 * the whole list. If we are the current head of the list then move the head
950 * past us.
951 */
952#define IPOINTER_REMOVE(ip, mp) { \
953 ASSERT(ipointer_in == B_TRUE); \
954 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
955 ip = ipointer->ip_mnext; \
956 ip->i_mprev = ipointer->ip_mprev; \
957 ipointer->ip_mprev->i_mnext = ip; \
958 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
959 mp->m_inodes = ip; \
960 } \
961 } else { \
962 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
963 mp->m_inodes = NULL; \
964 ip = NULL; \
965 } \
966 IPOINTER_CLR; \
967 }
968
969#define XFS_PREEMPT_MASK 0x7f
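/*
 * Sketch of the marker protocol above (illustrative, simplified): with
 * inodes A -> B -> C on the circular m_inodes list, IPOINTER_INSERT(A, mp)
 * produces A -> marker -> B -> C and drops the mount lock. If B is
 * reclaimed while the lock is dropped, the marker's ip_mnext is fixed up
 * to point at C, so IPOINTER_REMOVE resumes the walk without touching
 * freed memory. Other walkers recognize the marker by its NULL i_mount,
 * and XFS_PREEMPT_MASK makes the walk yield the mount lock every 128
 * iterations.
 */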
970
971 if (bypassed)
972 *bypassed = 0;
973 if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
974 return 0;
975 error = 0;
976 last_error = 0;
977 preempt = 0;
978
979 /* Allocate a reference marker */
980 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
981
982 fflag = XFS_B_ASYNC; /* default is don't wait */
983 if (flags & SYNC_BDFLUSH)
984 fflag = XFS_B_DELWRI;
985 if (flags & SYNC_WAIT)
986 fflag = 0; /* synchronous overrides all */
987
988 base_lock_flags = XFS_ILOCK_SHARED;
989 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
990 /*
991 * We need the I/O lock if we're going to call any of
992 * the flush/inval routines.
993 */
994 base_lock_flags |= XFS_IOLOCK_SHARED;
995 }
996
997 XFS_MOUNT_ILOCK(mp);
998
999 ip = mp->m_inodes;
1000
1001 mount_locked = B_TRUE;
1002 vnode_refed = B_FALSE;
1003
1004 IPOINTER_CLR;
1005
1006 do {
1007 ASSERT(ipointer_in == B_FALSE);
1008 ASSERT(vnode_refed == B_FALSE);
1009
1010 lock_flags = base_lock_flags;
1011
1012 /*
1013 * There were no inodes in the list, just break out
1014 * of the loop.
1015 */
1016 if (ip == NULL) {
1017 break;
1018 }
1019
1020 /*
1021 * We found another sync thread marker - skip it
1022 */
1023 if (ip->i_mount == NULL) {
1024 ip = ip->i_mnext;
1025 continue;
1026 }
1027
1028 vp = XFS_ITOV_NULL(ip);
1029
1030 /*
1031 * If the vnode is gone then this is being torn down,
1032 * call reclaim if it is flushed, else let regular flush
1033 * code deal with it later in the loop.
1034 */
1035
1036 if (vp == NULL) {
1037 /* Skip ones already in reclaim */
1038 if (ip->i_flags & XFS_IRECLAIM) {
1039 ip = ip->i_mnext;
1040 continue;
1041 }
1042 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
1043 ip = ip->i_mnext;
1044 } else if ((xfs_ipincount(ip) == 0) &&
1045 xfs_iflock_nowait(ip)) {
1046 IPOINTER_INSERT(ip, mp);
1047
1048 xfs_finish_reclaim(ip, 1,
1049 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
1050
1051 XFS_MOUNT_ILOCK(mp);
1052 mount_locked = B_TRUE;
1053 IPOINTER_REMOVE(ip, mp);
1054 } else {
1055 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1056 ip = ip->i_mnext;
1057 }
1058 continue;
1059 }
1060
1061 if (VN_BAD(vp)) {
1062 ip = ip->i_mnext;
1063 continue;
1064 }
1065
1066 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
1067 XFS_MOUNT_IUNLOCK(mp);
1068 kmem_free(ipointer, sizeof(xfs_iptr_t));
1069 return 0;
1070 }
1071
1072 /*
1073 * If this is just vfs_sync() or pflushd() calling
1074 * then we can skip inodes for which it looks like
1075 * there is nothing to do. Since we don't have the
1076	 * inode locked this is racy, but these are periodic
1077 * calls so it doesn't matter. For the others we want
1078 * to know for sure, so we at least try to lock them.
1079 */
1080 if (flags & SYNC_BDFLUSH) {
1081 if (((ip->i_itemp == NULL) ||
1082 !(ip->i_itemp->ili_format.ilf_fields &
1083 XFS_ILOG_ALL)) &&
1084 (ip->i_update_core == 0)) {
1085 ip = ip->i_mnext;
1086 continue;
1087 }
1088 }
1089
1090 /*
1091 * Try to lock without sleeping. We're out of order with
1092 * the inode list lock here, so if we fail we need to drop
1093 * the mount lock and try again. If we're called from
1094 * bdflush() here, then don't bother.
1095 *
1096 * The inode lock here actually coordinates with the
1097 * almost spurious inode lock in xfs_ireclaim() to prevent
1098		 * the vnode we handle here (without holding a reference)
1099		 * from being freed while we use it. If we lock the inode
1100 * while it's on the mount list here, then the spurious inode
1101 * lock in xfs_ireclaim() after the inode is pulled from
1102 * the mount list will sleep until we release it here.
1103 * This keeps the vnode from being freed while we reference
1104 * it. It is also cheaper and simpler than actually doing
1105 * a vn_get() for every inode we touch here.
1106 */
1107 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
1108
1109 if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
1110 ip = ip->i_mnext;
1111 continue;
1112 }
1113
1114 /*
1115 * We need to unlock the inode list lock in order
1116 * to lock the inode. Insert a marker record into
1117 * the inode list to remember our position, dropping
1118 * the lock is now done inside the IPOINTER_INSERT
1119 * macro.
1120 *
1121 * We also use the inode list lock to protect us
1122 * in taking a snapshot of the vnode version number
1123 * for use in calling vn_get().
1124 */
1125 VMAP(vp, vmap);
1126 IPOINTER_INSERT(ip, mp);
1127
1128 vp = vn_get(vp, &vmap);
1129 if (vp == NULL) {
1130 /*
1131 * The vnode was reclaimed once we let go
1132 * of the inode list lock. Skip to the
1133 * next list entry. Remove the marker.
1134 */
1135
1136 XFS_MOUNT_ILOCK(mp);
1137
1138 mount_locked = B_TRUE;
1139 vnode_refed = B_FALSE;
1140
1141 IPOINTER_REMOVE(ip, mp);
1142
1143 continue;
1144 }
1145
1146 xfs_ilock(ip, lock_flags);
1147
1148 ASSERT(vp == XFS_ITOV(ip));
1149 ASSERT(ip->i_mount == mp);
1150
1151 vnode_refed = B_TRUE;
1152 }
1153
1154 /* From here on in the loop we may have a marker record
1155 * in the inode list.
1156 */
1157
1158 if ((flags & SYNC_CLOSE) && (vp != NULL)) {
1159 /*
1160 * This is the shutdown case. We just need to
1161 * flush and invalidate all the pages associated
1162 * with the inode. Drop the inode lock since
1163 * we can't hold it across calls to the buffer
1164 * cache.
1165 *
1166 * We don't set the VREMAPPING bit in the vnode
1167 * here, because we don't hold the vnode lock
1168 * exclusively. It doesn't really matter, though,
1169 * because we only come here when we're shutting
1170 * down anyway.
1171 */
1172 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1173
1174 if (XFS_FORCED_SHUTDOWN(mp)) {
1175 VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
1176 } else {
1177 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
1178 }
1179
1180 xfs_ilock(ip, XFS_ILOCK_SHARED);
1181
1182 } else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
1183 if (VN_DIRTY(vp)) {
1184 /* We need to have dropped the lock here,
1185 * so insert a marker if we have not already
1186 * done so.
1187 */
1188 if (mount_locked) {
1189 IPOINTER_INSERT(ip, mp);
1190 }
1191
1192 /*
1193 * Drop the inode lock since we can't hold it
1194 * across calls to the buffer cache.
1195 */
1196 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1197 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
1198 fflag, FI_NONE, error);
1199 xfs_ilock(ip, XFS_ILOCK_SHARED);
1200 }
1201
1202 }
1203
1204 if (flags & SYNC_BDFLUSH) {
1205 if ((flags & SYNC_ATTR) &&
1206 ((ip->i_update_core) ||
1207 ((ip->i_itemp != NULL) &&
1208 (ip->i_itemp->ili_format.ilf_fields != 0)))) {
1209
1210 /* Insert marker and drop lock if not already
1211 * done.
1212 */
1213 if (mount_locked) {
1214 IPOINTER_INSERT(ip, mp);
1215 }
1216
1217 /*
1218 * We don't want the periodic flushing of the
1219 * inodes by vfs_sync() to interfere with
1220 * I/O to the file, especially read I/O
1221 * where it is only the access time stamp
1222 * that is being flushed out. To prevent
1223 * long periods where we have both inode
1224 * locks held shared here while reading the
1225 * inode's buffer in from disk, we drop the
1226 * inode lock while reading in the inode
1227 * buffer. We have to release the buffer
1228 * and reacquire the inode lock so that they
1229 * are acquired in the proper order (inode
1230 * locks first). The buffer will go at the
1231 * end of the lru chain, though, so we can
1232 * expect it to still be there when we go
1233 * for it again in xfs_iflush().
1234 */
1235 if ((xfs_ipincount(ip) == 0) &&
1236 xfs_iflock_nowait(ip)) {
1237
1238 xfs_ifunlock(ip);
1239 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1240
1241 error = xfs_itobp(mp, NULL, ip,
1242 &dip, &bp, 0);
1243 if (!error) {
1244 xfs_buf_relse(bp);
1245 } else {
1246 /* Bailing out, remove the
1247 * marker and free it.
1248 */
1249 XFS_MOUNT_ILOCK(mp);
1250
1251 IPOINTER_REMOVE(ip, mp);
1252
1253 XFS_MOUNT_IUNLOCK(mp);
1254
1255 ASSERT(!(lock_flags &
1256 XFS_IOLOCK_SHARED));
1257
1258 kmem_free(ipointer,
1259 sizeof(xfs_iptr_t));
1260 return (0);
1261 }
1262
1263 /*
1264 * Since we dropped the inode lock,
1265 * the inode may have been reclaimed.
1266 * Therefore, we reacquire the mount
1267				 * lock and check whether the inode we
1268				 * were processing was reclaimed. If it
1269				 * was, the ipointer marker will no
1270 * longer point back at us. In this
1271 * case, move ip along to the inode
1272 * after the marker, remove the marker
1273 * and continue.
1274 */
1275 XFS_MOUNT_ILOCK(mp);
1276 mount_locked = B_TRUE;
1277
1278 if (ip != ipointer->ip_mprev) {
1279 IPOINTER_REMOVE(ip, mp);
1280
1281 ASSERT(!vnode_refed);
1282 ASSERT(!(lock_flags &
1283 XFS_IOLOCK_SHARED));
1284 continue;
1285 }
1286
1287 ASSERT(ip->i_mount == mp);
1288
1289 if (xfs_ilock_nowait(ip,
1290 XFS_ILOCK_SHARED) == 0) {
1291 ASSERT(ip->i_mount == mp);
1292 /*
1293 * We failed to reacquire
1294 * the inode lock without
1295 * sleeping, so just skip
1296 * the inode for now. We
1297 * clear the ILOCK bit from
1298 * the lock_flags so that we
1299 * won't try to drop a lock
1300 * we don't hold below.
1301 */
1302 lock_flags &= ~XFS_ILOCK_SHARED;
1303 IPOINTER_REMOVE(ip_next, mp);
1304 } else if ((xfs_ipincount(ip) == 0) &&
1305 xfs_iflock_nowait(ip)) {
1306 ASSERT(ip->i_mount == mp);
1307 /*
1308 * Since this is vfs_sync()
1309 * calling we only flush the
1310 * inode out if we can lock
1311 * it without sleeping and
1312 * it is not pinned. Drop
1313 * the mount lock here so
1314 * that we don't hold it for
1315 * too long. We already have
1316 * a marker in the list here.
1317 */
1318 XFS_MOUNT_IUNLOCK(mp);
1319 mount_locked = B_FALSE;
1320 error = xfs_iflush(ip,
1321 XFS_IFLUSH_DELWRI);
1322 } else {
1323 ASSERT(ip->i_mount == mp);
1324 IPOINTER_REMOVE(ip_next, mp);
1325 }
1326 }
1327
1328 }
1329
1330 } else {
1331 if ((flags & SYNC_ATTR) &&
1332 ((ip->i_update_core) ||
1333 ((ip->i_itemp != NULL) &&
1334 (ip->i_itemp->ili_format.ilf_fields != 0)))) {
1335 if (mount_locked) {
1336 IPOINTER_INSERT(ip, mp);
1337 }
1338
1339 if (flags & SYNC_WAIT) {
1340 xfs_iflock(ip);
1341 error = xfs_iflush(ip,
1342 XFS_IFLUSH_SYNC);
1343 } else {
1344 /*
1345 * If we can't acquire the flush
1346 * lock, then the inode is already
1347 * being flushed so don't bother
1348 * waiting. If we can lock it then
1349 * do a delwri flush so we can
1350 * combine multiple inode flushes
1351 * in each disk write.
1352 */
1353 if (xfs_iflock_nowait(ip)) {
1354 error = xfs_iflush(ip,
1355 XFS_IFLUSH_DELWRI);
1356 }
1357 else if (bypassed)
1358 (*bypassed)++;
1359 }
1360 }
1361 }
1362
1363 if (lock_flags != 0) {
1364 xfs_iunlock(ip, lock_flags);
1365 }
1366
1367 if (vnode_refed) {
1368 /*
1369 * If we had to take a reference on the vnode
1370 * above, then wait until after we've unlocked
1371 * the inode to release the reference. This is
1372 * because we can be already holding the inode
1373 * lock when VN_RELE() calls xfs_inactive().
1374 *
1375 * Make sure to drop the mount lock before calling
1376 * VN_RELE() so that we don't trip over ourselves if
1377 * we have to go for the mount lock again in the
1378 * inactive code.
1379 */
1380 if (mount_locked) {
1381 IPOINTER_INSERT(ip, mp);
1382 }
1383
1384 VN_RELE(vp);
1385
1386 vnode_refed = B_FALSE;
1387 }
1388
1389 if (error) {
1390 last_error = error;
1391 }
1392
1393 /*
1394 * bail out if the filesystem is corrupted.
1395 */
1396 if (error == EFSCORRUPTED) {
1397 if (!mount_locked) {
1398 XFS_MOUNT_ILOCK(mp);
1399 IPOINTER_REMOVE(ip, mp);
1400 }
1401 XFS_MOUNT_IUNLOCK(mp);
1402 ASSERT(ipointer_in == B_FALSE);
1403 kmem_free(ipointer, sizeof(xfs_iptr_t));
1404 return XFS_ERROR(error);
1405 }
1406
1407 /* Let other threads have a chance at the mount lock
1408 * if we have looped many times without dropping the
1409 * lock.
1410 */
1411 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
1412 if (mount_locked) {
1413 IPOINTER_INSERT(ip, mp);
1414 }
1415 }
1416
1417 if (mount_locked == B_FALSE) {
1418 XFS_MOUNT_ILOCK(mp);
1419 mount_locked = B_TRUE;
1420 IPOINTER_REMOVE(ip, mp);
1421 continue;
1422 }
1423
1424 ASSERT(ipointer_in == B_FALSE);
1425 ip = ip->i_mnext;
1426
1427 } while (ip != mp->m_inodes);
1428
1429 XFS_MOUNT_IUNLOCK(mp);
1430
1431 ASSERT(ipointer_in == B_FALSE);
1432
1433 kmem_free(ipointer, sizeof(xfs_iptr_t));
1434 return XFS_ERROR(last_error);
1435}
1436
1437/*
1438 * xfs sync routine for internal use
1439 *
1440 * This routine supports all of the flags defined for the generic VFS_SYNC
1441 * interface as explained above under xfs_sync. In the interests of not
1442 * changing interfaces within the 6.5 family, additional internally-
1443 * required functions are specified within a separate xflags parameter,
1444 * only available by calling this routine.
1445 *
1446 */
1447int
1448xfs_syncsub(
1449 xfs_mount_t *mp,
1450 int flags,
1451 int xflags,
1452 int *bypassed)
1453{
1454 int error = 0;
1455 int last_error = 0;
1456 uint log_flags = XFS_LOG_FORCE;
1457 xfs_buf_t *bp;
1458 xfs_buf_log_item_t *bip;
1459
1460 /*
1461 * Sync out the log. This ensures that the log is periodically
1462 * flushed even if there is not enough activity to fill it up.
1463 */
1464 if (flags & SYNC_WAIT)
1465 log_flags |= XFS_LOG_SYNC;
1466
1467 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1468
1469 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
1470 if (flags & SYNC_BDFLUSH)
1471 xfs_finish_reclaim_all(mp, 1);
1472 else
1473 error = xfs_sync_inodes(mp, flags, xflags, bypassed);
1474 }
1475
1476 /*
1477 * Flushing out dirty data above probably generated more
1478 * log activity, so if this isn't vfs_sync() then flush
1479 * the log again.
1480 */
1481 if (flags & SYNC_DELWRI) {
1482 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1483 }
1484
1485 if (flags & SYNC_FSDATA) {
1486 /*
1487 * If this is vfs_sync() then only sync the superblock
1488 * if we can lock it without sleeping and it is not pinned.
1489 */
1490 if (flags & SYNC_BDFLUSH) {
1491 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
1492 if (bp != NULL) {
1493 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
1494 if ((bip != NULL) &&
1495 xfs_buf_item_dirty(bip)) {
1496 if (!(XFS_BUF_ISPINNED(bp))) {
1497 XFS_BUF_ASYNC(bp);
1498 error = xfs_bwrite(mp, bp);
1499 } else {
1500 xfs_buf_relse(bp);
1501 }
1502 } else {
1503 xfs_buf_relse(bp);
1504 }
1505 }
1506 } else {
1507 bp = xfs_getsb(mp, 0);
1508 /*
1509 * If the buffer is pinned then push on the log so
1510 * we won't get stuck waiting in the write for
1511 * someone, maybe ourselves, to flush the log.
1512 * Even though we just pushed the log above, we
1513 * did not have the superblock buffer locked at
1514 * that point so it can become pinned in between
1515 * there and here.
1516 */
1517 if (XFS_BUF_ISPINNED(bp))
1518 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
1519 if (flags & SYNC_WAIT)
1520 XFS_BUF_UNASYNC(bp);
1521 else
1522 XFS_BUF_ASYNC(bp);
1523 error = xfs_bwrite(mp, bp);
1524 }
1525 if (error) {
1526 last_error = error;
1527 }
1528 }
1529
1530 /*
1531 * If this is the periodic sync, then kick some entries out of
1532 * the reference cache. This ensures that idle entries are
1533 * eventually kicked out of the cache.
1534 */
1535 if (flags & SYNC_REFCACHE) {
1536 xfs_refcache_purge_some(mp);
1537 }
1538
1539 /*
1540 * Now check to see if the log needs a "dummy" transaction.
1541 */
1542
1543 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
1544 xfs_trans_t *tp;
1545 xfs_inode_t *ip;
1546
1547 /*
1548 * Put a dummy transaction in the log to tell
1549 * recovery that all others are OK.
1550 */
1551 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
1552 if ((error = xfs_trans_reserve(tp, 0,
1553 XFS_ICHANGE_LOG_RES(mp),
1554 0, 0, 0))) {
1555 xfs_trans_cancel(tp, 0);
1556 return error;
1557 }
1558
1559 ip = mp->m_rootip;
1560 xfs_ilock(ip, XFS_ILOCK_EXCL);
1561
1562 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1563 xfs_trans_ihold(tp, ip);
1564 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1565 error = xfs_trans_commit(tp, 0, NULL);
1566 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1567 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
1568 }
1569
1570 /*
1571 * When shutting down, we need to ensure that the AIL is pushed
1572 * to disk or the filesystem can appear corrupt from the PROM.
1573 */
1574 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
1575 XFS_bflush(mp->m_ddev_targp);
1576 if (mp->m_rtdev_targp) {
1577 XFS_bflush(mp->m_rtdev_targp);
1578 }
1579 }
1580
1581 return XFS_ERROR(last_error);
1582}
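/*
 * Usage sketch (hypothetical caller, not from this file): a periodic
 * flush daemon could push metadata out without sleeping via
 *
 *	xfs_syncsub(mp, SYNC_BDFLUSH | SYNC_FSDATA | SYNC_REFCACHE, 0, NULL);
 *
 * while unmount reaches this code through xfs_sync() with
 * SYNC_WAIT | SYNC_CLOSE to force synchronous flushing and page
 * invalidation.
 */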
1583
1584/*
1585 * xfs_vget - called by DMAPI and NFSD to get vnode from file handle
1586 */
1587STATIC int
1588xfs_vget(
1589 bhv_desc_t *bdp,
1590 vnode_t **vpp,
1591 fid_t *fidp)
1592{
1593 xfs_mount_t *mp = XFS_BHVTOM(bdp);
1594 xfs_fid_t *xfid = (struct xfs_fid *)fidp;
1595 xfs_inode_t *ip;
1596 int error;
1597 xfs_ino_t ino;
1598 unsigned int igen;
1599
1600 /*
1601 * Invalid. Since handles can be created in user space and passed in
1602 * via gethandle(), this is not cause for a panic.
1603 */
1604 if (xfid->xfs_fid_len != sizeof(*xfid) - sizeof(xfid->xfs_fid_len))
1605 return XFS_ERROR(EINVAL);
1606
1607 ino = xfid->xfs_fid_ino;
1608 igen = xfid->xfs_fid_gen;
1609
1610 /*
1611 * NFS can sometimes send requests for ino 0. Fail them gracefully.
1612 */
1613 if (ino == 0)
1614 return XFS_ERROR(ESTALE);
1615
1616 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
1617 if (error) {
1618 *vpp = NULL;
1619 return error;
1620 }
1621
1622 if (ip == NULL) {
1623 *vpp = NULL;
1624 return XFS_ERROR(EIO);
1625 }
1626
1627 if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
1628 xfs_iput_new(ip, XFS_ILOCK_SHARED);
1629 *vpp = NULL;
1630 return XFS_ERROR(ENOENT);
1631 }
1632
1633 *vpp = XFS_ITOV(ip);
1634 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1635 return 0;
1636}
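/*
 * Note on the handle check above (illustrative): a well-formed handle
 * carries xfs_fid_len = sizeof(*xfid) - sizeof(xfid->xfs_fid_len), i.e.
 * the number of payload bytes following the length field itself, so any
 * other size coming from user space is rejected with EINVAL before the
 * ino/gen fields are trusted.
 */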
1637
1638
1639#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
1640#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
1641#define MNTOPT_LOGDEV "logdev" /* log device */
1642#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */
1643#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */
1644#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */
1645#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */
1646#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */
1647#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */
1648#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */
1649#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */
1650#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */
1651#define MNTOPT_MTPT "mtpt" /* filesystem mount point */
1652#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */
1653#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */
1654#define MNTOPT_NOLOGFLUSH "nologflush" /* don't hard flush on log writes */
1655#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
1656#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */
1657#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */
1658#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */
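/*
 * Example option string (values are hypothetical) that this table lets
 * xfs_parseargs() below accept:
 *
 *	logbufs=8,logbsize=32k,sunit=512,swidth=4096,nouuid
 */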
1659
1660
1661int
1662xfs_parseargs(
1663 struct bhv_desc *bhv,
1664 char *options,
1665 struct xfs_mount_args *args,
1666 int update)
1667{
1668 struct vfs *vfsp = bhvtovfs(bhv);
1669 char *this_char, *value, *eov;
1670 int dsunit, dswidth, vol_dsunit, vol_dswidth;
1671 int iosize;
1672
1673#if 0	/* XXX: off by default, until some remaining issues are ironed out */
1674 args->flags |= XFSMNT_IDELETE; /* default to on */
1675#endif
1676
1677 if (!options)
1678 return 0;
1679
1680 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
1681
1682 while ((this_char = strsep(&options, ",")) != NULL) {
1683 if (!*this_char)
1684 continue;
1685 if ((value = strchr(this_char, '=')) != NULL)
1686 *value++ = 0;
1687
1688 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
1689 if (!value || !*value) {
1690 printk("XFS: %s option requires an argument\n",
1691 MNTOPT_LOGBUFS);
1692 return EINVAL;
1693 }
1694 args->logbufs = simple_strtoul(value, &eov, 10);
1695 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
1696 int last, in_kilobytes = 0;
1697
1698 if (!value || !*value) {
1699 printk("XFS: %s option requires an argument\n",
1700 MNTOPT_LOGBSIZE);
1701 return EINVAL;
1702 }
1703 last = strlen(value) - 1;
1704 if (value[last] == 'K' || value[last] == 'k') {
1705 in_kilobytes = 1;
1706 value[last] = '\0';
1707 }
1708 args->logbufsize = simple_strtoul(value, &eov, 10);
1709 if (in_kilobytes)
1710 args->logbufsize <<= 10;
1711 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
1712 if (!value || !*value) {
1713 printk("XFS: %s option requires an argument\n",
1714 MNTOPT_LOGDEV);
1715 return EINVAL;
1716 }
1717 strncpy(args->logname, value, MAXNAMELEN);
1718 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
1719 if (!value || !*value) {
1720 printk("XFS: %s option requires an argument\n",
1721 MNTOPT_MTPT);
1722 return EINVAL;
1723 }
1724 strncpy(args->mtpt, value, MAXNAMELEN);
1725 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
1726 if (!value || !*value) {
1727 printk("XFS: %s option requires an argument\n",
1728 MNTOPT_RTDEV);
1729 return EINVAL;
1730 }
1731 strncpy(args->rtname, value, MAXNAMELEN);
1732 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
1733 if (!value || !*value) {
1734 printk("XFS: %s option requires an argument\n",
1735 MNTOPT_BIOSIZE);
1736 return EINVAL;
1737 }
1738 iosize = simple_strtoul(value, &eov, 10);
1739 args->flags |= XFSMNT_IOSIZE;
1740 args->iosizelog = (uint8_t) iosize;
1741 } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
1742 if (!value || !*value) {
1743 printk("XFS: %s option requires an argument\n",
1744 this_char);
1745 return EINVAL;
1746 }
1747 args->flags |= XFSMNT_IHASHSIZE;
1748 args->ihashsize = simple_strtoul(value, &eov, 10);
1749 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
1750 args->flags |= XFSMNT_WSYNC;
1751 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
1752 args->flags |= XFSMNT_OSYNCISOSYNC;
1753 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
1754 args->flags |= XFSMNT_NORECOVERY;
1755 } else if (!strcmp(this_char, MNTOPT_INO64)) {
1756 args->flags |= XFSMNT_INO64;
1757#if !XFS_BIG_INUMS
1758 printk("XFS: %s option not allowed on this system\n",
1759 MNTOPT_INO64);
1760 return EINVAL;
1761#endif
1762 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
1763 args->flags |= XFSMNT_NOALIGN;
1764 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
1765 args->flags |= XFSMNT_SWALLOC;
1766 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
1767 if (!value || !*value) {
1768 printk("XFS: %s option requires an argument\n",
1769 MNTOPT_SUNIT);
1770 return EINVAL;
1771 }
1772 dsunit = simple_strtoul(value, &eov, 10);
1773 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
1774 if (!value || !*value) {
1775 printk("XFS: %s option requires an argument\n",
1776 MNTOPT_SWIDTH);
1777 return EINVAL;
1778 }
1779 dswidth = simple_strtoul(value, &eov, 10);
1780 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
1781 args->flags &= ~XFSMNT_32BITINODES;
1782#if !XFS_BIG_INUMS
1783 printk("XFS: %s option not allowed on this system\n",
1784 MNTOPT_64BITINODE);
1785 return EINVAL;
1786#endif
1787 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
1788 args->flags |= XFSMNT_NOUUID;
1789 } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
1790 args->flags |= XFSMNT_NOLOGFLUSH;
1791 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
1792 args->flags &= ~XFSMNT_IDELETE;
1793 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
1794 args->flags |= XFSMNT_IDELETE;
1795 } else if (!strcmp(this_char, "osyncisdsync")) {
1796 /* no-op, this is now the default */
1797printk("XFS: osyncisdsync is now the default, option is deprecated.\n");
1798 } else if (!strcmp(this_char, "irixsgid")) {
1799printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n");
1800 } else {
1801 printk("XFS: unknown mount option [%s].\n", this_char);
1802 return EINVAL;
1803 }
1804 }
1805
1806 if (args->flags & XFSMNT_NORECOVERY) {
1807 if ((vfsp->vfs_flag & VFS_RDONLY) == 0) {
1808 printk("XFS: no-recovery mounts must be read-only.\n");
1809 return EINVAL;
1810 }
1811 }
1812
1813 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
1814 printk(
1815 "XFS: sunit and swidth options incompatible with the noalign option\n");
1816 return EINVAL;
1817 }
1818
1819 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
1820 printk("XFS: sunit and swidth must be specified together\n");
1821 return EINVAL;
1822 }
1823
1824 if (dsunit && (dswidth % dsunit != 0)) {
1825 printk(
1826 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
1827 dswidth, dsunit);
1828 return EINVAL;
1829 }
1830
1831 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1832 if (dsunit) {
1833 args->sunit = dsunit;
1834 args->flags |= XFSMNT_RETERR;
1835 } else {
1836 args->sunit = vol_dsunit;
1837 }
1838 dswidth ? (args->swidth = dswidth) :
1839 (args->swidth = vol_dswidth);
1840 } else {
1841 args->sunit = args->swidth = 0;
1842 }
1843
1844 return 0;
1845}
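/*
 * Worked example for the geometry checks above (hypothetical numbers):
 * sunit=512,swidth=4096 passes, since 4096 % 512 == 0 (a width of eight
 * stripe units); sunit=512,swidth=4000 is rejected, as is giving either
 * value alone or combining them with the noalign option.
 */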
1846
1847int
1848xfs_showargs(
1849 struct bhv_desc *bhv,
1850 struct seq_file *m)
1851{
1852 static struct proc_xfs_info {
1853 int flag;
1854 char *str;
1855 } xfs_info[] = {
1856 /* the few simple ones we can get from the mount struct */
1857 { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC },
1858 { XFS_MOUNT_INO64, "," MNTOPT_INO64 },
1859 { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN },
1860 { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC },
1861 { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID },
1862 { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY },
1863 { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC },
1864 { XFS_MOUNT_NOLOGFLUSH, "," MNTOPT_NOLOGFLUSH },
1865 { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP },
1866 { 0, NULL }
1867 };
1868 struct proc_xfs_info *xfs_infop;
1869 struct xfs_mount *mp = XFS_BHVTOM(bhv);
1870
1871 for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
1872 if (mp->m_flags & xfs_infop->flag)
1873 seq_puts(m, xfs_infop->str);
1874 }
1875
1876 if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
1877 seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize);
1878
1879 if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
1880 seq_printf(m, "," MNTOPT_BIOSIZE "=%d", mp->m_writeio_log);
1881
1882 if (mp->m_logbufs > 0)
1883 seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
1884
1885 if (mp->m_logbsize > 0)
1886 seq_printf(m, "," MNTOPT_LOGBSIZE "=%d", mp->m_logbsize);
1887
1888 if (mp->m_ddev_targp != mp->m_logdev_targp)
1889 seq_printf(m, "," MNTOPT_LOGDEV "=%s",
1890 XFS_BUFTARG_NAME(mp->m_logdev_targp));
1891
1892 if (mp->m_rtdev_targp && mp->m_ddev_targp != mp->m_rtdev_targp)
1893 seq_printf(m, "," MNTOPT_RTDEV "=%s",
1894 XFS_BUFTARG_NAME(mp->m_rtdev_targp));
1895
1896 if (mp->m_dalign > 0)
1897 seq_printf(m, "," MNTOPT_SUNIT "=%d",
1898 (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
1899
1900 if (mp->m_swidth > 0)
1901 seq_printf(m, "," MNTOPT_SWIDTH "=%d",
1902 (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
1903
1904 if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT))
1905 seq_printf(m, "," MNTOPT_64BITINODE);
1906
1907 return 0;
1908}
1909
1910STATIC void
1911xfs_freeze(
1912 bhv_desc_t *bdp)
1913{
1914 xfs_mount_t *mp = XFS_BHVTOM(bdp);
1915
1916 while (atomic_read(&mp->m_active_trans) > 0)
1917 delay(100);
1918
1919 /* Push the superblock and write an unmount record */
1920 xfs_log_unmount_write(mp);
1921 xfs_unmountfs_writesb(mp);
1922}
1923
1924
1925vfsops_t xfs_vfsops = {
1926 BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS),
1927 .vfs_parseargs = xfs_parseargs,
1928 .vfs_showargs = xfs_showargs,
1929 .vfs_mount = xfs_mount,
1930 .vfs_unmount = xfs_unmount,
1931 .vfs_mntupdate = xfs_mntupdate,
1932 .vfs_root = xfs_root,
1933 .vfs_statvfs = xfs_statvfs,
1934 .vfs_sync = xfs_sync,
1935 .vfs_vget = xfs_vget,
1936 .vfs_dmapiops = (vfs_dmapiops_t)fs_nosys,
1937 .vfs_quotactl = (vfs_quotactl_t)fs_nosys,
1938 .vfs_init_vnode = xfs_initialize_vnode,
1939 .vfs_force_shutdown = xfs_do_force_shutdown,
1940 .vfs_freeze = xfs_freeze,
1941};
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
new file mode 100644
index 000000000000..70092963ca9e
--- /dev/null
+++ b/fs/xfs/xfs_vnodeops.c
@@ -0,0 +1,4712 @@
1/*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33#include "xfs.h"
34#include "xfs_macros.h"
35#include "xfs_types.h"
36#include "xfs_inum.h"
37#include "xfs_log.h"
38#include "xfs_trans.h"
39#include "xfs_sb.h"
40#include "xfs_ag.h"
41#include "xfs_dir.h"
42#include "xfs_dir2.h"
43#include "xfs_dmapi.h"
44#include "xfs_mount.h"
45#include "xfs_alloc_btree.h"
46#include "xfs_bmap_btree.h"
47#include "xfs_ialloc_btree.h"
48#include "xfs_itable.h"
49#include "xfs_btree.h"
50#include "xfs_ialloc.h"
51#include "xfs_alloc.h"
52#include "xfs_attr_sf.h"
53#include "xfs_dir_sf.h"
54#include "xfs_dir2_sf.h"
55#include "xfs_dinode.h"
56#include "xfs_inode_item.h"
57#include "xfs_inode.h"
58#include "xfs_bmap.h"
59#include "xfs_da_btree.h"
60#include "xfs_attr.h"
61#include "xfs_rw.h"
62#include "xfs_refcache.h"
63#include "xfs_error.h"
64#include "xfs_bit.h"
65#include "xfs_rtalloc.h"
66#include "xfs_quota.h"
67#include "xfs_utils.h"
68#include "xfs_trans_space.h"
69#include "xfs_dir_leaf.h"
70#include "xfs_mac.h"
71#include "xfs_log_priv.h"
72
73
74/*
75 * The maximum pathlen is 1024 bytes. Since the minimum file system
76 * blocksize is 512 bytes, we can get a max of 2 extents back from
77 * bmapi.
78 */
79#define SYMLINK_MAPS 2
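/*
 * (That is, ceil(1024 bytes / 512-byte blocks) == 2 file system blocks,
 * so two extent mappings always suffice for a symlink's data.)
 */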
80
81/*
82 * For xfs, we check that the file isn't too big to be opened by this kernel.
83 * No other open action is required for regular files. Devices are handled
84 * through the specfs file system, pipes through fifofs. Device and
85 * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
86 * when a new vnode is first looked up or created.
87 */
88STATIC int
89xfs_open(
90 bhv_desc_t *bdp,
91 cred_t *credp)
92{
93 int mode;
94 vnode_t *vp;
95 xfs_inode_t *ip;
96
97 vp = BHV_TO_VNODE(bdp);
98 ip = XFS_BHVTOI(bdp);
99
100 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
101 return XFS_ERROR(EIO);
102
103 /*
104 * If it's a directory with any blocks, read-ahead block 0
105 * as we're almost certain to have the next operation be a read there.
106 */
107 if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
108 mode = xfs_ilock_map_shared(ip);
109 if (ip->i_d.di_nextents > 0)
110 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
111 xfs_iunlock(ip, mode);
112 }
113 return 0;
114}
115
116
117/*
118 * xfs_getattr
119 */
120STATIC int
121xfs_getattr(
122 bhv_desc_t *bdp,
123 vattr_t *vap,
124 int flags,
125 cred_t *credp)
126{
127 xfs_inode_t *ip;
128 xfs_mount_t *mp;
129 vnode_t *vp;
130
131 vp = BHV_TO_VNODE(bdp);
132 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
133
134 ip = XFS_BHVTOI(bdp);
135 mp = ip->i_mount;
136
137 if (XFS_FORCED_SHUTDOWN(mp))
138 return XFS_ERROR(EIO);
139
140 if (!(flags & ATTR_LAZY))
141 xfs_ilock(ip, XFS_ILOCK_SHARED);
142
143 vap->va_size = ip->i_d.di_size;
144 if (vap->va_mask == XFS_AT_SIZE)
145 goto all_done;
146
147 vap->va_nblocks =
148 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
149 vap->va_nodeid = ip->i_ino;
150#if XFS_BIG_INUMS
151 vap->va_nodeid += mp->m_inoadd;
152#endif
153 vap->va_nlink = ip->i_d.di_nlink;
154
155 /*
156 * Quick exit for non-stat callers
157 */
158 if ((vap->va_mask &
159 ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
160 XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
161 goto all_done;
162
163 /*
164 * Copy from in-core inode.
165 */
166 vap->va_type = vp->v_type;
167 vap->va_mode = ip->i_d.di_mode & MODEMASK;
168 vap->va_uid = ip->i_d.di_uid;
169 vap->va_gid = ip->i_d.di_gid;
170 vap->va_projid = ip->i_d.di_projid;
171
172 /*
173	 * Check whether the vnode type is block/char vs. everything
174	 * else. Do it with a bitmask because that's faster than
175	 * testing multiple values individually.
176 */
177 if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
178 vap->va_rdev = 0;
179
180 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
181
182#if 0
183 /* Large block sizes confuse various
184 * user space programs, so letting the
185 * stripe size through is not a good
186 * idea for now.
187 */
188 vap->va_blocksize = mp->m_swidth ?
189 /*
190 * If the underlying volume is a stripe, then
191 * return the stripe width in bytes as the
192 * recommended I/O size.
193 */
194 (mp->m_swidth << mp->m_sb.sb_blocklog) :
195 /*
196 * Return the largest of the preferred buffer
197 * sizes since doing small I/Os into larger
198 * buffers causes buffers to be decommissioned.
199 * The value returned is in bytes.
200 */
201 (1 << (int)MAX(mp->m_readio_log,
202 mp->m_writeio_log));
203
204#else
205 vap->va_blocksize =
206 /*
207 * Return the largest of the preferred buffer
208 * sizes since doing small I/Os into larger
209 * buffers causes buffers to be decommissioned.
210 * The value returned is in bytes.
211 */
212 1 << (int)MAX(mp->m_readio_log,
213 mp->m_writeio_log);
214#endif
215 } else {
216
217 /*
218 * If the file blocks are being allocated from a
219 * realtime partition, then return the inode's
220 * realtime extent size or the realtime volume's
221 * extent size.
222 */
223 vap->va_blocksize = ip->i_d.di_extsize ?
224 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
225 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
226 }
227 } else {
228 vap->va_rdev = ip->i_df.if_u2.if_rdev;
229 vap->va_blocksize = BLKDEV_IOSIZE;
230 }
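		/*
		 * (For example, assuming the typical 64k defaults for
		 * both preferred I/O sizes -- readio_log == writeio_log
		 * == 16 -- the MAX() computation above reports
		 * 1 << 16 == 64k as va_blocksize.)
		 */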
231
232 vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
233 vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
234 vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
235 vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
236 vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
237 vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
238
239 /*
240 * Exit for stat callers. See if any of the rest of the fields
241 * to be filled in are needed.
242 */
243 if ((vap->va_mask &
244 (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
245 XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
246 goto all_done;
247
248 /*
249 * Convert di_flags to xflags.
250 */
251 vap->va_xflags = xfs_ip2xflags(ip);
252
253 /*
254 * Exit for inode revalidate. See if any of the rest of
255 * the fields to be filled in are needed.
256 */
257 if ((vap->va_mask &
258 (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
259 XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
260 goto all_done;
261
262 vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
263 vap->va_nextents =
264 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
265 ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
266 ip->i_d.di_nextents;
267 if (ip->i_afp)
268 vap->va_anextents =
269 (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
270 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
271 ip->i_d.di_anextents;
272 else
273 vap->va_anextents = 0;
274 vap->va_gen = ip->i_d.di_gen;
275
276 all_done:
277 if (!(flags & ATTR_LAZY))
278 xfs_iunlock(ip, XFS_ILOCK_SHARED);
279 return 0;
280}
281
282
283/*
284 * xfs_setattr
285 */
286int
287xfs_setattr(
288 bhv_desc_t *bdp,
289 vattr_t *vap,
290 int flags,
291 cred_t *credp)
292{
293 xfs_inode_t *ip;
294 xfs_trans_t *tp;
295 xfs_mount_t *mp;
296 int mask;
297 int code;
298 uint lock_flags;
299 uint commit_flags=0;
300 uid_t uid=0, iuid=0;
301 gid_t gid=0, igid=0;
302 int timeflags = 0;
303 vnode_t *vp;
304 xfs_prid_t projid=0, iprojid=0;
305 int mandlock_before, mandlock_after;
306 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
307 int file_owner;
308 int need_iolock = (flags & ATTR_DMI) == 0;
309
310 vp = BHV_TO_VNODE(bdp);
311 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
312
313 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
314 return XFS_ERROR(EROFS);
315
316 /*
317 * Cannot set certain attributes.
318 */
319 mask = vap->va_mask;
320 if (mask & XFS_AT_NOSET) {
321 return XFS_ERROR(EINVAL);
322 }
323
324 ip = XFS_BHVTOI(bdp);
325 mp = ip->i_mount;
326
327 if (XFS_FORCED_SHUTDOWN(mp))
328 return XFS_ERROR(EIO);
329
330 /*
331	 * Timestamp updates do not need to be logged and hence do
332	 * not need to be done within a transaction.
333 */
334 if (mask & XFS_AT_UPDTIMES) {
335 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
336 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
337 ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
338 ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
339 xfs_ichgtime(ip, timeflags);
340 return 0;
341 }
342
343 olddquot1 = olddquot2 = NULL;
344 udqp = gdqp = NULL;
345
346 /*
347	 * If disk quotas are on, we make sure that the dquots do exist on disk
348	 * before we start any other transactions. Trying to do this later
349	 * is messy. We don't care to take a readlock to look at the ids
350	 * in the inode here, because we can't hold it across the trans_reserve.
351 * If the IDs do change before we take the ilock, we're covered
352 * because the i_*dquot fields will get updated anyway.
353 */
354 if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) {
355 uint qflags = 0;
356
357 if (mask & XFS_AT_UID) {
358 uid = vap->va_uid;
359 qflags |= XFS_QMOPT_UQUOTA;
360 } else {
361 uid = ip->i_d.di_uid;
362 }
363 if (mask & XFS_AT_GID) {
364 gid = vap->va_gid;
365 qflags |= XFS_QMOPT_GQUOTA;
366 } else {
367 gid = ip->i_d.di_gid;
368 }
369 /*
370 * We take a reference when we initialize udqp and gdqp,
371 * so it is important that we never blindly double trip on
372 * the same variable. See xfs_create() for an example.
373 */
374 ASSERT(udqp == NULL);
375 ASSERT(gdqp == NULL);
376 code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp);
377 if (code)
378 return (code);
379 }
380
381 /*
382 * For the other attributes, we acquire the inode lock and
383 * first do an error checking pass.
384 */
385 tp = NULL;
386 lock_flags = XFS_ILOCK_EXCL;
387 if (!(mask & XFS_AT_SIZE)) {
388 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
389 (mp->m_flags & XFS_MOUNT_WSYNC)) {
390 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
391 commit_flags = 0;
392 if ((code = xfs_trans_reserve(tp, 0,
393 XFS_ICHANGE_LOG_RES(mp), 0,
394 0, 0))) {
395 lock_flags = 0;
396 goto error_return;
397 }
398 }
399 } else {
400 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
401 !(flags & ATTR_DMI)) {
402 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
403 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
404 vap->va_size, 0, dmflags, NULL);
405 if (code) {
406 lock_flags = 0;
407 goto error_return;
408 }
409 }
410 if (need_iolock)
411 lock_flags |= XFS_IOLOCK_EXCL;
412 }
413
414 xfs_ilock(ip, lock_flags);
415
416 /* boolean: are we the file owner? */
417 file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
418
419 /*
420 * Change various properties of a file.
421 * Only the owner or users with CAP_FOWNER
422 * capability may do these things.
423 */
424 if (mask &
425 (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
426 XFS_AT_GID|XFS_AT_PROJID)) {
427 /*
428 * CAP_FOWNER overrides the following restrictions:
429 *
430 * The user ID of the calling process must be equal
431 * to the file owner ID, except in cases where the
432 * CAP_FSETID capability is applicable.
433 */
434 if (!file_owner && !capable(CAP_FOWNER)) {
435 code = XFS_ERROR(EPERM);
436 goto error_return;
437 }
438
439 /*
440 * CAP_FSETID overrides the following restrictions:
441 *
442 * The effective user ID of the calling process shall match
443 * the file owner when setting the set-user-ID and
444 * set-group-ID bits on that file.
445 *
446 * The effective group ID or one of the supplementary group
447 * IDs of the calling process shall match the group owner of
448	 * the file when setting the set-group-ID bit on that file.
449 */
450 if (mask & XFS_AT_MODE) {
451 mode_t m = 0;
452
453 if ((vap->va_mode & S_ISUID) && !file_owner)
454 m |= S_ISUID;
455 if ((vap->va_mode & S_ISGID) &&
456 !in_group_p((gid_t)ip->i_d.di_gid))
457 m |= S_ISGID;
458#if 0
459 /* Linux allows this, Irix doesn't. */
460 if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR)
461 m |= S_ISVTX;
462#endif
463 if (m && !capable(CAP_FSETID))
464 vap->va_mode &= ~m;
465 }
466 }
467
468 /*
469 * Change file ownership. Must be the owner or privileged.
470 * If the system was configured with the "restricted_chown"
471 * option, the owner is not permitted to give away the file,
472 * and can change the group id only to a group of which he
473 * or she is a member.
474 */
475 if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
476 /*
477 * These IDs could have changed since we last looked at them.
478 * But, we're assured that if the ownership did change
479	 * while we didn't have the inode locked, the inode's dquot(s)
480 * would have changed also.
481 */
482 iuid = ip->i_d.di_uid;
483 iprojid = ip->i_d.di_projid;
484 igid = ip->i_d.di_gid;
485 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
486 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
487 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
488 iprojid;
489
490 /*
491 * CAP_CHOWN overrides the following restrictions:
492 *
493 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
494 * shall override the restriction that a process cannot
495 * change the user ID of a file it owns and the restriction
496 * that the group ID supplied to the chown() function
497 * shall be equal to either the group ID or one of the
498 * supplementary group IDs of the calling process.
499 *
500 * XXX: How does restricted_chown affect projid?
501 */
502 if (restricted_chown &&
503 (iuid != uid || (igid != gid &&
504 !in_group_p((gid_t)gid))) &&
505 !capable(CAP_CHOWN)) {
506 code = XFS_ERROR(EPERM);
507 goto error_return;
508 }
509 /*
510 * Do a quota reservation only if uid or gid is actually
511 * going to change.
512 */
513 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
514 (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
515 ASSERT(tp);
516 code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
517 capable(CAP_FOWNER) ?
518 XFS_QMOPT_FORCE_RES : 0);
519 if (code) /* out of quota */
520 goto error_return;
521 }
522 }
523
524 /*
525 * Truncate file. Must have write permission and not be a directory.
526 */
527 if (mask & XFS_AT_SIZE) {
528 /* Short circuit the truncate case for zero length files */
529 if ((vap->va_size == 0) &&
530 (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
531 xfs_iunlock(ip, XFS_ILOCK_EXCL);
532 lock_flags &= ~XFS_ILOCK_EXCL;
533 if (mask & XFS_AT_CTIME)
534 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
535 code = 0;
536 goto error_return;
537 }
538
539 if (vp->v_type == VDIR) {
540 code = XFS_ERROR(EISDIR);
541 goto error_return;
542 } else if (vp->v_type != VREG) {
543 code = XFS_ERROR(EINVAL);
544 goto error_return;
545 }
546 /*
547 * Make sure that the dquots are attached to the inode.
548 */
549 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
550 goto error_return;
551 }
552
553 /*
554 * Change file access or modified times.
555 */
556 if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
557 if (!file_owner) {
558 if ((flags & ATTR_UTIME) &&
559 !capable(CAP_FOWNER)) {
560 code = XFS_ERROR(EPERM);
561 goto error_return;
562 }
563 }
564 }
565
566 /*
567 * Change extent size or realtime flag.
568 */
569 if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
570 /*
571 * Can't change extent size if any extents are allocated.
572 */
573 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
574 (mask & XFS_AT_EXTSIZE) &&
575 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
576 vap->va_extsize) ) {
577 code = XFS_ERROR(EINVAL); /* EFBIG? */
578 goto error_return;
579 }
580
581 /*
582 * Can't set extent size unless the file is marked, or
583 * about to be marked as a realtime file.
584 *
585 * This check will be removed when fixed size extents
586 * with buffered data writes is implemented.
587 *
588 */
589 if ((mask & XFS_AT_EXTSIZE) &&
590 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
591 vap->va_extsize) &&
592 (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
593 ((mask & XFS_AT_XFLAGS) &&
594 (vap->va_xflags & XFS_XFLAG_REALTIME))))) {
595 code = XFS_ERROR(EINVAL);
596 goto error_return;
597 }
598
599 /*
600 * Can't change realtime flag if any extents are allocated.
601 */
602 if (ip->i_d.di_nextents && (mask & XFS_AT_XFLAGS) &&
603 (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
604 (vap->va_xflags & XFS_XFLAG_REALTIME)) {
605 code = XFS_ERROR(EINVAL); /* EFBIG? */
606 goto error_return;
607 }
608 /*
609 * Extent size must be a multiple of the appropriate block
610 * size, if set at all.
611 */
612 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
613 xfs_extlen_t size;
614
615 if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
616 ((mask & XFS_AT_XFLAGS) &&
617 (vap->va_xflags & XFS_XFLAG_REALTIME))) {
618 size = mp->m_sb.sb_rextsize <<
619 mp->m_sb.sb_blocklog;
620 } else {
621 size = mp->m_sb.sb_blocksize;
622 }
623 if (vap->va_extsize % size) {
624 code = XFS_ERROR(EINVAL);
625 goto error_return;
626 }
627 }
628 /*
629 * If realtime flag is set then must have realtime data.
630 */
631 if ((mask & XFS_AT_XFLAGS) &&
632 (vap->va_xflags & XFS_XFLAG_REALTIME)) {
633 if ((mp->m_sb.sb_rblocks == 0) ||
634 (mp->m_sb.sb_rextsize == 0) ||
635 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
636 code = XFS_ERROR(EINVAL);
637 goto error_return;
638 }
639 }
640
641 /*
642 * Can't modify an immutable/append-only file unless
643 * we have appropriate permission.
644 */
645 if ((mask & XFS_AT_XFLAGS) &&
646 (ip->i_d.di_flags &
647 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
648 (vap->va_xflags &
649 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
650 !capable(CAP_LINUX_IMMUTABLE)) {
651 code = XFS_ERROR(EPERM);
652 goto error_return;
653 }
654 }
655
656 /*
657 * Now we can make the changes. Before we join the inode
658 * to the transaction, if XFS_AT_SIZE is set then take care of
659 * the part of the truncation that must be done without the
660 * inode lock. This needs to be done before joining the inode
661 * to the transaction, because the inode cannot be unlocked
662 * once it is a part of the transaction.
663 */
664 if (mask & XFS_AT_SIZE) {
665 code = 0;
666 if (vap->va_size > ip->i_d.di_size)
667 code = xfs_igrow_start(ip, vap->va_size, credp);
668 xfs_iunlock(ip, XFS_ILOCK_EXCL);
669 if (!code)
670 code = xfs_itruncate_data(ip, vap->va_size);
671 if (code) {
672 ASSERT(tp == NULL);
673 lock_flags &= ~XFS_ILOCK_EXCL;
674 ASSERT(lock_flags == XFS_IOLOCK_EXCL);
675 goto error_return;
676 }
677 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
678 if ((code = xfs_trans_reserve(tp, 0,
679 XFS_ITRUNCATE_LOG_RES(mp), 0,
680 XFS_TRANS_PERM_LOG_RES,
681 XFS_ITRUNCATE_LOG_COUNT))) {
682 xfs_trans_cancel(tp, 0);
683 if (need_iolock)
684 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
685 return code;
686 }
687 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
688 xfs_ilock(ip, XFS_ILOCK_EXCL);
689 }
690
691 if (tp) {
692 xfs_trans_ijoin(tp, ip, lock_flags);
693 xfs_trans_ihold(tp, ip);
694 }
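	/*
	 * (xfs_trans_ijoin() attaches the locked inode to the
	 * transaction; xfs_trans_ihold() tells the transaction not to
	 * release the inode when it commits, so the inode stays locked
	 * and referenced for the rest of this routine.)
	 */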
695
696 /* determine whether mandatory locking mode changes */
697 mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
698
699 /*
700 * Truncate file. Must have write permission and not be a directory.
701 */
702 if (mask & XFS_AT_SIZE) {
703 if (vap->va_size > ip->i_d.di_size) {
704 xfs_igrow_finish(tp, ip, vap->va_size,
705 !(flags & ATTR_DMI));
706 } else if ((vap->va_size <= ip->i_d.di_size) ||
707 ((vap->va_size == 0) && ip->i_d.di_nextents)) {
708 /*
709 * signal a sync transaction unless
710 * we're truncating an already unlinked
711 * file on a wsync filesystem
712 */
713 code = xfs_itruncate_finish(&tp, ip,
714 (xfs_fsize_t)vap->va_size,
715 XFS_DATA_FORK,
716 ((ip->i_d.di_nlink != 0 ||
717 !(mp->m_flags & XFS_MOUNT_WSYNC))
718 ? 1 : 0));
719 if (code) {
720 goto abort_return;
721 }
722 }
723 /*
724 * Have to do this even if the file's size doesn't change.
725 */
726 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
727 }
728
729 /*
730 * Change file access modes.
731 */
732 if (mask & XFS_AT_MODE) {
733 ip->i_d.di_mode &= S_IFMT;
734 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
735
736 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
737 timeflags |= XFS_ICHGTIME_CHG;
738 }
739
740 /*
741 * Change file ownership. Must be the owner or privileged.
742 * If the system was configured with the "restricted_chown"
743 * option, the owner is not permitted to give away the file,
744 * and can change the group id only to a group of which he
745 * or she is a member.
746 */
747 if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
748 /*
749 * CAP_FSETID overrides the following restrictions:
750 *
751 * The set-user-ID and set-group-ID bits of a file will be
752 * cleared upon successful return from chown()
753 */
754 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
755 !capable(CAP_FSETID)) {
756 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
757 }
758
759 /*
760 * Change the ownerships and register quota modifications
761 * in the transaction.
762 */
763 if (iuid != uid) {
764 if (XFS_IS_UQUOTA_ON(mp)) {
765 ASSERT(mask & XFS_AT_UID);
766 ASSERT(udqp);
767 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
768 &ip->i_udquot, udqp);
769 }
770 ip->i_d.di_uid = uid;
771 }
772 if (igid != gid) {
773 if (XFS_IS_GQUOTA_ON(mp)) {
774 ASSERT(mask & XFS_AT_GID);
775 ASSERT(gdqp);
776 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
777 &ip->i_gdquot, gdqp);
778 }
779 ip->i_d.di_gid = gid;
780 }
781 if (iprojid != projid) {
782 ip->i_d.di_projid = projid;
783 /*
784 * We may have to rev the inode as well as
785 * the superblock version number since projids didn't
786 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
787 */
788 if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
789 xfs_bump_ino_vers2(tp, ip);
790 }
791
792 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
793 timeflags |= XFS_ICHGTIME_CHG;
794 }
795
796
797 /*
798 * Change file access or modified times.
799 */
800 if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
801 if (mask & XFS_AT_ATIME) {
802 ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
803 ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
804 ip->i_update_core = 1;
805 timeflags &= ~XFS_ICHGTIME_ACC;
806 }
807 if (mask & XFS_AT_MTIME) {
808 ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
809 ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
810 timeflags &= ~XFS_ICHGTIME_MOD;
811 timeflags |= XFS_ICHGTIME_CHG;
812 }
813 if (tp && (flags & ATTR_UTIME))
814 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
815 }
816
817 /*
818 * Change XFS-added attributes.
819 */
820 if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
821 if (mask & XFS_AT_EXTSIZE) {
822 /*
823 * Converting bytes to fs blocks.
824 */
825 ip->i_d.di_extsize = vap->va_extsize >>
826 mp->m_sb.sb_blocklog;
827 }
828 if (mask & XFS_AT_XFLAGS) {
829 uint di_flags;
830
831 /* can't set PREALLOC this way, just preserve it */
832 di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
833 if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
834 di_flags |= XFS_DIFLAG_IMMUTABLE;
835 if (vap->va_xflags & XFS_XFLAG_APPEND)
836 di_flags |= XFS_DIFLAG_APPEND;
837 if (vap->va_xflags & XFS_XFLAG_SYNC)
838 di_flags |= XFS_DIFLAG_SYNC;
839 if (vap->va_xflags & XFS_XFLAG_NOATIME)
840 di_flags |= XFS_DIFLAG_NOATIME;
841 if (vap->va_xflags & XFS_XFLAG_NODUMP)
842 di_flags |= XFS_DIFLAG_NODUMP;
843 if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
844 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
845 di_flags |= XFS_DIFLAG_RTINHERIT;
846 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
847 di_flags |= XFS_DIFLAG_NOSYMLINKS;
848 } else {
849 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
850 di_flags |= XFS_DIFLAG_REALTIME;
851 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
852 } else {
853 ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
854 }
855 }
856 ip->i_d.di_flags = di_flags;
857 }
858 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
859 timeflags |= XFS_ICHGTIME_CHG;
860 }
861
862 /*
863 * Change file inode change time only if XFS_AT_CTIME set
864 * AND we have been called by a DMI function.
865 */
866
867 if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
868 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
869 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
870 ip->i_update_core = 1;
871 timeflags &= ~XFS_ICHGTIME_CHG;
872 }
873
874 /*
875 * Send out timestamp changes that need to be set to the
876 * current time. Not done when called by a DMI function.
877 */
878 if (timeflags && !(flags & ATTR_DMI))
879 xfs_ichgtime(ip, timeflags);
880
881 XFS_STATS_INC(xs_ig_attrchg);
882
883 /*
884 * If this is a synchronous mount, make sure that the
885 * transaction goes to disk before returning to the user.
886 * This is slightly sub-optimal in that truncates require
887	 * two sync transactions instead of one for wsync filesystems.
888 * One for the truncate and one for the timestamps since we
889 * don't want to change the timestamps unless we're sure the
890 * truncate worked. Truncates are less than 1% of the laddis
891 * mix so this probably isn't worth the trouble to optimize.
892 */
893 code = 0;
894 if (tp) {
895 if (mp->m_flags & XFS_MOUNT_WSYNC)
896 xfs_trans_set_sync(tp);
897
898 code = xfs_trans_commit(tp, commit_flags, NULL);
899 }
900
901 /*
902 * If the (regular) file's mandatory locking mode changed, then
903 * notify the vnode. We do this under the inode lock to prevent
904 * racing calls to vop_vnode_change.
905 */
906 mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
907 if (mandlock_before != mandlock_after) {
908 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
909 mandlock_after);
910 }
911
912 xfs_iunlock(ip, lock_flags);
913
914 /*
915 * Release any dquot(s) the inode had kept before chown.
916 */
917 XFS_QM_DQRELE(mp, olddquot1);
918 XFS_QM_DQRELE(mp, olddquot2);
919 XFS_QM_DQRELE(mp, udqp);
920 XFS_QM_DQRELE(mp, gdqp);
921
922 if (code) {
923 return code;
924 }
925
926 if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
927 !(flags & ATTR_DMI)) {
928 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
929 NULL, DM_RIGHT_NULL, NULL, NULL,
930 0, 0, AT_DELAY_FLAG(flags));
931 }
932 return 0;
933
934 abort_return:
935 commit_flags |= XFS_TRANS_ABORT;
936 /* FALLTHROUGH */
937 error_return:
938 XFS_QM_DQRELE(mp, udqp);
939 XFS_QM_DQRELE(mp, gdqp);
940 if (tp) {
941 xfs_trans_cancel(tp, commit_flags);
942 }
943 if (lock_flags != 0) {
944 xfs_iunlock(ip, lock_flags);
945 }
946 return code;
947}
948
949
950/*
951 * xfs_access
952 * Null conversion from vnode mode bits to inode mode bits, as in efs.
953 */
954STATIC int
955xfs_access(
956 bhv_desc_t *bdp,
957 int mode,
958 cred_t *credp)
959{
960 xfs_inode_t *ip;
961 int error;
962
963 vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
964 (inst_t *)__return_address);
965
966 ip = XFS_BHVTOI(bdp);
967 xfs_ilock(ip, XFS_ILOCK_SHARED);
968 error = xfs_iaccess(ip, mode, credp);
969 xfs_iunlock(ip, XFS_ILOCK_SHARED);
970 return error;
971}
972
973
974/*
975 * xfs_readlink
976 *
977 */
978STATIC int
979xfs_readlink(
980 bhv_desc_t *bdp,
981 uio_t *uiop,
982 int ioflags,
983 cred_t *credp)
984{
985 xfs_inode_t *ip;
986 int count;
987 xfs_off_t offset;
988 int pathlen;
989 vnode_t *vp;
990 int error = 0;
991 xfs_mount_t *mp;
992 int nmaps;
993 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
994 xfs_daddr_t d;
995 int byte_cnt;
996 int n;
997 xfs_buf_t *bp;
998
999 vp = BHV_TO_VNODE(bdp);
1000 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1001
1002 ip = XFS_BHVTOI(bdp);
1003 mp = ip->i_mount;
1004
1005 if (XFS_FORCED_SHUTDOWN(mp))
1006 return XFS_ERROR(EIO);
1007
1008 xfs_ilock(ip, XFS_ILOCK_SHARED);
1009
1010 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
1011
1012 offset = uiop->uio_offset;
1013 count = uiop->uio_resid;
1014
1015 if (offset < 0) {
1016 error = XFS_ERROR(EINVAL);
1017 goto error_return;
1018 }
1019 if (count <= 0) {
1020 error = 0;
1021 goto error_return;
1022 }
1023
1024 if (!(ioflags & IO_INVIS)) {
1025 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
1026 }
1027
1028 /*
1029 * See if the symlink is stored inline.
1030 */
1031 pathlen = (int)ip->i_d.di_size;
1032
1033 if (ip->i_df.if_flags & XFS_IFINLINE) {
1034 error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1035 }
1036 else {
1037 /*
1038 * Symlink not inline. Call bmap to get it in.
1039 */
1040 nmaps = SYMLINK_MAPS;
1041
1042 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1043 0, NULL, 0, mval, &nmaps, NULL);
1044
1045 if (error) {
1046 goto error_return;
1047 }
1048
1049 for (n = 0; n < nmaps; n++) {
1050 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1051 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1052 bp = xfs_buf_read(mp->m_ddev_targp, d,
1053 BTOBB(byte_cnt), 0);
1054 error = XFS_BUF_GETERROR(bp);
1055 if (error) {
1056 xfs_ioerror_alert("xfs_readlink",
1057 ip->i_mount, bp, XFS_BUF_ADDR(bp));
1058 xfs_buf_relse(bp);
1059 goto error_return;
1060 }
1061 if (pathlen < byte_cnt)
1062 byte_cnt = pathlen;
1063 pathlen -= byte_cnt;
1064
1065 error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1066 xfs_buf_relse (bp);
1067 }
1068
1069 }
1070
1071
1072error_return:
1073
1074 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1075
1076 return error;
1077}
1078
1079
1080/*
1081 * xfs_fsync
1082 *
1083 * This is called to sync the inode and its data out to disk.
1084 * We need to hold the I/O lock while flushing the data, and
1085 * the inode lock while flushing the inode. The inode lock CANNOT
1086	 * be held while flushing the data, so acquire it after we're done
1087 * with that.
1088 */
1089STATIC int
1090xfs_fsync(
1091 bhv_desc_t *bdp,
1092 int flag,
1093 cred_t *credp,
1094 xfs_off_t start,
1095 xfs_off_t stop)
1096{
1097 xfs_inode_t *ip;
1098 xfs_trans_t *tp;
1099 int error;
1100
1101 vn_trace_entry(BHV_TO_VNODE(bdp),
1102 __FUNCTION__, (inst_t *)__return_address);
1103
1104 ip = XFS_BHVTOI(bdp);
1105
1106 ASSERT(start >= 0 && stop >= -1);
1107
1108 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1109 return XFS_ERROR(EIO);
1110
1111 /*
1112 * We always need to make sure that the required inode state
1113	 * is safe on disk. The vnode might be clean, but still need
1114	 * flushing because of committed transactions that haven't hit the disk yet.
1115 * Likewise, there could be unflushed non-transactional
1116 * changes to the inode core that have to go to disk.
1117 *
1118 * The following code depends on one assumption: that
1119 * any transaction that changes an inode logs the core
1120 * because it has to change some field in the inode core
1121 * (typically nextents or nblocks). That assumption
1122 * implies that any transactions against an inode will
1123 * catch any non-transactional updates. If inode-altering
1124 * transactions exist that violate this assumption, the
1125 * code breaks. Right now, it figures that if the involved
1126 * update_* field is clear and the inode is unpinned, the
1127 * inode is clean. Either it's been flushed or it's been
1128 * committed and the commit has hit the disk unpinning the inode.
1129 * (Note that xfs_inode_item_format() called at commit clears
1130 * the update_* fields.)
1131 */
1132 xfs_ilock(ip, XFS_ILOCK_SHARED);
1133
1134 /* If we are flushing data then we care about update_size
1135 * being set, otherwise we care about update_core
1136 */
1137 if ((flag & FSYNC_DATA) ?
1138 (ip->i_update_size == 0) :
1139 (ip->i_update_core == 0)) {
1140 /*
1141 * Timestamps/size haven't changed since last inode
1142 * flush or inode transaction commit. That means
1143 * either nothing got written or a transaction
1144 * committed which caught the updates. If the
1145 * latter happened and the transaction hasn't
1146	 * hit the disk yet, the inode will still
1147 * be pinned. If it is, force the log.
1148 */
1149
1150 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1151
1152 if (xfs_ipincount(ip)) {
1153 xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1154 XFS_LOG_FORCE |
1155 ((flag & FSYNC_WAIT)
1156 ? XFS_LOG_SYNC : 0));
1157 }
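		/*
		 * ("Pinned" means a committed transaction touching the
		 * inode has not yet reached the on-disk log; forcing
		 * the log is what unpins it.)
		 */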
1158 error = 0;
1159 } else {
1160 /*
1161 * Kick off a transaction to log the inode
1162 * core to get the updates. Make it
1163 * sync if FSYNC_WAIT is passed in (which
1164 * is done by everybody but specfs). The
1165 * sync transaction will also force the log.
1166 */
1167 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1168 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1169 if ((error = xfs_trans_reserve(tp, 0,
1170 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1171 0, 0, 0))) {
1172 xfs_trans_cancel(tp, 0);
1173 return error;
1174 }
1175 xfs_ilock(ip, XFS_ILOCK_EXCL);
1176
1177 /*
1178 * Note - it's possible that we might have pushed
1179 * ourselves out of the way during trans_reserve
1180 * which would flush the inode. But there's no
1181 * guarantee that the inode buffer has actually
1182 * gone out yet (it's delwri). Plus the buffer
1183 * could be pinned anyway if it's part of an
1184 * inode in another recent transaction. So we
1185 * play it safe and fire off the transaction anyway.
1186 */
1187 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1188 xfs_trans_ihold(tp, ip);
1189 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1190 if (flag & FSYNC_WAIT)
1191 xfs_trans_set_sync(tp);
1192 error = xfs_trans_commit(tp, 0, NULL);
1193
1194 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1195 }
1196 return error;
1197}
1198
1199/*
1200 * This is called by xfs_inactive to free any blocks beyond eof,
1201 * when the link count isn't zero.
1202 */
1203STATIC int
1204xfs_inactive_free_eofblocks(
1205 xfs_mount_t *mp,
1206 xfs_inode_t *ip)
1207{
1208 xfs_trans_t *tp;
1209 int error;
1210 xfs_fileoff_t end_fsb;
1211 xfs_fileoff_t last_fsb;
1212 xfs_filblks_t map_len;
1213 int nimaps;
1214 xfs_bmbt_irec_t imap;
1215
1216 /*
1217 * Figure out if there are any blocks beyond the end
1218 * of the file. If not, then there is nothing to do.
1219 */
1220 end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1221 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1222 map_len = last_fsb - end_fsb;
1223 if (map_len <= 0)
1224 return (0);
1225
1226 nimaps = 1;
1227 xfs_ilock(ip, XFS_ILOCK_SHARED);
1228 error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1229 NULL, 0, &imap, &nimaps, NULL);
1230 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1231
1232 if (!error && (nimaps != 0) &&
1233 (imap.br_startblock != HOLESTARTBLOCK)) {
1234 /*
1235 * Attach the dquots to the inode up front.
1236 */
1237 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1238 return (error);
1239
1240 /*
1241 * There are blocks after the end of file.
1242 * Free them up now by truncating the file to
1243 * its current size.
1244 */
1245 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1246
1247 /*
1248	 * Do the xfs_itruncate_start() call before
1249	 * reserving any log space, because
1250	 * itruncate_start will call into the buffer
1251	 * cache and we can't do that within a
1252	 * transaction.
1253 */
1254 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1255 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1256 ip->i_d.di_size);
1257
1258 error = xfs_trans_reserve(tp, 0,
1259 XFS_ITRUNCATE_LOG_RES(mp),
1260 0, XFS_TRANS_PERM_LOG_RES,
1261 XFS_ITRUNCATE_LOG_COUNT);
1262 if (error) {
1263 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1264 xfs_trans_cancel(tp, 0);
1265 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1266 return (error);
1267 }
1268
1269 xfs_ilock(ip, XFS_ILOCK_EXCL);
1270 xfs_trans_ijoin(tp, ip,
1271 XFS_IOLOCK_EXCL |
1272 XFS_ILOCK_EXCL);
1273 xfs_trans_ihold(tp, ip);
1274
1275 error = xfs_itruncate_finish(&tp, ip,
1276 ip->i_d.di_size,
1277 XFS_DATA_FORK,
1278 0);
1279 /*
1280 * If we get an error at this point we
1281 * simply don't bother truncating the file.
1282 */
1283 if (error) {
1284 xfs_trans_cancel(tp,
1285 (XFS_TRANS_RELEASE_LOG_RES |
1286 XFS_TRANS_ABORT));
1287 } else {
1288 error = xfs_trans_commit(tp,
1289 XFS_TRANS_RELEASE_LOG_RES,
1290 NULL);
1291 }
1292 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1293 }
1294 return (error);
1295}
1296
1297/*
1298 * Free a symlink that has blocks associated with it.
1299 */
1300STATIC int
1301xfs_inactive_symlink_rmt(
1302 xfs_inode_t *ip,
1303 xfs_trans_t **tpp)
1304{
1305 xfs_buf_t *bp;
1306 int committed;
1307 int done;
1308 int error;
1309 xfs_fsblock_t first_block;
1310 xfs_bmap_free_t free_list;
1311 int i;
1312 xfs_mount_t *mp;
1313 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1314 int nmaps;
1315 xfs_trans_t *ntp;
1316 int size;
1317 xfs_trans_t *tp;
1318
1319 tp = *tpp;
1320 mp = ip->i_mount;
1321 ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1322 /*
1323 * We're freeing a symlink that has some
1324 * blocks allocated to it. Free the
1325 * blocks here. We know that we've got
1326 * either 1 or 2 extents and that we can
1327 * free them all in one bunmapi call.
1328 */
1329 ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1330 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1331 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1332 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1333 xfs_trans_cancel(tp, 0);
1334 *tpp = NULL;
1335 return error;
1336 }
1337 /*
1338 * Lock the inode, fix the size, and join it to the transaction.
1339	 * Hold it so that in the normal path we still have it locked for
1340 * the second transaction. In the error paths we need it
1341 * held so the cancel won't rele it, see below.
1342 */
1343 xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1344 size = (int)ip->i_d.di_size;
1345 ip->i_d.di_size = 0;
1346 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1347 xfs_trans_ihold(tp, ip);
1348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1349 /*
1350 * Find the block(s) so we can inval and unmap them.
1351 */
1352 done = 0;
1353 XFS_BMAP_INIT(&free_list, &first_block);
1354 nmaps = sizeof(mval) / sizeof(mval[0]);
1355 if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1356 XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1357 &free_list)))
1358 goto error0;
1359 /*
1360 * Invalidate the block(s).
1361 */
1362 for (i = 0; i < nmaps; i++) {
1363 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1364 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1365 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1366 xfs_trans_binval(tp, bp);
1367 }
1368 /*
1369 * Unmap the dead block(s) to the free_list.
1370 */
1371 if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1372 &first_block, &free_list, &done)))
1373 goto error1;
1374 ASSERT(done);
1375 /*
1376 * Commit the first transaction. This logs the EFI and the inode.
1377 */
1378 if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1379 goto error1;
1380 /*
1381 * The transaction must have been committed, since there were
1382 * actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
1383 * The new tp has the extent freeing and EFDs.
1384 */
1385 ASSERT(committed);
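	/*
	 * (EFI and EFD are the extent-free-intent and extent-free-done
	 * log items; pairing them lets log recovery redo the free if we
	 * crash between the two transactions.)
	 */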
1386 /*
1387 * The first xact was committed, so add the inode to the new one.
1388 * Mark it dirty so it will be logged and moved forward in the log as
1389 * part of every commit.
1390 */
1391 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1392 xfs_trans_ihold(tp, ip);
1393 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1394 /*
1395 * Get a new, empty transaction to return to our caller.
1396 */
1397 ntp = xfs_trans_dup(tp);
1398 /*
1399	 * Commit the transaction containing the extent freeing and EFDs.
1400 * If we get an error on the commit here or on the reserve below,
1401 * we need to unlock the inode since the new transaction doesn't
1402 * have the inode attached.
1403 */
1404 error = xfs_trans_commit(tp, 0, NULL);
1405 tp = ntp;
1406 if (error) {
1407 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1408 goto error0;
1409 }
1410 /*
1411 * Remove the memory for extent descriptions (just bookkeeping).
1412 */
1413 if (ip->i_df.if_bytes)
1414 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1415 ASSERT(ip->i_df.if_bytes == 0);
1416 /*
1417 * Put an itruncate log reservation in the new transaction
1418 * for our caller.
1419 */
1420 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1421 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1422 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1423 goto error0;
1424 }
1425 /*
1426 * Return with the inode locked but not joined to the transaction.
1427 */
1428 *tpp = tp;
1429 return 0;
1430
1431 error1:
1432 xfs_bmap_cancel(&free_list);
1433 error0:
1434 /*
1435 * Have to come here with the inode locked and either
1436 * (held and in the transaction) or (not in the transaction).
1437 * If the inode isn't held then cancel would iput it, but
1438 * that's wrong since this is inactive and the vnode ref
1439 * count is 0 already.
1440 * Cancel won't do anything to the inode if held, but it still
1441 * needs to be locked until the cancel is done, if it was
1442 * joined to the transaction.
1443 */
1444 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1445 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1446 *tpp = NULL;
1447 return error;
1448
1449}
1450
1451STATIC int
1452xfs_inactive_symlink_local(
1453 xfs_inode_t *ip,
1454 xfs_trans_t **tpp)
1455{
1456 int error;
1457
1458 ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1459 /*
1460 * We're freeing a symlink which fit into
1461 * the inode. Just free the memory used
1462 * to hold the old symlink.
1463 */
1464 error = xfs_trans_reserve(*tpp, 0,
1465 XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1466 0, XFS_TRANS_PERM_LOG_RES,
1467 XFS_ITRUNCATE_LOG_COUNT);
1468
1469 if (error) {
1470 xfs_trans_cancel(*tpp, 0);
1471 *tpp = NULL;
1472 return (error);
1473 }
1474 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1475
1476 /*
1477 * Zero length symlinks _can_ exist.
1478 */
1479 if (ip->i_df.if_bytes > 0) {
1480 xfs_idata_realloc(ip,
1481 -(ip->i_df.if_bytes),
1482 XFS_DATA_FORK);
1483 ASSERT(ip->i_df.if_bytes == 0);
1484 }
1485 return (0);
1486}
1487
1488/*
1489 * xfs_inactive_attrs
1490 */
1491STATIC int
1492xfs_inactive_attrs(
1493 xfs_inode_t *ip,
1494 xfs_trans_t **tpp)
1495{
1496 xfs_trans_t *tp;
1497 int error;
1498 xfs_mount_t *mp;
1499
1500 ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1501 tp = *tpp;
1502 mp = ip->i_mount;
1503 ASSERT(ip->i_d.di_forkoff != 0);
1504 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1505 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1506
1507 error = xfs_attr_inactive(ip);
1508 if (error) {
1509 *tpp = NULL;
1510 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1511 return (error); /* goto out*/
1512 }
1513
1514 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1515 error = xfs_trans_reserve(tp, 0,
1516 XFS_IFREE_LOG_RES(mp),
1517 0, XFS_TRANS_PERM_LOG_RES,
1518 XFS_INACTIVE_LOG_COUNT);
1519 if (error) {
1520 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1521 xfs_trans_cancel(tp, 0);
1522 *tpp = NULL;
1523 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1524 return (error);
1525 }
1526
1527 xfs_ilock(ip, XFS_ILOCK_EXCL);
1528 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1529 xfs_trans_ihold(tp, ip);
1530 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1531
1532 ASSERT(ip->i_d.di_anextents == 0);
1533
1534 *tpp = tp;
1535 return (0);
1536}
1537
1538STATIC int
1539xfs_release(
1540 bhv_desc_t *bdp)
1541{
1542 xfs_inode_t *ip;
1543 vnode_t *vp;
1544 xfs_mount_t *mp;
1545 int error;
1546
1547 vp = BHV_TO_VNODE(bdp);
1548 ip = XFS_BHVTOI(bdp);
1549
1550 if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
1551 return 0;
1552 }
1553
1554 /* If this is a read-only mount, don't do this (would generate I/O) */
1555 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1556 return 0;
1557
1558#ifdef HAVE_REFCACHE
1559 /* If we are in the NFS reference cache then don't do this now */
1560 if (ip->i_refcache)
1561 return 0;
1562#endif
1563
1564 mp = ip->i_mount;
1565
1566 if (ip->i_d.di_nlink != 0) {
1567 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1568 ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1569 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1570 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) {
1571 if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1572 return (error);
1573 /* Update linux inode block count after free above */
1574 LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1575 ip->i_d.di_nblocks + ip->i_delayed_blks);
1576 }
1577 }
1578
1579 return 0;
1580}
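/*
 * (xfs_release() runs when the last reference to an open file is
 * dropped; the net effect of the checks above is to trim blocks
 * speculatively allocated beyond EOF, except for preallocated or
 * append-only files.)
 */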
1581
1582/*
1583 * xfs_inactive
1584 *
1585 * This is called when the vnode reference count for the vnode
1586 * goes to zero. If the file has been unlinked, then it must
1587 * now be truncated. Also, we clear all of the read-ahead state
1588 * kept for the inode here since the file is now closed.
1589 */
1590STATIC int
1591xfs_inactive(
1592 bhv_desc_t *bdp,
1593 cred_t *credp)
1594{
1595 xfs_inode_t *ip;
1596 vnode_t *vp;
1597 xfs_bmap_free_t free_list;
1598 xfs_fsblock_t first_block;
1599 int committed;
1600 xfs_trans_t *tp;
1601 xfs_mount_t *mp;
1602 int error;
1603 int truncate;
1604
1605 vp = BHV_TO_VNODE(bdp);
1606 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1607
1608 ip = XFS_BHVTOI(bdp);
1609
1610 /*
1611 * If the inode is already free, then there can be nothing
1612 * to clean up here.
1613 */
1614 if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1615 ASSERT(ip->i_df.if_real_bytes == 0);
1616 ASSERT(ip->i_df.if_broot_bytes == 0);
1617 return VN_INACTIVE_CACHE;
1618 }
1619
1620 /*
1621 * Only do a truncate if it's a regular file with
1622 * some actual space in it. It's OK to look at the
1623 * inode's fields without the lock because we're the
1624 * only one with a reference to the inode.
1625 */
1626 truncate = ((ip->i_d.di_nlink == 0) &&
1627 ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
1628 ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1629
1630 mp = ip->i_mount;
1631
1632 if (ip->i_d.di_nlink == 0 &&
1633 DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1634 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1635 }
1636
1637 error = 0;
1638
1639 /* If this is a read-only mount, don't do this (would generate I/O) */
1640 if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1641 goto out;
1642
1643 if (ip->i_d.di_nlink != 0) {
1644 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1645 ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1646 (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
1647 (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) ||
1648 (ip->i_delayed_blks != 0))) {
1649 if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1650 return (VN_INACTIVE_CACHE);
1651 /* Update linux inode block count after free above */
1652 LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1653 ip->i_d.di_nblocks + ip->i_delayed_blks);
1654 }
1655 goto out;
1656 }
1657
1658 ASSERT(ip->i_d.di_nlink == 0);
1659
1660 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1661 return (VN_INACTIVE_CACHE);
1662
1663 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1664 if (truncate) {
1665 /*
1666 * Do the xfs_itruncate_start() call before
1667 * reserving any log space because itruncate_start
1668 * will call into the buffer cache and we can't
1669 * do that within a transaction.
1670 */
1671 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1672
1673 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1674
1675 error = xfs_trans_reserve(tp, 0,
1676 XFS_ITRUNCATE_LOG_RES(mp),
1677 0, XFS_TRANS_PERM_LOG_RES,
1678 XFS_ITRUNCATE_LOG_COUNT);
1679 if (error) {
1680 /* Don't call itruncate_cleanup */
1681 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1682 xfs_trans_cancel(tp, 0);
1683 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1684 return (VN_INACTIVE_CACHE);
1685 }
1686
1687 xfs_ilock(ip, XFS_ILOCK_EXCL);
1688 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1689 xfs_trans_ihold(tp, ip);
1690
1691 /*
1692	 * Normally, we have to run xfs_itruncate_finish sync.
1693	 * But if the filesystem is wsync and we're in the inactive
1694	 * path, then we know that nlink == 0, and that the
1695	 * transaction that made nlink == 0 is permanently committed
1696 * since xfs_remove runs as a synchronous transaction.
1697 */
1698 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1699 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1700
1701 if (error) {
1702 xfs_trans_cancel(tp,
1703 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1704 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1705 return (VN_INACTIVE_CACHE);
1706 }
1707 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1708
1709 /*
1710 * If we get an error while cleaning up a
1711 * symlink we bail out.
1712 */
1713 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1714 xfs_inactive_symlink_rmt(ip, &tp) :
1715 xfs_inactive_symlink_local(ip, &tp);
1716
1717 if (error) {
1718 ASSERT(tp == NULL);
1719 return (VN_INACTIVE_CACHE);
1720 }
1721
1722 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1723 xfs_trans_ihold(tp, ip);
1724 } else {
1725 error = xfs_trans_reserve(tp, 0,
1726 XFS_IFREE_LOG_RES(mp),
1727 0, XFS_TRANS_PERM_LOG_RES,
1728 XFS_INACTIVE_LOG_COUNT);
1729 if (error) {
1730 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1731 xfs_trans_cancel(tp, 0);
1732 return (VN_INACTIVE_CACHE);
1733 }
1734
1735 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1736 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1737 xfs_trans_ihold(tp, ip);
1738 }
1739
1740 /*
1741 * If there are attributes associated with the file
1742 * then blow them away now. The code calls a routine
1743 * that recursively deconstructs the attribute fork.
1744 * We need to just commit the current transaction
1745 * because we can't use it for xfs_attr_inactive().
1746 */
1747 if (ip->i_d.di_anextents > 0) {
1748 error = xfs_inactive_attrs(ip, &tp);
1749 /*
1750 * If we got an error, the transaction is already
1751 * cancelled, and the inode is unlocked. Just get out.
1752 */
1753 if (error)
1754 return (VN_INACTIVE_CACHE);
1755 } else if (ip->i_afp) {
1756 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1757 }
1758
1759 /*
1760 * Free the inode.
1761 */
1762 XFS_BMAP_INIT(&free_list, &first_block);
1763 error = xfs_ifree(tp, ip, &free_list);
1764 if (error) {
1765 /*
1766 * If we fail to free the inode, shut down. The cancel
1767	 * might do that; we need to make sure. Otherwise the
1768 * inode might be lost for a long time or forever.
1769 */
1770 if (!XFS_FORCED_SHUTDOWN(mp)) {
1771 cmn_err(CE_NOTE,
1772 "xfs_inactive: xfs_ifree() returned an error = %d on %s",
1773 error, mp->m_fsname);
1774 xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1775 }
1776 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1777 } else {
1778 /*
1779 * Credit the quota account(s). The inode is gone.
1780 */
1781 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1782
1783 /*
1784 * Just ignore errors at this point. There is
1785 * nothing we can do except to try to keep going.
1786 */
1787 (void) xfs_bmap_finish(&tp, &free_list, first_block,
1788 &committed);
1789 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1790 }
1791 /*
1792 * Release the dquots held by inode, if any.
1793 */
1794 XFS_QM_DQDETACH(mp, ip);
1795
1796 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1797
1798 out:
1799 return VN_INACTIVE_CACHE;
1800}
1801
1802
1803/*
1804 * xfs_lookup
1805 */
1806STATIC int
1807xfs_lookup(
1808 bhv_desc_t *dir_bdp,
1809 vname_t *dentry,
1810 vnode_t **vpp,
1811 int flags,
1812 vnode_t *rdir,
1813 cred_t *credp)
1814{
1815 xfs_inode_t *dp, *ip;
1816 xfs_ino_t e_inum;
1817 int error;
1818 uint lock_mode;
1819 vnode_t *dir_vp;
1820
1821 dir_vp = BHV_TO_VNODE(dir_bdp);
1822 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1823
1824 dp = XFS_BHVTOI(dir_bdp);
1825
1826 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1827 return XFS_ERROR(EIO);
1828
1829 lock_mode = xfs_ilock_map_shared(dp);
1830 error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1831 if (!error) {
1832 *vpp = XFS_ITOV(ip);
1833 ITRACE(ip);
1834 }
1835 xfs_iunlock_map_shared(dp, lock_mode);
1836 return error;
1837}
1838
1839
1840/*
1841 * xfs_create (create a new file).
1842 */
1843STATIC int
1844xfs_create(
1845 bhv_desc_t *dir_bdp,
1846 vname_t *dentry,
1847 vattr_t *vap,
1848 vnode_t **vpp,
1849 cred_t *credp)
1850{
1851 char *name = VNAME(dentry);
1852 vnode_t *dir_vp;
1853 xfs_inode_t *dp, *ip;
1854 vnode_t *vp=NULL;
1855 xfs_trans_t *tp;
1856 xfs_mount_t *mp;
1857 xfs_dev_t rdev;
1858 int error;
1859 xfs_bmap_free_t free_list;
1860 xfs_fsblock_t first_block;
1861 boolean_t dp_joined_to_trans;
1862 int dm_event_sent = 0;
1863 uint cancel_flags;
1864 int committed;
1865 xfs_prid_t prid;
1866 struct xfs_dquot *udqp, *gdqp;
1867 uint resblks;
1868 int dm_di_mode;
1869 int namelen;
1870
1871 ASSERT(!*vpp);
1872 dir_vp = BHV_TO_VNODE(dir_bdp);
1873 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1874
1875 dp = XFS_BHVTOI(dir_bdp);
1876 mp = dp->i_mount;
1877
1878 dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
1879 namelen = VNAMELEN(dentry);
1880
1881 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1882 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1883 dir_vp, DM_RIGHT_NULL, NULL,
1884 DM_RIGHT_NULL, name, NULL,
1885 dm_di_mode, 0, 0);
1886
1887 if (error)
1888 return error;
1889 dm_event_sent = 1;
1890 }
1891
1892 if (XFS_FORCED_SHUTDOWN(mp))
1893 return XFS_ERROR(EIO);
1894
1895 /* Return through std_return after this point. */
1896
1897 udqp = gdqp = NULL;
1898 if (vap->va_mask & XFS_AT_PROJID)
1899 prid = (xfs_prid_t)vap->va_projid;
1900 else
1901 prid = (xfs_prid_t)dfltprid;
1902
1903 /*
1904 * Make sure that we have allocated dquot(s) on disk.
1905 */
1906 error = XFS_QM_DQVOPALLOC(mp, dp,
1907 current_fsuid(credp), current_fsgid(credp),
1908 XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1909 if (error)
1910 goto std_return;
1911
1912 ip = NULL;
1913 dp_joined_to_trans = B_FALSE;
1914
1915 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1916 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1917 resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1918 /*
1919 * Initially assume that the file does not exist and
1920 * reserve the resources for that case. If that is not
1921 * the case we'll drop the one we have and get a more
1922 * appropriate transaction later.
1923 */
1924 error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1925 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1926 if (error == ENOSPC) {
1927 resblks = 0;
1928 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1929 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1930 }
1931 if (error) {
1932 cancel_flags = 0;
1933 dp = NULL;
1934 goto error_return;
1935 }
1936
1937 xfs_ilock(dp, XFS_ILOCK_EXCL);
1938
1939 XFS_BMAP_INIT(&free_list, &first_block);
1940
1941 ASSERT(ip == NULL);
1942
1943 /*
1944 * Reserve disk quota and the inode.
1945 */
1946 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1947 if (error)
1948 goto error_return;
1949
1950 if (resblks == 0 &&
1951 (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
1952 goto error_return;
1953 rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1954 error = xfs_dir_ialloc(&tp, dp,
1955 MAKEIMODE(vap->va_type,vap->va_mode), 1,
1956 rdev, credp, prid, resblks > 0,
1957 &ip, &committed);
1958 if (error) {
1959 if (error == ENOSPC)
1960 goto error_return;
1961 goto abort_return;
1962 }
1963 ITRACE(ip);
1964
1965 /*
1966 * At this point, we've gotten a newly allocated inode.
1967 * It is locked (and joined to the transaction).
1968 */
1969
1970 ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1971
1972 /*
1973 * Now we join the directory inode to the transaction.
1974 * We do not do it earlier because xfs_dir_ialloc
1975 * might commit the previous transaction (and release
1976 * all the locks).
1977 */
1978
1979 VN_HOLD(dir_vp);
1980 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1981 dp_joined_to_trans = B_TRUE;
1982
1983 error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1984 &first_block, &free_list,
1985 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1986 if (error) {
1987 ASSERT(error != ENOSPC);
1988 goto abort_return;
1989 }
1990 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1991 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1992
1993 /*
1994 * If this is a synchronous mount, make sure that the
1995 * create transaction goes to disk before returning to
1996 * the user.
1997 */
1998 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1999 xfs_trans_set_sync(tp);
2000 }
2001
2002 dp->i_gen++;
2003
2004 /*
2005 * Attach the dquot(s) to the inodes and modify them incore.
2006	 * The new inode's ids couldn't have changed, since it
2007	 * has been locked ever since it was created.
2008 */
2009 XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2010
2011 /*
2012 * xfs_trans_commit normally decrements the vnode ref count
2013 * when it unlocks the inode. Since we want to return the
2014 * vnode to the caller, we bump the vnode ref count now.
2015 */
2016 IHOLD(ip);
2017 vp = XFS_ITOV(ip);
2018
2019 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2020 if (error) {
2021 xfs_bmap_cancel(&free_list);
2022 goto abort_rele;
2023 }
2024
2025 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2026 if (error) {
2027 IRELE(ip);
2028 tp = NULL;
2029 goto error_return;
2030 }
2031
2032 XFS_QM_DQRELE(mp, udqp);
2033 XFS_QM_DQRELE(mp, gdqp);
2034
2035 /*
2036	 * Propagate the fact that the vnode changed after the
2037 * xfs_inode locks have been released.
2038 */
2039 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2040
2041 *vpp = vp;
2042
2043	/* Fall through to std_return with error = 0 */
2044
2045std_return:
2046 if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2047 DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2048 DM_EVENT_POSTCREATE)) {
2049 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2050 dir_vp, DM_RIGHT_NULL,
2051 *vpp ? vp:NULL,
2052 DM_RIGHT_NULL, name, NULL,
2053 dm_di_mode, error, 0);
2054 }
2055 return error;
2056
2057 abort_return:
2058 cancel_flags |= XFS_TRANS_ABORT;
2059 /* FALLTHROUGH */
2060 error_return:
2061
2062 if (tp != NULL)
2063 xfs_trans_cancel(tp, cancel_flags);
2064
2065 if (!dp_joined_to_trans && (dp != NULL))
2066 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2067 XFS_QM_DQRELE(mp, udqp);
2068 XFS_QM_DQRELE(mp, gdqp);
2069
2070 goto std_return;
2071
2072 abort_rele:
2073 /*
2074 * Wait until after the current transaction is aborted to
2075 * release the inode. This prevents recursive transactions
2076 * and deadlocks from xfs_inactive.
2077 */
2078 cancel_flags |= XFS_TRANS_ABORT;
2079 xfs_trans_cancel(tp, cancel_flags);
2080 IRELE(ip);
2081
2082 XFS_QM_DQRELE(mp, udqp);
2083 XFS_QM_DQRELE(mp, gdqp);
2084
2085 goto std_return;
2086}
2087
2088#ifdef DEBUG
2089/*
2090 * Some counters to see if (and how often) we are hitting some deadlock
2091 * prevention code paths.
2092 */
2093
2094int xfs_rm_locks;
2095int xfs_rm_lock_delays;
2096int xfs_rm_attempts;
2097#endif
2098
2099/*
2100 * The following routine will lock the inodes associated with the
2101 * directory and the named entry in the directory. The locks are
2102 * acquired in increasing inode number.
2103 *
2104 * If the entry is "..", then only the directory is locked. The
2105 * vnode ref count will still include that from the .. entry in
2106 * this case.
2107 *
2108 * There is a deadlock we need to worry about. If the locked directory is
2109 * in the AIL, it might be blocking up the log. The next inode we lock
2110 * could already be locked by another thread waiting for log space (e.g.
2111 * a permanent log reservation with a long running transaction (see
2112 * xfs_itruncate_finish)). To solve this, we must check if the directory
2113 * is in the AIL and use lock_nowait. If we can't lock, we need to
2114 * drop the inode lock on the directory and try again. xfs_iunlock will
2115 * potentially push the tail if we were holding up the log.
2116 */
2117STATIC int
2118xfs_lock_dir_and_entry(
2119 xfs_inode_t *dp,
2120 vname_t *dentry,
2121 xfs_inode_t *ip) /* inode of entry 'name' */
2122{
2123 int attempts;
2124 xfs_ino_t e_inum;
2125 xfs_inode_t *ips[2];
2126 xfs_log_item_t *lp;
2127
2128#ifdef DEBUG
2129 xfs_rm_locks++;
2130#endif
2131 attempts = 0;
2132
2133again:
2134 xfs_ilock(dp, XFS_ILOCK_EXCL);
2135
2136 e_inum = ip->i_ino;
2137
2138 ITRACE(ip);
2139
2140 /*
2141 * We want to lock in increasing inum. Since we've already
2142 * acquired the lock on the directory, we may need to release
2143	 * it if the inum of the entry turns out to be less.
2144 */
2145 if (e_inum > dp->i_ino) {
2146 /*
2147 * We are already in the right order, so just
2148 * lock on the inode of the entry.
2149 * We need to use nowait if dp is in the AIL.
2150 */
2151
2152 lp = (xfs_log_item_t *)dp->i_itemp;
2153 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2154 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2155 attempts++;
2156#ifdef DEBUG
2157 xfs_rm_attempts++;
2158#endif
2159
2160 /*
2161 * Unlock dp and try again.
2162 * xfs_iunlock will try to push the tail
2163 * if the inode is in the AIL.
2164 */
2165
2166 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2167
2168 if ((attempts % 5) == 0) {
2169 delay(1); /* Don't just spin the CPU */
2170#ifdef DEBUG
2171 xfs_rm_lock_delays++;
2172#endif
2173 }
2174 goto again;
2175 }
2176 } else {
2177 xfs_ilock(ip, XFS_ILOCK_EXCL);
2178 }
2179 } else if (e_inum < dp->i_ino) {
2180 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2181
2182 ips[0] = ip;
2183 ips[1] = dp;
2184 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2185 }
2186 /* else e_inum == dp->i_ino */
2187	/* This can happen if we're asked to lock /x/..:
2188	 * the entry is "..", which is also the parent directory.
2189	 */
2190
2191 return 0;
2192}
2193
2194#ifdef DEBUG
2195int xfs_locked_n;
2196int xfs_small_retries;
2197int xfs_middle_retries;
2198int xfs_lots_retries;
2199int xfs_lock_delays;
2200#endif
2201
2202/*
2203 * The following routine will lock n inodes in exclusive mode.
2204 * We assume the caller calls us with the inodes in i_ino order.
2205 *
2206 * We need to detect deadlock where an inode that we lock
2207 * is in the AIL and we start waiting for another inode that is locked
2208 * by a thread in a long running transaction (such as truncate). This can
2209 * result in deadlock since the long running trans might need to wait
2210 * for the inode we just locked in order to push the tail and free space
2211 * in the log.
2212 */
2213void
2214xfs_lock_inodes(
2215 xfs_inode_t **ips,
2216 int inodes,
2217 int first_locked,
2218 uint lock_mode)
2219{
2220 int attempts = 0, i, j, try_lock;
2221 xfs_log_item_t *lp;
2222
2223 ASSERT(ips && (inodes >= 2)); /* we need at least two */
2224
2225 if (first_locked) {
2226 try_lock = 1;
2227 i = 1;
2228 } else {
2229 try_lock = 0;
2230 i = 0;
2231 }
2232
2233again:
2234 for (; i < inodes; i++) {
2235 ASSERT(ips[i]);
2236
2237 if (i && (ips[i] == ips[i-1])) /* Already locked */
2238 continue;
2239
2240 /*
2241 * If try_lock is not set yet, make sure all locked inodes
2242 * are not in the AIL.
2243 * If any are, set try_lock to be used later.
2244 */
2245
2246 if (!try_lock) {
2247 for (j = (i - 1); j >= 0 && !try_lock; j--) {
2248 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2249 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2250 try_lock++;
2251 }
2252 }
2253 }
2254
2255 /*
2256 * If any of the previous locks we have locked is in the AIL,
2257 * we must TRY to get the second and subsequent locks. If
2258 * we can't get any, we must release all we have
2259 * and try again.
2260 */
2261
2262 if (try_lock) {
2263			/*
2264			 * try_lock means we have an inode locked that is
2265			 * in the AIL. It can only be set once a previous
2266			 * inode has been locked, so it must be 0 when i is 0.
2267			 */
2268 ASSERT(i != 0);
2269 if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2270 attempts++;
2271
2272 /*
2273 * Unlock all previous guys and try again.
2274 * xfs_iunlock will try to push the tail
2275 * if the inode is in the AIL.
2276 */
2277
2278				for (j = i - 1; j >= 0; j--) {
2279
2280 /*
2281 * Check to see if we've already
2282 * unlocked this one.
2283 * Not the first one going back,
2284 * and the inode ptr is the same.
2285 */
2286 if ((j != (i - 1)) && ips[j] ==
2287 ips[j+1])
2288 continue;
2289
2290 xfs_iunlock(ips[j], lock_mode);
2291 }
2292
2293 if ((attempts % 5) == 0) {
2294 delay(1); /* Don't just spin the CPU */
2295#ifdef DEBUG
2296 xfs_lock_delays++;
2297#endif
2298 }
2299 i = 0;
2300 try_lock = 0;
2301 goto again;
2302 }
2303 } else {
2304 xfs_ilock(ips[i], lock_mode);
2305 }
2306 }
2307
2308#ifdef DEBUG
2309 if (attempts) {
2310 if (attempts < 5) xfs_small_retries++;
2311 else if (attempts < 100) xfs_middle_retries++;
2312 else xfs_lots_retries++;
2313 } else {
2314 xfs_locked_n++;
2315 }
2316#endif
2317}
2318
2319#ifdef DEBUG
2320#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);}
2321int remove_which_error_return = 0;
2322#else /* ! DEBUG */
2323#define REMOVE_DEBUG_TRACE(x)
2324#endif /* ! DEBUG */
2325
2326
2327/*
2328 * xfs_remove
2329 *
2330 */
2331STATIC int
2332xfs_remove(
2333 bhv_desc_t *dir_bdp,
2334 vname_t *dentry,
2335 cred_t *credp)
2336{
2337 vnode_t *dir_vp;
2338 char *name = VNAME(dentry);
2339 xfs_inode_t *dp, *ip;
2340 xfs_trans_t *tp = NULL;
2341 xfs_mount_t *mp;
2342 int error = 0;
2343 xfs_bmap_free_t free_list;
2344 xfs_fsblock_t first_block;
2345 int cancel_flags;
2346 int committed;
2347 int dm_di_mode = 0;
2348 int link_zero;
2349 uint resblks;
2350 int namelen;
2351
2352 dir_vp = BHV_TO_VNODE(dir_bdp);
2353 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2354
2355 dp = XFS_BHVTOI(dir_bdp);
2356 mp = dp->i_mount;
2357
2358 if (XFS_FORCED_SHUTDOWN(mp))
2359 return XFS_ERROR(EIO);
2360
2361 namelen = VNAMELEN(dentry);
2362
2363 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2364 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2365 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2366 name, NULL, 0, 0, 0);
2367 if (error)
2368 return error;
2369 }
2370
2371 /* From this point on, return through std_return */
2372 ip = NULL;
2373
2374 /*
2375 * We need to get a reference to ip before we get our log
2376 * reservation. The reason for this is that we cannot call
2377 * xfs_iget for an inode for which we do not have a reference
2378 * once we've acquired a log reservation. This is because the
2379 * inode we are trying to get might be in xfs_inactive going
2380 * for a log reservation. Since we'll have to wait for the
2381 * inactive code to complete before returning from xfs_iget,
2382 * we need to make sure that we don't have log space reserved
2383	 * when we call xfs_iget. Instead we get an unlocked reference
2384 * to the inode before getting our log reservation.
2385 */
2386 error = xfs_get_dir_entry(dentry, &ip);
2387 if (error) {
2388 REMOVE_DEBUG_TRACE(__LINE__);
2389 goto std_return;
2390 }
2391
2392 dm_di_mode = ip->i_d.di_mode;
2393
2394 vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2395
2396 ITRACE(ip);
2397
2398 error = XFS_QM_DQATTACH(mp, dp, 0);
2399 if (!error && dp != ip)
2400 error = XFS_QM_DQATTACH(mp, ip, 0);
2401 if (error) {
2402 REMOVE_DEBUG_TRACE(__LINE__);
2403 IRELE(ip);
2404 goto std_return;
2405 }
2406
2407 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2408 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2409 /*
2410 * We try to get the real space reservation first,
2411 * allowing for directory btree deletion(s) implying
2412 * possible bmap insert(s). If we can't get the space
2413 * reservation then we use 0 instead, and avoid the bmap
2414 * btree insert(s) in the directory code by, if the bmap
2415 * insert tries to happen, instead trimming the LAST
2416 * block from the directory.
2417 */
2418 resblks = XFS_REMOVE_SPACE_RES(mp);
2419 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2420 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2421 if (error == ENOSPC) {
2422 resblks = 0;
2423 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2424 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2425 }
2426 if (error) {
2427 ASSERT(error != ENOSPC);
2428 REMOVE_DEBUG_TRACE(__LINE__);
2429 xfs_trans_cancel(tp, 0);
2430 IRELE(ip);
2431 return error;
2432 }
2433
2434 error = xfs_lock_dir_and_entry(dp, dentry, ip);
2435 if (error) {
2436 REMOVE_DEBUG_TRACE(__LINE__);
2437 xfs_trans_cancel(tp, cancel_flags);
2438 IRELE(ip);
2439 goto std_return;
2440 }
2441
2442 /*
2443 * At this point, we've gotten both the directory and the entry
2444 * inodes locked.
2445 */
2446 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2447 if (dp != ip) {
2448 /*
2449 * Increment vnode ref count only in this case since
2450 * there's an extra vnode reference in the case where
2451 * dp == ip.
2452 */
2453 IHOLD(dp);
2454 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2455 }
2456
2457 /*
2458 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2459 */
2460 XFS_BMAP_INIT(&free_list, &first_block);
2461 error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2462 &first_block, &free_list, 0);
2463 if (error) {
2464 ASSERT(error != ENOENT);
2465 REMOVE_DEBUG_TRACE(__LINE__);
2466 goto error1;
2467 }
2468 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2469
2470 dp->i_gen++;
2471 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2472
2473 error = xfs_droplink(tp, ip);
2474 if (error) {
2475 REMOVE_DEBUG_TRACE(__LINE__);
2476 goto error1;
2477 }
2478
2479 /* Determine if this is the last link while
2480 * we are in the transaction.
2481 */
2482	link_zero = (ip->i_d.di_nlink == 0);
2483
2484 /*
2485 * Take an extra ref on the inode so that it doesn't
2486 * go to xfs_inactive() from within the commit.
2487 */
2488 IHOLD(ip);
2489
2490 /*
2491 * If this is a synchronous mount, make sure that the
2492 * remove transaction goes to disk before returning to
2493 * the user.
2494 */
2495 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2496 xfs_trans_set_sync(tp);
2497 }
2498
2499 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2500 if (error) {
2501 REMOVE_DEBUG_TRACE(__LINE__);
2502 goto error_rele;
2503 }
2504
2505 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2506 if (error) {
2507 IRELE(ip);
2508 goto std_return;
2509 }
2510
2511 /*
2512 * Before we drop our extra reference to the inode, purge it
2513 * from the refcache if it is there. By waiting until afterwards
2514 * to do the IRELE, we ensure that we won't go inactive in the
2515 * xfs_refcache_purge_ip routine (although that would be OK).
2516 */
2517 xfs_refcache_purge_ip(ip);
2518
2519 vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2520
2521 /*
2522 * Let interposed file systems know about removed links.
2523 */
2524 VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2525
2526 IRELE(ip);
2527
2528/* Fall through to std_return with error = 0 */
2529 std_return:
2530 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2531 DM_EVENT_POSTREMOVE)) {
2532 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2533 dir_vp, DM_RIGHT_NULL,
2534 NULL, DM_RIGHT_NULL,
2535 name, NULL, dm_di_mode, error, 0);
2536 }
2537 return error;
2538
2539 error1:
2540 xfs_bmap_cancel(&free_list);
2541 cancel_flags |= XFS_TRANS_ABORT;
2542 xfs_trans_cancel(tp, cancel_flags);
2543 goto std_return;
2544
2545 error_rele:
2546 /*
2547 * In this case make sure to not release the inode until after
2548 * the current transaction is aborted. Releasing it beforehand
2549 * can cause us to go to xfs_inactive and start a recursive
2550 * transaction which can easily deadlock with the current one.
2551 */
2552 xfs_bmap_cancel(&free_list);
2553 cancel_flags |= XFS_TRANS_ABORT;
2554 xfs_trans_cancel(tp, cancel_flags);
2555
2556 /*
2557 * Before we drop our extra reference to the inode, purge it
2558 * from the refcache if it is there. By waiting until afterwards
2559 * to do the IRELE, we ensure that we won't go inactive in the
2560 * xfs_refcache_purge_ip routine (although that would be OK).
2561 */
2562 xfs_refcache_purge_ip(ip);
2563
2564 IRELE(ip);
2565
2566 goto std_return;
2567}
2568
2569
2570/*
2571 * xfs_link
2572 *
2573 */
2574STATIC int
2575xfs_link(
2576 bhv_desc_t *target_dir_bdp,
2577 vnode_t *src_vp,
2578 vname_t *dentry,
2579 cred_t *credp)
2580{
2581 xfs_inode_t *tdp, *sip;
2582 xfs_trans_t *tp;
2583 xfs_mount_t *mp;
2584 xfs_inode_t *ips[2];
2585 int error;
2586 xfs_bmap_free_t free_list;
2587 xfs_fsblock_t first_block;
2588 int cancel_flags;
2589 int committed;
2590 vnode_t *target_dir_vp;
2591 bhv_desc_t *src_bdp;
2592 int resblks;
2593 char *target_name = VNAME(dentry);
2594 int target_namelen;
2595
2596 target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2597 vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2598 vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2599
2600 target_namelen = VNAMELEN(dentry);
2601 if (src_vp->v_type == VDIR)
2602 return XFS_ERROR(EPERM);
2603
2604 /*
2605 * For now, manually find the XFS behavior descriptor for
2606 * the source vnode. If it doesn't exist then something
2607 * is wrong and we should just return an error.
2608 * Eventually we need to figure out how link is going to
2609 * work in the face of stacked vnodes.
2610 */
2611 src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
2612 if (src_bdp == NULL) {
2613 return XFS_ERROR(EXDEV);
2614 }
2615 sip = XFS_BHVTOI(src_bdp);
2616 tdp = XFS_BHVTOI(target_dir_bdp);
2617 mp = tdp->i_mount;
2618 if (XFS_FORCED_SHUTDOWN(mp))
2619 return XFS_ERROR(EIO);
2620
2621 if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2622 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2623 target_dir_vp, DM_RIGHT_NULL,
2624 src_vp, DM_RIGHT_NULL,
2625 target_name, NULL, 0, 0, 0);
2626 if (error)
2627 return error;
2628 }
2629
2630 /* Return through std_return after this point. */
2631
2632 error = XFS_QM_DQATTACH(mp, sip, 0);
2633 if (!error && sip != tdp)
2634 error = XFS_QM_DQATTACH(mp, tdp, 0);
2635 if (error)
2636 goto std_return;
2637
2638 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2639 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2640 resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2641 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2642 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2643 if (error == ENOSPC) {
2644 resblks = 0;
2645 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2646 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2647 }
2648 if (error) {
2649 cancel_flags = 0;
2650 goto error_return;
2651 }
2652
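	/*
	 * Lock the two inodes in increasing inode number order, as
	 * xfs_lock_inodes expects its callers to hand it inodes in
	 * i_ino order.
	 */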
2653 if (sip->i_ino < tdp->i_ino) {
2654 ips[0] = sip;
2655 ips[1] = tdp;
2656 } else {
2657 ips[0] = tdp;
2658 ips[1] = sip;
2659 }
2660
2661 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2662
2663 /*
2664 * Increment vnode ref counts since xfs_trans_commit &
2665 * xfs_trans_cancel will both unlock the inodes and
2666 * decrement the associated ref counts.
2667 */
2668 VN_HOLD(src_vp);
2669 VN_HOLD(target_dir_vp);
2670 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2671 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2672
2673 /*
2674 * If the source has too many links, we can't make any more to it.
2675 */
2676 if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2677 error = XFS_ERROR(EMLINK);
2678 goto error_return;
2679 }
2680
2681 if (resblks == 0 &&
2682 (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2683 target_namelen)))
2684 goto error_return;
2685
2686 XFS_BMAP_INIT(&free_list, &first_block);
2687
2688 error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2689 sip->i_ino, &first_block, &free_list,
2690 resblks);
2691 if (error)
2692 goto abort_return;
2693 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2694 tdp->i_gen++;
2695 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2696
2697 error = xfs_bumplink(tp, sip);
2698 if (error) {
2699 goto abort_return;
2700 }
2701
2702 /*
2703 * If this is a synchronous mount, make sure that the
2704 * link transaction goes to disk before returning to
2705 * the user.
2706 */
2707 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2708 xfs_trans_set_sync(tp);
2709 }
2710
2711 error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2712 if (error) {
2713 xfs_bmap_cancel(&free_list);
2714 goto abort_return;
2715 }
2716
2717 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2718 if (error) {
2719 goto std_return;
2720 }
2721
2722 /* Fall through to std_return with error = 0. */
2723std_return:
2724 if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2725 DM_EVENT_POSTLINK)) {
2726 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2727 target_dir_vp, DM_RIGHT_NULL,
2728 src_vp, DM_RIGHT_NULL,
2729 target_name, NULL, 0, error, 0);
2730 }
2731 return error;
2732
2733 abort_return:
2734 cancel_flags |= XFS_TRANS_ABORT;
2735 /* FALLTHROUGH */
2736 error_return:
2737 xfs_trans_cancel(tp, cancel_flags);
2738
2739 goto std_return;
2740}
2741/*
2742 * xfs_mkdir
2743 *
2744 */
2745STATIC int
2746xfs_mkdir(
2747 bhv_desc_t *dir_bdp,
2748 vname_t *dentry,
2749 vattr_t *vap,
2750 vnode_t **vpp,
2751 cred_t *credp)
2752{
2753 char *dir_name = VNAME(dentry);
2754 xfs_inode_t *dp;
2755 xfs_inode_t *cdp; /* inode of created dir */
2756 vnode_t *cvp; /* vnode of created dir */
2757 xfs_trans_t *tp;
2758 xfs_mount_t *mp;
2759 int cancel_flags;
2760 int error;
2761 int committed;
2762 xfs_bmap_free_t free_list;
2763 xfs_fsblock_t first_block;
2764 vnode_t *dir_vp;
2765 boolean_t dp_joined_to_trans;
2766 boolean_t created = B_FALSE;
2767 int dm_event_sent = 0;
2768 xfs_prid_t prid;
2769 struct xfs_dquot *udqp, *gdqp;
2770 uint resblks;
2771 int dm_di_mode;
2772 int dir_namelen;
2773
2774 dir_vp = BHV_TO_VNODE(dir_bdp);
2775 dp = XFS_BHVTOI(dir_bdp);
2776 mp = dp->i_mount;
2777
2778 if (XFS_FORCED_SHUTDOWN(mp))
2779 return XFS_ERROR(EIO);
2780
2781 dir_namelen = VNAMELEN(dentry);
2782
2783 tp = NULL;
2784 dp_joined_to_trans = B_FALSE;
2785 dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
2786
2787 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2788 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2789 dir_vp, DM_RIGHT_NULL, NULL,
2790 DM_RIGHT_NULL, dir_name, NULL,
2791 dm_di_mode, 0, 0);
2792 if (error)
2793 return error;
2794 dm_event_sent = 1;
2795 }
2796
2797 /* Return through std_return after this point. */
2798
2799 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2800
2801 mp = dp->i_mount;
2802 udqp = gdqp = NULL;
2803 if (vap->va_mask & XFS_AT_PROJID)
2804 prid = (xfs_prid_t)vap->va_projid;
2805 else
2806 prid = (xfs_prid_t)dfltprid;
2807
2808 /*
2809 * Make sure that we have allocated dquot(s) on disk.
2810 */
2811 error = XFS_QM_DQVOPALLOC(mp, dp,
2812 current_fsuid(credp), current_fsgid(credp),
2813 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2814 if (error)
2815 goto std_return;
2816
2817 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2818 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2819 resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2820 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2821 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2822 if (error == ENOSPC) {
2823 resblks = 0;
2824 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2825 XFS_TRANS_PERM_LOG_RES,
2826 XFS_MKDIR_LOG_COUNT);
2827 }
2828 if (error) {
2829 cancel_flags = 0;
2830 dp = NULL;
2831 goto error_return;
2832 }
2833
2834 xfs_ilock(dp, XFS_ILOCK_EXCL);
2835
2836 /*
2837 * Check for directory link count overflow.
2838 */
2839 if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2840 error = XFS_ERROR(EMLINK);
2841 goto error_return;
2842 }
2843
2844 /*
2845 * Reserve disk quota and the inode.
2846 */
2847 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2848 if (error)
2849 goto error_return;
2850
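	/*
	 * Check for ability to enter directory entry, if no space reserved.
	 */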
2851 if (resblks == 0 &&
2852 (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2853 goto error_return;
2854 /*
2855 * create the directory inode.
2856 */
2857 error = xfs_dir_ialloc(&tp, dp,
2858 MAKEIMODE(vap->va_type,vap->va_mode), 2,
2859 0, credp, prid, resblks > 0,
2860 &cdp, NULL);
2861 if (error) {
2862 if (error == ENOSPC)
2863 goto error_return;
2864 goto abort_return;
2865 }
2866 ITRACE(cdp);
2867
2868 /*
2869 * Now we add the directory inode to the transaction.
2870 * We waited until now since xfs_dir_ialloc might start
2871 * a new transaction. Had we joined the transaction
2872 * earlier, the locks might have gotten released.
2873 */
2874 VN_HOLD(dir_vp);
2875 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2876 dp_joined_to_trans = B_TRUE;
2877
2878 XFS_BMAP_INIT(&free_list, &first_block);
2879
2880 error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2881 cdp->i_ino, &first_block, &free_list,
2882 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2883 if (error) {
2884 ASSERT(error != ENOSPC);
2885 goto error1;
2886 }
2887 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2888
2889 /*
2890 * Bump the in memory version number of the parent directory
2891 * so that other processes accessing it will recognize that
2892 * the directory has changed.
2893 */
2894 dp->i_gen++;
2895
2896 error = XFS_DIR_INIT(mp, tp, cdp, dp);
2897 if (error) {
2898 goto error2;
2899 }
2900
2901 cdp->i_gen = 1;
2902 error = xfs_bumplink(tp, dp);
2903 if (error) {
2904 goto error2;
2905 }
2906
2907 cvp = XFS_ITOV(cdp);
2908
2909 created = B_TRUE;
2910
2911 *vpp = cvp;
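	/*
	 * xfs_trans_commit normally decrements the vnode ref count
	 * when it unlocks the inode. Since we want to return the
	 * vnode to the caller, we bump the vnode ref count now.
	 */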
2912 IHOLD(cdp);
2913
2914 /*
2915 * Attach the dquots to the new inode and modify the icount incore.
2916 */
2917 XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2918
2919 /*
2920 * If this is a synchronous mount, make sure that the
2921 * mkdir transaction goes to disk before returning to
2922 * the user.
2923 */
2924 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2925 xfs_trans_set_sync(tp);
2926 }
2927
2928 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2929 if (error) {
2930 IRELE(cdp);
2931 goto error2;
2932 }
2933
2934 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2935 XFS_QM_DQRELE(mp, udqp);
2936 XFS_QM_DQRELE(mp, gdqp);
2937 if (error) {
2938 IRELE(cdp);
2939 }
2940
2941 /* Fall through to std_return with error = 0 or errno from
2942 * xfs_trans_commit. */
2943
2944std_return:
2945 if ( (created || (error != 0 && dm_event_sent != 0)) &&
2946 DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2947 DM_EVENT_POSTCREATE)) {
2948 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2949 dir_vp, DM_RIGHT_NULL,
2950 created ? XFS_ITOV(cdp):NULL,
2951 DM_RIGHT_NULL,
2952 dir_name, NULL,
2953 dm_di_mode, error, 0);
2954 }
2955 return error;
2956
2957 error2:
2958 error1:
2959 xfs_bmap_cancel(&free_list);
2960 abort_return:
2961 cancel_flags |= XFS_TRANS_ABORT;
2962 error_return:
2963 xfs_trans_cancel(tp, cancel_flags);
2964 XFS_QM_DQRELE(mp, udqp);
2965 XFS_QM_DQRELE(mp, gdqp);
2966
2967 if (!dp_joined_to_trans && (dp != NULL)) {
2968 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2969 }
2970
2971 goto std_return;
2972}
2973
2974
2975/*
2976 * xfs_rmdir
2977 *
2978 */
2979STATIC int
2980xfs_rmdir(
2981 bhv_desc_t *dir_bdp,
2982 vname_t *dentry,
2983 cred_t *credp)
2984{
2985 char *name = VNAME(dentry);
2986 xfs_inode_t *dp;
2987 xfs_inode_t *cdp; /* child directory */
2988 xfs_trans_t *tp;
2989 xfs_mount_t *mp;
2990 int error;
2991 xfs_bmap_free_t free_list;
2992 xfs_fsblock_t first_block;
2993 int cancel_flags;
2994 int committed;
2995 vnode_t *dir_vp;
2996 int dm_di_mode = 0;
2997 int last_cdp_link;
2998 int namelen;
2999 uint resblks;
3000
3001 dir_vp = BHV_TO_VNODE(dir_bdp);
3002 dp = XFS_BHVTOI(dir_bdp);
3003 mp = dp->i_mount;
3004
3005 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3006
3007 if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3008 return XFS_ERROR(EIO);
3009 namelen = VNAMELEN(dentry);
3010
3011 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3012 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3013 dir_vp, DM_RIGHT_NULL,
3014 NULL, DM_RIGHT_NULL,
3015 name, NULL, 0, 0, 0);
3016 if (error)
3017 return XFS_ERROR(error);
3018 }
3019
3020 /* Return through std_return after this point. */
3021
3022 cdp = NULL;
3023
3024 /*
3025 * We need to get a reference to cdp before we get our log
3026 * reservation. The reason for this is that we cannot call
3027 * xfs_iget for an inode for which we do not have a reference
3028 * once we've acquired a log reservation. This is because the
3029 * inode we are trying to get might be in xfs_inactive going
3030 * for a log reservation. Since we'll have to wait for the
3031 * inactive code to complete before returning from xfs_iget,
3032 * we need to make sure that we don't have log space reserved
3033	 * when we call xfs_iget. Instead we get an unlocked reference
3034 * to the inode before getting our log reservation.
3035 */
3036 error = xfs_get_dir_entry(dentry, &cdp);
3037 if (error) {
3038 REMOVE_DEBUG_TRACE(__LINE__);
3039 goto std_return;
3040 }
3041 mp = dp->i_mount;
3042 dm_di_mode = cdp->i_d.di_mode;
3043
3044 /*
3045 * Get the dquots for the inodes.
3046 */
3047 error = XFS_QM_DQATTACH(mp, dp, 0);
3048 if (!error && dp != cdp)
3049 error = XFS_QM_DQATTACH(mp, cdp, 0);
3050 if (error) {
3051 IRELE(cdp);
3052 REMOVE_DEBUG_TRACE(__LINE__);
3053 goto std_return;
3054 }
3055
3056 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3057 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3058 /*
3059 * We try to get the real space reservation first,
3060 * allowing for directory btree deletion(s) implying
3061 * possible bmap insert(s). If we can't get the space
3062 * reservation then we use 0 instead, and avoid the bmap
3063 * btree insert(s) in the directory code by, if the bmap
3064 * insert tries to happen, instead trimming the LAST
3065 * block from the directory.
3066 */
3067 resblks = XFS_REMOVE_SPACE_RES(mp);
3068 error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3069 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3070 if (error == ENOSPC) {
3071 resblks = 0;
3072 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3073 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3074 }
3075 if (error) {
3076 ASSERT(error != ENOSPC);
3077 cancel_flags = 0;
3078 IRELE(cdp);
3079 goto error_return;
3080 }
3081 XFS_BMAP_INIT(&free_list, &first_block);
3082
3083 /*
3084 * Now lock the child directory inode and the parent directory
3085 * inode in the proper order. This will take care of validating
3086 * that the directory entry for the child directory inode has
3087 * not changed while we were obtaining a log reservation.
3088 */
3089 error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3090 if (error) {
3091 xfs_trans_cancel(tp, cancel_flags);
3092 IRELE(cdp);
3093 goto std_return;
3094 }
3095
3096 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3097 if (dp != cdp) {
3098 /*
3099 * Only increment the parent directory vnode count if
3100 * we didn't bump it in looking up cdp. The only time
3101 * we don't bump it is when we're looking up ".".
3102 */
3103 VN_HOLD(dir_vp);
3104 }
3105
3106 ITRACE(cdp);
3107 xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3108
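	/*
	 * An empty directory holds only "." and "..", giving it a
	 * link count of exactly 2; a higher count, or any remaining
	 * entries, means it cannot be removed.
	 */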
3109 ASSERT(cdp->i_d.di_nlink >= 2);
3110 if (cdp->i_d.di_nlink != 2) {
3111 error = XFS_ERROR(ENOTEMPTY);
3112 goto error_return;
3113 }
3114 if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3115 error = XFS_ERROR(ENOTEMPTY);
3116 goto error_return;
3117 }
3118
3119 error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3120 &first_block, &free_list, resblks);
3121 if (error) {
3122 goto error1;
3123 }
3124
3125 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3126
3127 /*
3128 * Bump the in memory generation count on the parent
3129	 * directory so that others can know that it has changed.
3130 */
3131 dp->i_gen++;
3132
3133 /*
3134 * Drop the link from cdp's "..".
3135 */
3136 error = xfs_droplink(tp, dp);
3137 if (error) {
3138 goto error1;
3139 }
3140
3141 /*
3142 * Drop the link from dp to cdp.
3143 */
3144 error = xfs_droplink(tp, cdp);
3145 if (error) {
3146 goto error1;
3147 }
3148
3149 /*
3150 * Drop the "." link from cdp to self.
3151 */
3152 error = xfs_droplink(tp, cdp);
3153 if (error) {
3154 goto error1;
3155 }
3156
3157 /* Determine these before committing transaction */
3158	last_cdp_link = (cdp->i_d.di_nlink == 0);
3159
3160 /*
3161 * Take an extra ref on the child vnode so that it
3162 * does not go to xfs_inactive() from within the commit.
3163 */
3164 IHOLD(cdp);
3165
3166 /*
3167 * If this is a synchronous mount, make sure that the
3168 * rmdir transaction goes to disk before returning to
3169 * the user.
3170 */
3171 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3172 xfs_trans_set_sync(tp);
3173 }
3174
3175 error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3176 if (error) {
3177 xfs_bmap_cancel(&free_list);
3178 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3179 XFS_TRANS_ABORT));
3180 IRELE(cdp);
3181 goto std_return;
3182 }
3183
3184 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3185 if (error) {
3186 IRELE(cdp);
3187 goto std_return;
3188 }
3189
3190
3191 /*
3192 * Let interposed file systems know about removed links.
3193 */
3194 VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3195
3196 IRELE(cdp);
3197
3198 /* Fall through to std_return with error = 0 or the errno
3199 * from xfs_trans_commit. */
3200std_return:
3201 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3202 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3203 dir_vp, DM_RIGHT_NULL,
3204 NULL, DM_RIGHT_NULL,
3205 name, NULL, dm_di_mode,
3206 error, 0);
3207 }
3208 return error;
3209
3210 error1:
3211 xfs_bmap_cancel(&free_list);
3212 cancel_flags |= XFS_TRANS_ABORT;
3213 error_return:
3214 xfs_trans_cancel(tp, cancel_flags);
3215 goto std_return;
3216}
3217
3218
3219/*
3220 * xfs_readdir
3221 *
3222 * Read dp's entries starting at uiop->uio_offset and translate them into
3223 * bufsize bytes worth of struct dirents starting at bufbase.
3224 */
3225STATIC int
3226xfs_readdir(
3227 bhv_desc_t *dir_bdp,
3228 uio_t *uiop,
3229 cred_t *credp,
3230 int *eofp)
3231{
3232 xfs_inode_t *dp;
3233 xfs_trans_t *tp = NULL;
3234 int error = 0;
3235 uint lock_mode;
3236 xfs_off_t start_offset;
3237
3238 vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3239 (inst_t *)__return_address);
3240 dp = XFS_BHVTOI(dir_bdp);
3241
3242 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3243 return XFS_ERROR(EIO);
3244 }
3245
3246 lock_mode = xfs_ilock_map_shared(dp);
3247 start_offset = uiop->uio_offset;
3248 error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
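	/*
	 * If getdents advanced the offset we handed back at least
	 * one entry, so update the directory's access time.
	 */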
3249 if (start_offset != uiop->uio_offset) {
3250 xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
3251 }
3252 xfs_iunlock_map_shared(dp, lock_mode);
3253 return error;
3254}
3255
3256
3257/*
3258 * xfs_symlink
3259 *
3260 */
3261STATIC int
3262xfs_symlink(
3263 bhv_desc_t *dir_bdp,
3264 vname_t *dentry,
3265 vattr_t *vap,
3266 char *target_path,
3267 vnode_t **vpp,
3268 cred_t *credp)
3269{
3270 xfs_trans_t *tp;
3271 xfs_mount_t *mp;
3272 xfs_inode_t *dp;
3273 xfs_inode_t *ip;
3274 int error;
3275 int pathlen;
3276 xfs_bmap_free_t free_list;
3277 xfs_fsblock_t first_block;
3278 boolean_t dp_joined_to_trans;
3279 vnode_t *dir_vp;
3280 uint cancel_flags;
3281 int committed;
3282 xfs_fileoff_t first_fsb;
3283 xfs_filblks_t fs_blocks;
3284 int nmaps;
3285 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
3286 xfs_daddr_t d;
3287 char *cur_chunk;
3288 int byte_cnt;
3289 int n;
3290 xfs_buf_t *bp;
3291 xfs_prid_t prid;
3292 struct xfs_dquot *udqp, *gdqp;
3293 uint resblks;
3294 char *link_name = VNAME(dentry);
3295 int link_namelen;
3296
3297 *vpp = NULL;
3298 dir_vp = BHV_TO_VNODE(dir_bdp);
3299 dp = XFS_BHVTOI(dir_bdp);
3300 dp_joined_to_trans = B_FALSE;
3301 error = 0;
3302 ip = NULL;
3303 tp = NULL;
3304
3305 vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3306
3307 mp = dp->i_mount;
3308
3309 if (XFS_FORCED_SHUTDOWN(mp))
3310 return XFS_ERROR(EIO);
3311
3312 link_namelen = VNAMELEN(dentry);
3313
3314 /*
3315 * Check component lengths of the target path name.
3316 */
3317 pathlen = strlen(target_path);
3318 if (pathlen >= MAXPATHLEN) /* total string too long */
3319 return XFS_ERROR(ENAMETOOLONG);
3320 if (pathlen >= MAXNAMELEN) { /* is any component too long? */
3321 int len, total;
3322 char *path;
3323
3324		for (total = 0, path = target_path; total < pathlen;) {
3325			/*
3326			 * Skip any slashes.
3327			 */
3328			while (*path == '/') {
3329 total++;
3330 path++;
3331 }
3332
3333 /*
3334 * Count up to the next slash or end of path.
3335 * Error out if the component is bigger than MAXNAMELEN.
3336 */
3337			for (len = 0; *path != '/' && total < pathlen; total++, path++) {
3338 if (++len >= MAXNAMELEN) {
3339 error = ENAMETOOLONG;
3340 return error;
3341 }
3342 }
3343 }
3344 }
3345
3346 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3347 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3348 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3349 link_name, target_path, 0, 0, 0);
3350 if (error)
3351 return error;
3352 }
3353
3354 /* Return through std_return after this point. */
3355
3356 udqp = gdqp = NULL;
3357 if (vap->va_mask & XFS_AT_PROJID)
3358 prid = (xfs_prid_t)vap->va_projid;
3359 else
3360 prid = (xfs_prid_t)dfltprid;
3361
3362 /*
3363 * Make sure that we have allocated dquot(s) on disk.
3364 */
3365 error = XFS_QM_DQVOPALLOC(mp, dp,
3366 current_fsuid(credp), current_fsgid(credp),
3367 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3368 if (error)
3369 goto std_return;
3370
3371 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3372 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3373 /*
3374	 * Will the symlink fit into the inode data fork?
3375	 * There can't be any attributes yet, so we get the whole variable part.
3376 */
3377 if (pathlen <= XFS_LITINO(mp))
3378 fs_blocks = 0;
3379 else
3380 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3381 resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3382 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3383 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3384 if (error == ENOSPC && fs_blocks == 0) {
3385 resblks = 0;
3386 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3387 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3388 }
3389 if (error) {
3390 cancel_flags = 0;
3391 dp = NULL;
3392 goto error_return;
3393 }
3394
3395 xfs_ilock(dp, XFS_ILOCK_EXCL);
3396
3397 /*
3398 * Check whether the directory allows new symlinks or not.
3399 */
3400 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3401 error = XFS_ERROR(EPERM);
3402 goto error_return;
3403 }
3404
3405 /*
3406 * Reserve disk quota : blocks and inode.
3407 */
3408 error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3409 if (error)
3410 goto error_return;
3411
3412 /*
3413 * Check for ability to enter directory entry, if no space reserved.
3414 */
3415 if (resblks == 0 &&
3416 (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3417 goto error_return;
3418 /*
3419 * Initialize the bmap freelist prior to calling either
3420 * bmapi or the directory create code.
3421 */
3422 XFS_BMAP_INIT(&free_list, &first_block);
3423
3424 /*
3425 * Allocate an inode for the symlink.
3426 */
3427 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3428 1, 0, credp, prid, resblks > 0, &ip, NULL);
3429 if (error) {
3430 if (error == ENOSPC)
3431 goto error_return;
3432 goto error1;
3433 }
3434 ITRACE(ip);
3435
3436 VN_HOLD(dir_vp);
3437 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3438 dp_joined_to_trans = B_TRUE;
3439
3440 /*
3441 * Also attach the dquot(s) to it, if applicable.
3442 */
3443 XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3444
3445 if (resblks)
3446 resblks -= XFS_IALLOC_SPACE_RES(mp);
3447 /*
3448 * If the symlink will fit into the inode, write it inline.
3449 */
3450 if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3451 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3452 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3453 ip->i_d.di_size = pathlen;
3454
3455 /*
3456 * The inode was initially created in extent format.
3457 */
3458 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3459 ip->i_df.if_flags |= XFS_IFINLINE;
3460
3461 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3462 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3463
3464 } else {
3465 first_fsb = 0;
3466 nmaps = SYMLINK_MAPS;
3467
3468 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3469 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3470 &first_block, resblks, mval, &nmaps,
3471 &free_list);
3472 if (error) {
3473 goto error1;
3474 }
3475
3476 if (resblks)
3477 resblks -= fs_blocks;
3478 ip->i_d.di_size = pathlen;
3479 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3480
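		/*
		 * Copy the target path into the newly allocated
		 * blocks, one mapping at a time, logging each
		 * buffer as it is filled.
		 */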
3481 cur_chunk = target_path;
3482 for (n = 0; n < nmaps; n++) {
3483 d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3484 byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3485 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3486 BTOBB(byte_cnt), 0);
3487 ASSERT(bp && !XFS_BUF_GETERROR(bp));
3488 if (pathlen < byte_cnt) {
3489 byte_cnt = pathlen;
3490 }
3491 pathlen -= byte_cnt;
3492
3493 memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3494 cur_chunk += byte_cnt;
3495
3496 xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3497 }
3498 }
3499
3500 /*
3501 * Create the directory entry for the symlink.
3502 */
3503 error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3504 ip->i_ino, &first_block, &free_list, resblks);
3505 if (error) {
3506 goto error1;
3507 }
3508 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3509 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3510
3511 /*
3512 * Bump the in memory version number of the parent directory
3513 * so that other processes accessing it will recognize that
3514 * the directory has changed.
3515 */
3516 dp->i_gen++;
3517
3518 /*
3519 * If this is a synchronous mount, make sure that the
3520 * symlink transaction goes to disk before returning to
3521 * the user.
3522 */
3523 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3524 xfs_trans_set_sync(tp);
3525 }
3526
3527 /*
3528 * xfs_trans_commit normally decrements the vnode ref count
3529 * when it unlocks the inode. Since we want to return the
3530 * vnode to the caller, we bump the vnode ref count now.
3531 */
3532 IHOLD(ip);
3533
3534 error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3535 if (error) {
3536 goto error2;
3537 }
3538 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3539 XFS_QM_DQRELE(mp, udqp);
3540 XFS_QM_DQRELE(mp, gdqp);
3541
3542 /* Fall through to std_return with error = 0 or errno from
3543 * xfs_trans_commit */
3544std_return:
3545 if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3546 DM_EVENT_POSTSYMLINK)) {
3547 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3548 dir_vp, DM_RIGHT_NULL,
3549 error ? NULL : XFS_ITOV(ip),
3550 DM_RIGHT_NULL, link_name, target_path,
3551 0, error, 0);
3552 }
3553
3554 if (!error) {
3555 vnode_t *vp;
3556
3557 ASSERT(ip);
3558 vp = XFS_ITOV(ip);
3559 *vpp = vp;
3560 }
3561 return error;
3562
3563 error2:
3564 IRELE(ip);
3565 error1:
3566 xfs_bmap_cancel(&free_list);
3567 cancel_flags |= XFS_TRANS_ABORT;
3568 error_return:
3569 xfs_trans_cancel(tp, cancel_flags);
3570 XFS_QM_DQRELE(mp, udqp);
3571 XFS_QM_DQRELE(mp, gdqp);
3572
3573 if (!dp_joined_to_trans && (dp != NULL)) {
3574 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3575 }
3576
3577 goto std_return;
3578}
3579
3580
3581/*
3582 * xfs_fid2
3583 *
3584 * A fid routine that takes a pointer to a previously allocated
3585 * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3586 */
3587STATIC int
3588xfs_fid2(
3589 bhv_desc_t *bdp,
3590 fid_t *fidp)
3591{
3592 xfs_inode_t *ip;
3593 xfs_fid2_t *xfid;
3594
3595 vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3596 (inst_t *)__return_address);
3597 ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3598
3599 xfid = (xfs_fid2_t *)fidp;
3600 ip = XFS_BHVTOI(bdp);
3601 xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3602 xfid->fid_pad = 0;
3603 /*
3604	 * use memcpy because the inode number is a long long and there's no
3605 * assurance that xfid->fid_ino is properly aligned.
3606 */
3607 memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3608 xfid->fid_gen = ip->i_d.di_gen;
3609
3610 return 0;
3611}
3612
3613
3614/*
3615 * xfs_rwlock
3616 */
3617int
3618xfs_rwlock(
3619 bhv_desc_t *bdp,
3620 vrwlock_t locktype)
3621{
3622 xfs_inode_t *ip;
3623 vnode_t *vp;
3624
3625 vp = BHV_TO_VNODE(bdp);
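	/*
	 * There is nothing to do for directories; the iolock taken
	 * here is only used to serialize regular file I/O.
	 */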
3626 if (vp->v_type == VDIR)
3627 return 1;
3628 ip = XFS_BHVTOI(bdp);
3629 if (locktype == VRWLOCK_WRITE) {
3630 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3631 } else if (locktype == VRWLOCK_TRY_READ) {
3632 return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
3633 } else if (locktype == VRWLOCK_TRY_WRITE) {
3634 return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
3635 } else {
3636 ASSERT((locktype == VRWLOCK_READ) ||
3637 (locktype == VRWLOCK_WRITE_DIRECT));
3638 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3639 }
3640
3641 return 1;
3642}
3643
3644
3645/*
3646 * xfs_rwunlock
3647 */
3648void
3649xfs_rwunlock(
3650 bhv_desc_t *bdp,
3651 vrwlock_t locktype)
3652{
3653 xfs_inode_t *ip;
3654 vnode_t *vp;
3655
3656 vp = BHV_TO_VNODE(bdp);
3657 if (vp->v_type == VDIR)
3658 return;
3659 ip = XFS_BHVTOI(bdp);
3660 if (locktype == VRWLOCK_WRITE) {
3661 /*
3662 * In the write case, we may have added a new entry to
3663		 * the reference cache. That entry may have stored, in
3664		 * this inode, a pointer to another inode that needs
3665		 * releasing. If it is there, clear the pointer and
3666		 * release that inode after unlocking this one.
3667 */
3668 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3669 } else {
3670 ASSERT((locktype == VRWLOCK_READ) ||
3671 (locktype == VRWLOCK_WRITE_DIRECT));
3672 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3673 }
3674 return;
3675}
3676
3677STATIC int
3678xfs_inode_flush(
3679 bhv_desc_t *bdp,
3680 int flags)
3681{
3682 xfs_inode_t *ip;
3683 xfs_mount_t *mp;
3684 xfs_inode_log_item_t *iip;
3685 int error = 0;
3686
3687 ip = XFS_BHVTOI(bdp);
3688 mp = ip->i_mount;
3689 iip = ip->i_itemp;
3690
3691 if (XFS_FORCED_SHUTDOWN(mp))
3692 return XFS_ERROR(EIO);
3693
3694 /*
3695 * Bypass inodes which have already been cleaned by
3696 * the inode flush clustering code inside xfs_iflush
3697 */
3698 if ((ip->i_update_core == 0) &&
3699 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3700 return 0;
3701
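	/*
	 * FLUSH_LOG: if the inode's last logged change is already
	 * on disk (at or before the last log sync LSN) there is
	 * nothing to do; otherwise force the log up to that LSN,
	 * synchronously if FLUSH_SYNC is also set.
	 */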
3702 if (flags & FLUSH_LOG) {
3703 if (iip && iip->ili_last_lsn) {
3704 xlog_t *log = mp->m_log;
3705 xfs_lsn_t sync_lsn;
3706 int s, log_flags = XFS_LOG_FORCE;
3707
3708 s = GRANT_LOCK(log);
3709 sync_lsn = log->l_last_sync_lsn;
3710 GRANT_UNLOCK(log, s);
3711
3712 if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3713 return 0;
3714
3715 if (flags & FLUSH_SYNC)
3716 log_flags |= XFS_LOG_SYNC;
3717 return xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3718 }
3719 }
3720
3721 /*
3722	 * We make this non-blocking if the inode is contended, and
3723	 * return EAGAIN to indicate to the caller that we did not
3724	 * succeed. This prevents the flush path from blocking on
3725	 * inodes that are inside another operation right now;
3726	 * they get caught later by xfs_sync.
3727 */
3728 if (flags & FLUSH_INODE) {
3729 int flush_flags;
3730
3731 if (xfs_ipincount(ip))
3732 return EAGAIN;
3733
3734 if (flags & FLUSH_SYNC) {
3735 xfs_ilock(ip, XFS_ILOCK_SHARED);
3736 xfs_iflock(ip);
3737 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3738 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3739 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3740 return EAGAIN;
3741 }
3742 } else {
3743 return EAGAIN;
3744 }
3745
3746 if (flags & FLUSH_SYNC)
3747 flush_flags = XFS_IFLUSH_SYNC;
3748 else
3749 flush_flags = XFS_IFLUSH_ASYNC;
3750
3751 error = xfs_iflush(ip, flush_flags);
3752 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3753 }
3754
3755 return error;
3756}
3757
3758
3759int
3760xfs_set_dmattrs (
3761 bhv_desc_t *bdp,
3762 u_int evmask,
3763 u_int16_t state,
3764 cred_t *credp)
3765{
3766 xfs_inode_t *ip;
3767 xfs_trans_t *tp;
3768 xfs_mount_t *mp;
3769 int error;
3770
3771 if (!capable(CAP_SYS_ADMIN))
3772 return XFS_ERROR(EPERM);
3773
3774 ip = XFS_BHVTOI(bdp);
3775 mp = ip->i_mount;
3776
3777 if (XFS_FORCED_SHUTDOWN(mp))
3778 return XFS_ERROR(EIO);
3779
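	/*
	 * A simple transaction: log just the inode core with the
	 * new DMAPI event mask and state.
	 */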
3780 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3781 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3782 if (error) {
3783 xfs_trans_cancel(tp, 0);
3784 return error;
3785 }
3786 xfs_ilock(ip, XFS_ILOCK_EXCL);
3787 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3788
3789 ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3790 ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state;
3791
3792 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3793 IHOLD(ip);
3794 error = xfs_trans_commit(tp, 0, NULL);
3795
3796 return error;
3797}
3798
3799
3800/*
3801 * xfs_reclaim
3802 */
3803STATIC int
3804xfs_reclaim(
3805 bhv_desc_t *bdp)
3806{
3807 xfs_inode_t *ip;
3808 vnode_t *vp;
3809
3810 vp = BHV_TO_VNODE(bdp);
3811 ip = XFS_BHVTOI(bdp);
3812
3813 vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3814
3815 ASSERT(!VN_MAPPED(vp));
3816
3817 /* bad inode, get out here ASAP */
3818 if (VN_BAD(vp)) {
3819 xfs_ireclaim(ip);
3820 return 0;
3821 }
3822
3823 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3824 if (ip->i_d.di_size > 0) {
3825 /*
3826 * Flush and invalidate any data left around that is
3827 * a part of this file.
3828 *
3829 * Get the inode's i/o lock so that buffers are pushed
3830 * out while holding the proper lock. We can't hold
3831 * the inode lock here since flushing out buffers may
3832 * cause us to try to get the lock in xfs_strategy().
3833 *
3834 * We don't have to call remapf() here, because there
3835 * cannot be any mapped file references to this vnode
3836 * since it is being reclaimed.
3837 */
3838 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3839
3840 /*
3841 * If we hit an IO error, we need to make sure that the
3842 * buffer and page caches of file data for
3843 * the file are tossed away. We don't want to use
3844 * VOP_FLUSHINVAL_PAGES here because we don't want dirty
3845 * pages to stay attached to the vnode, but be
3846 * marked P_BAD. pdflush/vnode_pagebad
3847 * hates that.
3848 */
3849 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3850 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
3851 } else {
3852 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3853 }
3854
3855 ASSERT(VN_CACHED(vp) == 0);
3856 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
3857 ip->i_delayed_blks == 0);
3858 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3859 } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3860 /*
3861 * di_size field may not be quite accurate if we're
3862 * shutting down.
3863 */
3864 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3865 ASSERT(VN_CACHED(vp) == 0);
3866 }
3867 }
3868
3869 /* If we have nothing to flush with this inode then complete the
3870 * teardown now, otherwise break the link between the xfs inode
3871 * and the linux inode and clean up the xfs inode later. This
3872 * avoids flushing the inode to disk during the delete operation
3873 * itself.
3874 */
3875 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3876 xfs_ilock(ip, XFS_ILOCK_EXCL);
3877 xfs_iflock(ip);
3878 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3879 } else {
3880 xfs_mount_t *mp = ip->i_mount;
3881
3882 /* Protect sync from us */
3883 XFS_MOUNT_ILOCK(mp);
3884 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3885 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3886 ip->i_flags |= XFS_IRECLAIMABLE;
3887 XFS_MOUNT_IUNLOCK(mp);
3888 }
3889 return 0;
3890}
3891
3892int
3893xfs_finish_reclaim(
3894 xfs_inode_t *ip,
3895 int locked,
3896 int sync_mode)
3897{
3898 xfs_ihash_t *ih = ip->i_hash;
3899 vnode_t *vp = XFS_ITOV_NULL(ip);
3900 int error;
3901
3902 if (vp && VN_BAD(vp))
3903 goto reclaim;
3904
3905 /* The hash lock here protects a thread in xfs_iget_core from
3906 * racing with us on linking the inode back with a vnode.
3907 * Once we have the XFS_IRECLAIM flag set it will not touch
3908 * us.
3909 */
3910 write_lock(&ih->ih_lock);
3911 if ((ip->i_flags & XFS_IRECLAIM) ||
3912 (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
3913 write_unlock(&ih->ih_lock);
3914 if (locked) {
3915 xfs_ifunlock(ip);
3916 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3917 }
3918		return 1;
3919 }
3920 ip->i_flags |= XFS_IRECLAIM;
3921 write_unlock(&ih->ih_lock);
3922
3923 /*
3924 * If the inode is still dirty, then flush it out. If the inode
3925 * is not in the AIL, then it will be OK to flush it delwri as
3926 * long as xfs_iflush() does not keep any references to the inode.
3927 * We leave that decision up to xfs_iflush() since it has the
3928 * knowledge of whether it's OK to simply do a delwri flush of
3929 * the inode or whether we need to wait until the inode is
3930 * pulled from the AIL.
3931 * We get the flush lock regardless, though, just to make sure
3932 * we don't free it while it is being flushed.
3933 */
3934 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3935 if (!locked) {
3936 xfs_ilock(ip, XFS_ILOCK_EXCL);
3937 xfs_iflock(ip);
3938 }
3939
3940 if (ip->i_update_core ||
3941 ((ip->i_itemp != NULL) &&
3942 (ip->i_itemp->ili_format.ilf_fields != 0))) {
3943 error = xfs_iflush(ip, sync_mode);
3944 /*
3945 * If we hit an error, typically because of filesystem
3946			 * shutdown, we don't need to let vn_reclaim know
3947			 * because we're going to reclaim the inode anyway.
3948 */
3949 if (error) {
3950 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3951 goto reclaim;
3952 }
3953 xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3954 }
3955
3956 ASSERT(ip->i_update_core == 0);
3957 ASSERT(ip->i_itemp == NULL ||
3958 ip->i_itemp->ili_format.ilf_fields == 0);
3959 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3960 } else if (locked) {
3961 /*
3962 * We are not interested in doing an iflush if we're
3963 * in the process of shutting down the filesystem forcibly.
3964 * So, just reclaim the inode.
3965 */
3966 xfs_ifunlock(ip);
3967 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3968 }
3969
3970 reclaim:
3971 xfs_ireclaim(ip);
3972 return 0;
3973}
3974
3975int
3976xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3977{
3978 int purged;
3979 xfs_inode_t *ip, *n;
3980 int done = 0;
3981
3982 while (!done) {
3983 purged = 0;
3984 XFS_MOUNT_ILOCK(mp);
3985 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
3986 if (noblock) {
3987 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3988 continue;
3989 if (xfs_ipincount(ip) ||
3990 !xfs_iflock_nowait(ip)) {
3991 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3992 continue;
3993 }
3994 }
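			/*
			 * Drop the mount lock while reclaiming this
			 * inode; the list can change underneath us,
			 * so restart the scan from the top.
			 */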
3995 XFS_MOUNT_IUNLOCK(mp);
3996 xfs_finish_reclaim(ip, noblock,
3997 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
3998 purged = 1;
3999 break;
4000 }
4001
4002 done = !purged;
4003 }
4004
4005 XFS_MOUNT_IUNLOCK(mp);
4006 return 0;
4007}
4008
4009/*
4010 * xfs_alloc_file_space()
4011 * This routine allocates disk space for the given file.
4012 *
4013 * If alloc_type == 0, this request is for an ALLOCSP type
4014 * request which will change the file size. In this case, no
4015 * DMAPI event will be generated by the call. A TRUNCATE event
4016 * will be generated later by xfs_setattr.
4017 *
4018 * If alloc_type != 0, this request is for a RESVSP type
4019 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
4020 * lower block boundary byte address is less than the file's
4021 * length.
4022 *
4023 * RETURNS:
4024 * 0 on success
4025 * errno on error
4026 *
4027 */
int
xfs_alloc_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			alloc_type,
	int			attr_flags)
{
	xfs_filblks_t		allocated_fsb;
	xfs_filblks_t		allocatesize_fsb;
	int			committed;
	xfs_off_t		count;
	xfs_filblks_t		datablocks;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		*imapp;
	xfs_bmbt_irec_t		imaps[1];
	xfs_mount_t		*mp;
	int			numrtextents;
	int			reccount;
	uint			resblks;
	int			rt;
	int			rtextsize;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			xfs_bmapi_flags;

	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
	mp = ip->i_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/*
	 * determine if this is a realtime file
	 */
	if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
		if (ip->i_d.di_extsize)
			rtextsize = ip->i_d.di_extsize;
		else
			rtextsize = mp->m_sb.sb_rextsize;
	} else
		rtextsize = 0;

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return error;

	if (len <= 0)
		return XFS_ERROR(EINVAL);

	count = len;
	error = 0;
	imapp = &imaps[0];
	reccount = 1;
	xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
	allocatesize_fsb = XFS_B_TO_FSB(mp, count);

	/* Generate a DMAPI event if needed. */
	if (alloc_type != 0 && offset < ip->i_d.di_size &&
	    (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
		xfs_off_t	end_dmi_offset;

		end_dmi_offset = offset + len;
		if (end_dmi_offset > ip->i_d.di_size)
			end_dmi_offset = ip->i_d.di_size;
		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
				offset, end_dmi_offset - offset,
				0, NULL);
		if (error)
			return error;
	}

	/*
	 * allocate file space until done or until there is an error
	 */
retry:
	while (allocatesize_fsb && !error) {
		/*
		 * Determine whether we are reserving space on the data
		 * or realtime device.  Realtime allocations must cover
		 * whole realtime extents, so the range is rounded out
		 * to rtextsize boundaries first.
		 */
		if (rt) {
			xfs_fileoff_t	s, e;

			s = startoffset_fsb;
			do_div(s, rtextsize);
			s *= rtextsize;
			e = roundup_64(startoffset_fsb + allocatesize_fsb,
				rtextsize);
			numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
			datablocks = 0;
		} else {
			datablocks = allocatesize_fsb;
			numrtextents = 0;
		}

		/*
		 * allocate and set up the transaction
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
		error = xfs_trans_reserve(tp,
					  resblks,
					  XFS_WRITE_LOG_RES(mp),
					  numrtextents,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_WRITE_LOG_COUNT);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp,
				ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * issue the bmapi() call to allocate the blocks
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bmapi(tp, ip, startoffset_fsb,
				  allocatesize_fsb, xfs_bmapi_flags,
				  &firstfsb, 0, imapp, &reccount,
				  &free_list);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error) {
			break;
		}

		allocated_fsb = imapp->br_blockcount;

		if (reccount == 0) {
			error = XFS_ERROR(ENOSPC);
			break;
		}

		startoffset_fsb += allocated_fsb;
		allocatesize_fsb -= allocated_fsb;
	}
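/*
 * On ENOSPC, let a DMAPI application try to make space: if the
 * DM_EVENT_NOSPACE event is delivered successfully, the allocation is
 * retried from where it stopped.
 */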
dmapi_enospc_check:
	if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {

		error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
				XFS_ITOV(ip), DM_RIGHT_NULL,
				XFS_ITOV(ip), DM_RIGHT_NULL,
				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
		if (error == 0)
			goto retry;	/* Maybe DMAPI app. has made space */
		/* else fall through with error from XFS_SEND_NAMESP */
	}

	return error;

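/*
 * error0 also tears down the partially built free list; error1 is
 * reached before any bmap work has started, so it only cancels the
 * transaction and drops the inode lock.
 */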
 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	goto dmapi_enospc_check;
}

/*
 * Zero file bytes between startoff and endoff inclusive.
 * The iolock is held exclusive and no blocks are buffered.
 */
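/*
 * The loop below is a block-at-a-time read-modify-write: each mapped,
 * written block overlapping the range is read into bp, the bytes of it
 * that fall inside [startoff, endoff] are cleared, and the buffer is
 * written back synchronously.  Holes and unwritten extents are skipped
 * since they already read back as zeroes.
 */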
STATIC int
xfs_zero_remaining_bytes(
	xfs_inode_t		*ip,
	xfs_off_t		startoff,
	xfs_off_t		endoff)
{
	xfs_bmbt_irec_t		imap;
	xfs_fileoff_t		offset_fsb;
	xfs_off_t		lastoffset;
	xfs_off_t		offset;
	xfs_buf_t		*bp;
	xfs_mount_t		*mp = ip->i_mount;
	int			nimap;
	int			error = 0;

	bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
				ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
				mp->m_rtdev_targp : mp->m_ddev_targp);

	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		nimap = 1;
		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
			&nimap, NULL);
		if (error || nimap < 1)
			break;
		ASSERT(imap.br_blockcount >= 1);
		ASSERT(imap.br_startoff == offset_fsb);
		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
		if (lastoffset > endoff)
			lastoffset = endoff;
		if (imap.br_startblock == HOLESTARTBLOCK)
			continue;
		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
		if (imap.br_state == XFS_EXT_UNWRITTEN)
			continue;
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNWRITE(bp);
		XFS_BUF_READ(bp);
		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
		xfsbdstrat(mp, bp);
		if ((error = xfs_iowait(bp))) {
			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
					  mp, bp, XFS_BUF_ADDR(bp));
			break;
		}
		memset(XFS_BUF_PTR(bp) +
			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
			0, lastoffset - offset + 1);
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNREAD(bp);
		XFS_BUF_WRITE(bp);
		xfsbdstrat(mp, bp);
		if ((error = xfs_iowait(bp))) {
			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
					  mp, bp, XFS_BUF_ADDR(bp));
			break;
		}
	}
	xfs_buf_free(bp);
	return error;
}

/*
 * xfs_free_file_space()
 *	This routine frees disk space for the given file.
 *
 *	This routine is only called by xfs_change_file_space
 *	for an UNRESVSP type call.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
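/*
 * The byte range is first invalidated from the page cache and the
 * partial blocks at each end are zeroed on disk; the whole blocks in
 * between are then unmapped one transaction at a time by the
 * xfs_bunmapi() loop below.
 */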
STATIC int
xfs_free_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	int			committed;
	int			done;
	xfs_off_t		end_dmi_offset;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_off_t		ilen;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_extlen_t		mod = 0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	int			rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			need_iolock = (attr_flags & ATTR_DMI) == 0;

	vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
	mp = ip->i_mount;

	if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
	end_dmi_offset = offset + len;
	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);

	if (offset < ip->i_d.di_size &&
	    (attr_flags & ATTR_DMI) == 0 &&
	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
		if (end_dmi_offset > ip->i_d.di_size)
			end_dmi_offset = ip->i_d.di_size;
		error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
				offset, end_dmi_offset - offset,
				AT_DELAY_FLAG(attr_flags), NULL);
		if (error)
			return error;
	}

	if (need_iolock)
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
	rounding = MAX(1 << mp->m_sb.sb_blocklog, NBPP);
	ilen = len + (offset & (rounding - 1));
	ioffset = offset & ~(rounding - 1);
	if (ilen & (rounding - 1))
		ilen = (ilen + rounding) & ~(rounding - 1);
	xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
	/*
	 * Need to zero the bytes we're not freeing, on disk.  If it's a
	 * realtime file and we can't use unwritten extents, then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
		nimap = 1;
		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
			&imap, &nimap, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		nimap = 1;
		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
			&imap, &nimap, NULL);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
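	/*
	 * If no whole blocks fall inside the range, there is nothing
	 * for xfs_bunmapi() to unmap: zero the affected bytes directly
	 * and mark the work done so the loop below is skipped.
	 */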
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {

		/*
		 * allocate and set up the transaction
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		error = xfs_trans_reserve(tp,
					  resblks,
					  XFS_WRITE_LOG_RES(mp),
					  0,
					  XFS_TRANS_PERM_LOG_RES,
					  XFS_WRITE_LOG_COUNT);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
				ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
				XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, startoffset_fsb,
				    endoffset_fsb - startoffset_fsb,
				    0, 2, &firstfsb, &free_list, &done);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 out_unlock_iolock:
	if (need_iolock)
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
		    XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_change_file_space()
 *	This routine allocates or frees disk space for the given file.
 *	The user-specified parameters are checked for alignment and size
 *	limitations.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 */
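/*
 * Illustrative userspace entry point (a sketch under the usual xfsctl
 * conventions, not code from this file; error handling omitted): the
 * space ioctls fill in an xfs_flock64_t and reach this routine through
 * xfs_ioctl(), roughly as in
 *
 *	xfs_flock64_t bf = { 0 };
 *	bf.l_whence = 0;		l_start is then an absolute offset
 *	bf.l_start = 0;
 *	bf.l_len = 16 << 20;		reserve 16MB of space
 *	ioctl(fd, XFS_IOC_RESVSP64, &bf);
 */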
int
xfs_change_file_space(
	bhv_desc_t	*bdp,
	int		cmd,
	xfs_flock64_t	*bf,
	xfs_off_t	offset,
	cred_t		*credp,
	int		attr_flags)
{
	int		clrprealloc;
	int		error;
	xfs_fsize_t	fsize;
	xfs_inode_t	*ip;
	xfs_mount_t	*mp;
	int		setprealloc;
	xfs_off_t	startoffset;
	xfs_off_t	llen;
	xfs_trans_t	*tp;
	vattr_t		va;
	vnode_t		*vp;

	vp = BHV_TO_VNODE(bdp);
	vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);

	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;

	/*
	 * must be a regular file and have write permission
	 */
	if (vp->v_type != VREG)
		return XFS_ERROR(EINVAL);

	xfs_ilock(ip, XFS_ILOCK_SHARED);

	if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return error;
	}

	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	switch (bf->l_whence) {
	case 0: /* SEEK_SET */
		break;
	case 1: /* SEEK_CUR */
		bf->l_start += offset;
		break;
	case 2: /* SEEK_END */
		bf->l_start += ip->i_d.di_size;
		break;
	default:
		return XFS_ERROR(EINVAL);
	}

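	/*
	 * l_len is a byte count; for a positive count, make llen the
	 * offset of the last byte of the range so the inclusive range
	 * checks below work on [l_start, l_start + llen].
	 */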
	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;

	if ((bf->l_start < 0) ||
	    (bf->l_start > XFS_MAXIOFFSET(mp)) ||
	    (bf->l_start + llen < 0) ||
	    (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
		return XFS_ERROR(EINVAL);

	bf->l_whence = 0;

	startoffset = bf->l_start;
	fsize = ip->i_d.di_size;

	/*
	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
	 * file space.  These calls do NOT zero the data space allocated
	 * to the file, nor do they change the file size.
	 *
	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
	 * space.  These calls cause the new file data to be zeroed and
	 * the file size to be changed.
	 */
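	/*
	 * For example (illustrative numbers only): on a 1MB file, a
	 * RESVSP of 4MB from offset 0 reserves blocks for the whole
	 * range but leaves the size at 1MB, while an ALLOCSP to 4MB
	 * grows the file to 4MB and zeroes the new bytes through the
	 * xfs_setattr() path below.
	 */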
	setprealloc = clrprealloc = 0;

	switch (cmd) {
	case XFS_IOC_RESVSP:
	case XFS_IOC_RESVSP64:
		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
								1, attr_flags);
		if (error)
			return error;
		setprealloc = 1;
		break;

	case XFS_IOC_UNRESVSP:
	case XFS_IOC_UNRESVSP64:
		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
								attr_flags)))
			return error;
		break;

	case XFS_IOC_ALLOCSP:
	case XFS_IOC_ALLOCSP64:
	case XFS_IOC_FREESP:
	case XFS_IOC_FREESP64:
		if (startoffset > fsize) {
			error = xfs_alloc_file_space(ip, fsize,
					startoffset - fsize, 0, attr_flags);
			if (error)
				break;
		}

		va.va_mask = XFS_AT_SIZE;
		va.va_size = startoffset;

		error = xfs_setattr(bdp, &va, attr_flags, credp);

		if (error)
			return error;

		clrprealloc = 1;
		break;

	default:
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}

	/*
	 * update the inode timestamp, mode, and prealloc flag bits
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);

	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
				       0, 0, 0))) {
		/* ASSERT(0); */
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);

	if ((attr_flags & ATTR_DMI) == 0) {
		ip->i_d.di_mode &= ~S_ISUID;

		/*
		 * Note that we don't have to worry about mandatory
		 * file locking being disabled here because we only
		 * clear the S_ISGID bit if the group execute bit is
		 * on, and mandatory locking is only enabled when that
		 * bit is off.
		 */
		if (ip->i_d.di_mode & S_IXGRP)
			ip->i_d.di_mode &= ~S_ISGID;

		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}
	if (setprealloc)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	else if (clrprealloc)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, 0, NULL);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	return error;
}

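/*
 * Vnode operations vector for XFS, registered on the vnode behavior
 * chain at the VNODE_POSITION_XFS position by BHV_IDENTITY_INIT().
 */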
vnodeops_t xfs_vnodeops = {
	BHV_IDENTITY_INIT(VN_BHV_XFS, VNODE_POSITION_XFS),
	.vop_open		= xfs_open,
	.vop_read		= xfs_read,
#ifdef HAVE_SENDFILE
	.vop_sendfile		= xfs_sendfile,
#endif
	.vop_write		= xfs_write,
	.vop_ioctl		= xfs_ioctl,
	.vop_getattr		= xfs_getattr,
	.vop_setattr		= xfs_setattr,
	.vop_access		= xfs_access,
	.vop_lookup		= xfs_lookup,
	.vop_create		= xfs_create,
	.vop_remove		= xfs_remove,
	.vop_link		= xfs_link,
	.vop_rename		= xfs_rename,
	.vop_mkdir		= xfs_mkdir,
	.vop_rmdir		= xfs_rmdir,
	.vop_readdir		= xfs_readdir,
	.vop_symlink		= xfs_symlink,
	.vop_readlink		= xfs_readlink,
	.vop_fsync		= xfs_fsync,
	.vop_inactive		= xfs_inactive,
	.vop_fid2		= xfs_fid2,
	.vop_rwlock		= xfs_rwlock,
	.vop_rwunlock		= xfs_rwunlock,
	.vop_bmap		= xfs_bmap,
	.vop_reclaim		= xfs_reclaim,
	.vop_attr_get		= xfs_attr_get,
	.vop_attr_set		= xfs_attr_set,
	.vop_attr_remove	= xfs_attr_remove,
	.vop_attr_list		= xfs_attr_list,
	.vop_link_removed	= (vop_link_removed_t)fs_noval,
	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
	.vop_tosspages		= fs_tosspages,
	.vop_flushinval_pages	= fs_flushinval_pages,
	.vop_flush_pages	= fs_flush_pages,
	.vop_release		= xfs_release,
	.vop_iflush		= xfs_inode_flush,
};