aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid Woodhouse <David.Woodhouse@intel.com>2009-01-05 04:50:33 -0500
committerDavid Woodhouse <David.Woodhouse@intel.com>2009-01-05 04:50:33 -0500
commit353816f43d1fb340ff2d9a911dd5d0799c09f6a5 (patch)
tree517290fd884d286fe2971137ac89f89e3567785a /fs
parent160bbab3000dafccbe43688e48208cecf4deb879 (diff)
parentfe0bdec68b77020281dc814805edfe594ae89e0f (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: arch/arm/mach-pxa/corgi.c arch/arm/mach-pxa/poodle.c arch/arm/mach-pxa/spitz.c
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c6
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_dentry.c9
-rw-r--r--fs/9p/vfs_inode.c10
-rw-r--r--fs/9p/vfs_super.c4
-rw-r--r--fs/Kconfig39
-rw-r--r--fs/Makefile5
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/affs/inode.c4
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/afs/proc.c4
-rw-r--r--fs/afs/server.c9
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/aio.c100
-rw-r--r--fs/anon_inodes.c11
-rw-r--r--fs/attr.c4
-rw-r--r--fs/autofs/inode.c4
-rw-r--r--fs/autofs4/dev-ioctl.c3
-rw-r--r--fs/autofs4/inode.c4
-rw-r--r--fs/autofs4/waitq.c4
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/binfmt_aout.c83
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/binfmt_elf_fdpic.c19
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/binfmt_som.c2
-rw-r--r--fs/bio-integrity.c2
-rw-r--r--fs/bio.c325
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/buffer.c23
-rw-r--r--fs/cifs/AUTHORS2
-rw-r--r--fs/cifs/CHANGES9
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README12
-rw-r--r--fs/cifs/cifs_dfs_ref.c48
-rw-r--r--fs/cifs/cifs_fs_sb.h7
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsencrypt.c30
-rw-r--r--fs/cifs/cifsencrypt.h3
-rw-r--r--fs/cifs/cifsfs.c82
-rw-r--r--fs/cifs/cifsfs.h3
-rw-r--r--fs/cifs/cifsglob.h15
-rw-r--r--fs/cifs/cifspdu.h2
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c52
-rw-r--r--fs/cifs/connect.c685
-rw-r--r--fs/cifs/dir.c21
-rw-r--r--fs/cifs/fcntl.c118
-rw-r--r--fs/cifs/file.c27
-rw-r--r--fs/cifs/inode.c66
-rw-r--r--fs/cifs/ioctl.c2
-rw-r--r--fs/cifs/misc.c13
-rw-r--r--fs/cifs/sess.c5
-rw-r--r--fs/cifs/smbdes.c5
-rw-r--r--fs/cifs/smbencrypt.c9
-rw-r--r--fs/cifs/transport.c378
-rw-r--r--fs/coda/cache.c6
-rw-r--r--fs/coda/file.c3
-rw-r--r--fs/coda/upcall.c2
-rw-r--r--fs/compat.c42
-rw-r--r--fs/dcache.c25
-rw-r--r--fs/dcookies.c28
-rw-r--r--fs/devpts/inode.c472
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/dquot.c4
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h3
-rw-r--r--fs/ecryptfs/inode.c3
-rw-r--r--fs/ecryptfs/kthread.c9
-rw-r--r--fs/ecryptfs/main.c3
-rw-r--r--fs/ecryptfs/messaging.c27
-rw-r--r--fs/ecryptfs/miscdev.c27
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/exec.c245
-rw-r--r--fs/exportfs/expfs.c4
-rw-r--r--fs/ext2/balloc.c2
-rw-r--r--fs/ext2/ialloc.c10
-rw-r--r--fs/ext2/inode.c7
-rw-r--r--fs/ext2/namei.c15
-rw-r--r--fs/ext3/balloc.c2
-rw-r--r--fs/ext3/ialloc.c10
-rw-r--r--fs/ext3/inode.c9
-rw-r--r--fs/ext3/namei.c18
-rw-r--r--fs/ext4/balloc.c6
-rw-r--r--fs/ext4/ext4_sb.h6
-rw-r--r--fs/ext4/ialloc.c10
-rw-r--r--fs/ext4/inode.c11
-rw-r--r--fs/ext4/namei.c17
-rw-r--r--fs/ext4/super.c8
-rw-r--r--fs/fat/dir.c1
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/inode.c6
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/fcntl.c18
-rw-r--r--fs/file_table.c20
-rw-r--r--fs/freevxfs/vxfs_inode.c4
-rw-r--r--fs/fuse/dev.c4
-rw-r--r--fs/fuse/dir.c23
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/gfs2/inode.c10
-rw-r--r--fs/gfs2/ops_address.c2
-rw-r--r--fs/hfs/inode.c4
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/inode.c4
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/namei.c24
-rw-r--r--fs/hpfs/super.c4
-rw-r--r--fs/hppfs/hppfs.c6
-rw-r--r--fs/hugetlbfs/inode.c21
-rw-r--r--fs/inode.c268
-rw-r--r--fs/internal.h6
-rw-r--r--fs/ioprio.c18
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/jfs/jfs_inode.c33
-rw-r--r--fs/jfs/namei.c24
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/clntlock.c23
-rw-r--r--fs/lockd/host.c18
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c6
-rw-r--r--fs/locks.c2
-rw-r--r--fs/minix/bitmap.c4
-rw-r--r--fs/namei.c138
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/ncpfs/ioctl.c91
-rw-r--r--fs/nfs/callback.c36
-rw-r--r--fs/nfs/client.c95
-rw-r--r--fs/nfs/delegation.c260
-rw-r--r--fs/nfs/delegation.h33
-rw-r--r--fs/nfs/dir.c24
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/mount_clnt.c34
-rw-r--r--fs/nfs/nfs4_fs.h32
-rw-r--r--fs/nfs/nfs4proc.c431
-rw-r--r--fs/nfs/nfs4renewd.c22
-rw-r--r--fs/nfs/nfs4state.c415
-rw-r--r--fs/nfs/nfs4xdr.c1235
-rw-r--r--fs/nfs/nfsroot.c33
-rw-r--r--fs/nfs/read.c6
-rw-r--r--fs/nfs/super.c50
-rw-r--r--fs/nfs_common/nfsacl.c4
-rw-r--r--fs/nfsctl.c10
-rw-r--r--fs/nfsd/auth.c95
-rw-r--r--fs/nfsd/nfs4callback.c9
-rw-r--r--fs/nfsd/nfs4recover.c72
-rw-r--r--fs/nfsd/nfs4state.c16
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfsfh.c11
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/Kconfig10
-rw-r--r--fs/notify/dnotify/Makefile1
-rw-r--r--fs/notify/dnotify/dnotify.c (renamed from fs/dnotify.c)3
-rw-r--r--fs/notify/inotify/Kconfig27
-rw-r--r--fs/notify/inotify/Makefile2
-rw-r--r--fs/notify/inotify/inotify.c (renamed from fs/inotify.c)2
-rw-r--r--fs/notify/inotify/inotify_user.c (renamed from fs/inotify_user.c)4
-rw-r--r--fs/ocfs2/cluster/netdebug.c8
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c29
-rw-r--r--fs/ocfs2/dlm/dlmfs.c8
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2_fs.h8
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/omfs/inode.c8
-rw-r--r--fs/open.c64
-rw-r--r--fs/pipe.c11
-rw-r--r--fs/posix_acl.c4
-rw-r--r--fs/proc/array.c32
-rw-r--r--fs/proc/base.c38
-rw-r--r--fs/proc/proc_devtree.c3
-rw-r--r--fs/proc/stat.c7
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/quota.c4
-rw-r--r--fs/ramfs/inode.c4
-rw-r--r--fs/reiserfs/inode.c17
-rw-r--r--fs/reiserfs/namei.c12
-rw-r--r--fs/seq_file.c27
-rw-r--r--fs/smbfs/dir.c3
-rw-r--r--fs/smbfs/file.c2
-rw-r--r--fs/smbfs/inode.c2
-rw-r--r--fs/smbfs/proc.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/sysv/ialloc.c4
-rw-r--r--fs/sysv/inode.c6
-rw-r--r--fs/ubifs/budget.c210
-rw-r--r--fs/ubifs/commit.c25
-rw-r--r--fs/ubifs/compress.c18
-rw-r--r--fs/ubifs/debug.c265
-rw-r--r--fs/ubifs/debug.h117
-rw-r--r--fs/ubifs/dir.c4
-rw-r--r--fs/ubifs/file.c17
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/key.h32
-rw-r--r--fs/ubifs/lprops.c14
-rw-r--r--fs/ubifs/lpt.c45
-rw-r--r--fs/ubifs/lpt_commit.c210
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/replay.c15
-rw-r--r--fs/ubifs/sb.c20
-rw-r--r--fs/ubifs/super.c255
-rw-r--r--fs/ubifs/tnc.c31
-rw-r--r--fs/ubifs/tnc_commit.c9
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h111
-rw-r--r--fs/udf/ialloc.c4
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/ufs/ialloc.c4
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/linux-2.6/sv.h22
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c66
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c87
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h30
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c189
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c226
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h82
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c849
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h214
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c122
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h13
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c50
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h65
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c884
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c762
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h55
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h77
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c145
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h72
-rw-r--r--fs/xfs/quota/xfs_dquot.c39
-rw-r--r--fs/xfs/quota/xfs_dquot.h4
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c45
-rw-r--r--fs/xfs/quota/xfs_qm.c57
-rw-r--r--fs/xfs/quota/xfs_qm.h3
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c5
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c151
-rw-r--r--fs/xfs/support/debug.c39
-rw-r--r--fs/xfs/support/debug.h2
-rw-r--r--fs/xfs/support/ktrace.c9
-rw-r--r--fs/xfs/xfs.h2
-rw-r--r--fs/xfs/xfs_acl.c8
-rw-r--r--fs/xfs/xfs_ag.h15
-rw-r--r--fs/xfs/xfs_alloc.c264
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c2387
-rw-r--r--fs/xfs/xfs_alloc_btree.h107
-rw-r--r--fs/xfs/xfs_arch.h39
-rw-r--r--fs/xfs/xfs_bit.h3
-rw-r--r--fs/xfs/xfs_bmap.c410
-rw-r--r--fs/xfs/xfs_bmap.h72
-rw-r--r--fs/xfs/xfs_bmap_btree.c2617
-rw-r--r--fs/xfs/xfs_bmap_btree.h171
-rw-r--r--fs/xfs/xfs_btree.c3596
-rw-r--r--fs/xfs/xfs_btree.h392
-rw-r--r--fs/xfs/xfs_btree_trace.c249
-rw-r--r--fs/xfs/xfs_btree_trace.h116
-rw-r--r--fs/xfs/xfs_buf_item.c45
-rw-r--r--fs/xfs/xfs_clnt.h105
-rw-r--r--fs/xfs/xfs_da_btree.h24
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dfrag.h2
-rw-r--r--fs/xfs/xfs_dinode.h148
-rw-r--r--fs/xfs/xfs_dir2_sf.h7
-rw-r--r--fs/xfs/xfs_dmops.c5
-rw-r--r--fs/xfs/xfs_error.c15
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_extfree_item.c45
-rw-r--r--fs/xfs/xfs_fs.h22
-rw-r--r--fs/xfs/xfs_fsops.c30
-rw-r--r--fs/xfs/xfs_ialloc.c449
-rw-r--r--fs/xfs/xfs_ialloc.h31
-rw-r--r--fs/xfs/xfs_ialloc_btree.c2193
-rw-r--r--fs/xfs/xfs_ialloc_btree.h111
-rw-r--r--fs/xfs/xfs_iget.c735
-rw-r--r--fs/xfs/xfs_imap.h40
-rw-r--r--fs/xfs/xfs_inode.c587
-rw-r--r--fs/xfs/xfs_inode.h377
-rw-r--r--fs/xfs/xfs_inode_item.c45
-rw-r--r--fs/xfs/xfs_inode_item.h41
-rw-r--r--fs/xfs/xfs_iomap.c28
-rw-r--r--fs/xfs/xfs_itable.c102
-rw-r--r--fs/xfs/xfs_itable.h14
-rw-r--r--fs/xfs/xfs_log.c81
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h48
-rw-r--r--fs/xfs/xfs_log_recover.c416
-rw-r--r--fs/xfs/xfs_mount.c81
-rw-r--r--fs/xfs/xfs_mount.h73
-rw-r--r--fs/xfs/xfs_qmops.c5
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_rename.c61
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rw.c2
-rw-r--r--fs/xfs/xfs_sb.h167
-rw-r--r--fs/xfs/xfs_trans.c22
-rw-r--r--fs/xfs/xfs_trans.h322
-rw-r--r--fs/xfs/xfs_trans_ail.c362
-rw-r--r--fs/xfs/xfs_trans_buf.c7
-rw-r--r--fs/xfs/xfs_trans_inode.c30
-rw-r--r--fs/xfs/xfs_trans_item.c10
-rw-r--r--fs/xfs/xfs_trans_priv.h98
-rw-r--r--fs/xfs/xfs_utils.c12
-rw-r--r--fs/xfs/xfs_vfsops.c757
-rw-r--r--fs/xfs/xfs_vfsops.h16
-rw-r--r--fs/xfs/xfs_vnodeops.c354
-rw-r--r--fs/xfs/xfs_vnodeops.h16
323 files changed, 15676 insertions, 16420 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 3031e3233dd..14d94420457 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,7 +45,7 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
45 struct v9fs_dentry *dent; 45 struct v9fs_dentry *dent;
46 46
47 P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n", 47 P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
48 fid->fid, dentry->d_iname); 48 fid->fid, dentry->d_name.name);
49 49
50 dent = dentry->d_fsdata; 50 dent = dentry->d_fsdata;
51 if (!dent) { 51 if (!dent) {
@@ -79,7 +79,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
79 struct p9_fid *fid, *ret; 79 struct p9_fid *fid, *ret;
80 80
81 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", 81 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
82 dentry->d_iname, dentry, uid, any); 82 dentry->d_name.name, dentry, uid, any);
83 dent = (struct v9fs_dentry *) dentry->d_fsdata; 83 dent = (struct v9fs_dentry *) dentry->d_fsdata;
84 ret = NULL; 84 ret = NULL;
85 if (dent) { 85 if (dent) {
@@ -120,7 +120,7 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
120 switch (access) { 120 switch (access) {
121 case V9FS_ACCESS_SINGLE: 121 case V9FS_ACCESS_SINGLE:
122 case V9FS_ACCESS_USER: 122 case V9FS_ACCESS_USER:
123 uid = current->fsuid; 123 uid = current_fsuid();
124 any = 0; 124 any = 0;
125 break; 125 break;
126 126
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 24eb01087b6..332b5ff02fe 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -160,7 +160,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
160 v9ses->flags |= V9FS_ACCESS_ANY; 160 v9ses->flags |= V9FS_ACCESS_ANY;
161 else { 161 else {
162 v9ses->flags |= V9FS_ACCESS_SINGLE; 162 v9ses->flags |= V9FS_ACCESS_SINGLE;
163 v9ses->uid = simple_strtol(s, &e, 10); 163 v9ses->uid = simple_strtoul(s, &e, 10);
164 if (*e != '\0') 164 if (*e != '\0')
165 v9ses->uid = ~0; 165 v9ses->uid = ~0;
166 } 166 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f9534f18df0..06dcc7c4f23 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -52,7 +52,8 @@
52 52
53static int v9fs_dentry_delete(struct dentry *dentry) 53static int v9fs_dentry_delete(struct dentry *dentry)
54{ 54{
55 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 55 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
56 dentry);
56 57
57 return 1; 58 return 1;
58} 59}
@@ -69,7 +70,8 @@ static int v9fs_dentry_delete(struct dentry *dentry)
69static int v9fs_cached_dentry_delete(struct dentry *dentry) 70static int v9fs_cached_dentry_delete(struct dentry *dentry)
70{ 71{
71 struct inode *inode = dentry->d_inode; 72 struct inode *inode = dentry->d_inode;
72 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 73 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
74 dentry);
73 75
74 if(!inode) 76 if(!inode)
75 return 1; 77 return 1;
@@ -88,7 +90,8 @@ void v9fs_dentry_release(struct dentry *dentry)
88 struct v9fs_dentry *dent; 90 struct v9fs_dentry *dent;
89 struct p9_fid *temp, *current_fid; 91 struct p9_fid *temp, *current_fid;
90 92
91 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 93 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
94 dentry);
92 dent = dentry->d_fsdata; 95 dent = dentry->d_fsdata;
93 if (dent) { 96 if (dent) {
94 list_for_each_entry_safe(current_fid, temp, &dent->fidlist, 97 list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8314d3f43b7..81f8bbf12f9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -215,8 +215,8 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
215 inode = new_inode(sb); 215 inode = new_inode(sb);
216 if (inode) { 216 if (inode) {
217 inode->i_mode = mode; 217 inode->i_mode = mode;
218 inode->i_uid = current->fsuid; 218 inode->i_uid = current_fsuid();
219 inode->i_gid = current->fsgid; 219 inode->i_gid = current_fsgid();
220 inode->i_blocks = 0; 220 inode->i_blocks = 0;
221 inode->i_rdev = 0; 221 inode->i_rdev = 0;
222 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 222 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -963,7 +963,8 @@ static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer,
963 if (buflen > PATH_MAX) 963 if (buflen > PATH_MAX)
964 buflen = PATH_MAX; 964 buflen = PATH_MAX;
965 965
966 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); 966 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
967 dentry);
967 968
968 retval = v9fs_readlink(dentry, link, buflen); 969 retval = v9fs_readlink(dentry, link, buflen);
969 970
@@ -1022,7 +1023,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1022{ 1023{
1023 char *s = nd_get_link(nd); 1024 char *s = nd_get_link(nd);
1024 1025
1025 P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name, s); 1026 P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
1027 IS_ERR(s) ? "<error>" : s);
1026 if (!IS_ERR(s)) 1028 if (!IS_ERR(s))
1027 __putname(s); 1029 __putname(s);
1028} 1030}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index d6cb1a0ca72..93212e40221 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -113,8 +113,8 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
113 struct v9fs_session_info *v9ses = NULL; 113 struct v9fs_session_info *v9ses = NULL;
114 struct p9_wstat *st = NULL; 114 struct p9_wstat *st = NULL;
115 int mode = S_IRWXUGO | S_ISVTX; 115 int mode = S_IRWXUGO | S_ISVTX;
116 uid_t uid = current->fsuid; 116 uid_t uid = current_fsuid();
117 gid_t gid = current->fsgid; 117 gid_t gid = current_fsgid();
118 struct p9_fid *fid; 118 struct p9_fid *fid;
119 int retval = 0; 119 int retval = 0;
120 120
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca..ff0e8198020 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -270,44 +270,7 @@ config OCFS2_COMPAT_JBD
270 270
271endif # BLOCK 271endif # BLOCK
272 272
273config DNOTIFY 273source "fs/notify/Kconfig"
274 bool "Dnotify support"
275 default y
276 help
277 Dnotify is a directory-based per-fd file change notification system
278 that uses signals to communicate events to user-space. There exist
279 superior alternatives, but some applications may still rely on
280 dnotify.
281
282 If unsure, say Y.
283
284config INOTIFY
285 bool "Inotify file change notification support"
286 default y
287 ---help---
288 Say Y here to enable inotify support. Inotify is a file change
289 notification system and a replacement for dnotify. Inotify fixes
290 numerous shortcomings in dnotify and introduces several new features
291 including multiple file events, one-shot support, and unmount
292 notification.
293
294 For more information, see <file:Documentation/filesystems/inotify.txt>
295
296 If unsure, say Y.
297
298config INOTIFY_USER
299 bool "Inotify support for userspace"
300 depends on INOTIFY
301 default y
302 ---help---
303 Say Y here to enable inotify support for userspace, including the
304 associated system calls. Inotify allows monitoring of both files and
305 directories via a single open fd. Events are read from the file
306 descriptor, which is also select()- and poll()-able.
307
308 For more information, see <file:Documentation/filesystems/inotify.txt>
309
310 If unsure, say Y.
311 274
312config QUOTA 275config QUOTA
313 bool "Quota support" 276 bool "Quota support"
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c..e6f423d1d22 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o 22obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
23obj-$(CONFIG_INOTIFY) += inotify.o 23obj-y += notify/
24obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
25obj-$(CONFIG_EPOLL) += eventpoll.o 24obj-$(CONFIG_EPOLL) += eventpoll.o
26obj-$(CONFIG_ANON_INODES) += anon_inodes.o 25obj-$(CONFIG_ANON_INODES) += anon_inodes.o
27obj-$(CONFIG_SIGNALFD) += signalfd.o 26obj-$(CONFIG_SIGNALFD) += signalfd.o
@@ -57,8 +56,6 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
57obj-$(CONFIG_QFMT_V2) += quota_v2.o 56obj-$(CONFIG_QFMT_V2) += quota_v2.o
58obj-$(CONFIG_QUOTACTL) += quota.o 57obj-$(CONFIG_QUOTACTL) += quota.o
59 58
60obj-$(CONFIG_DNOTIFY) += dnotify.o
61
62obj-$(CONFIG_PROC_FS) += proc/ 59obj-$(CONFIG_PROC_FS) += proc/
63obj-y += partitions/ 60obj-y += partitions/
64obj-$(CONFIG_SYSFS) += sysfs/ 61obj-$(CONFIG_SYSFS) += sysfs/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6..9246cb4aa01 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
628 } 628 }
629 629
630 index = pos >> PAGE_CACHE_SHIFT; 630 index = pos >> PAGE_CACHE_SHIFT;
631 page = __grab_cache_page(mapping, index); 631 page = grab_cache_page_write_begin(mapping, index, flags);
632 if (!page) 632 if (!page)
633 return -ENOMEM; 633 return -ENOMEM;
634 *pagep = page; 634 *pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index a13b334a391..415d9c67ac1 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -293,8 +293,8 @@ affs_new_inode(struct inode *dir)
293 mark_buffer_dirty_inode(bh, inode); 293 mark_buffer_dirty_inode(bh, inode);
294 affs_brelse(bh); 294 affs_brelse(bh);
295 295
296 inode->i_uid = current->fsuid; 296 inode->i_uid = current_fsuid();
297 inode->i_gid = current->fsgid; 297 inode->i_gid = current_fsgid();
298 inode->i_ino = block; 298 inode->i_ino = block;
299 inode->i_nlink = 1; 299 inode->i_nlink = 1;
300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 8989c93193e..a19d64b582a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -163,8 +163,8 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s
163 163
164 /* Fill in defaults */ 164 /* Fill in defaults */
165 165
166 *uid = current->uid; 166 *uid = current_uid();
167 *gid = current->gid; 167 *gid = current_gid();
168 *reserved = 2; 168 *reserved = 2;
169 *root = -1; 169 *root = -1;
170 *blocksize = -1; 170 *blocksize = -1;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 9f7d1ae7026..7578c1ab9e0 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -646,7 +646,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
646 } 646 }
647 647
648 /* display one cell per line on subsequent lines */ 648 /* display one cell per line on subsequent lines */
649 seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr)); 649 seq_printf(m, "%pI4\n", &addr->s_addr);
650 return 0; 650 return 0;
651} 651}
652 652
@@ -737,7 +737,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
737 } 737 }
738 738
739 /* display one cell per line on subsequent lines */ 739 /* display one cell per line on subsequent lines */
740 sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr)); 740 sprintf(ipaddr, "%pI4", &server->addr);
741 seq_printf(m, "%3d %-15.15s %5d\n", 741 seq_printf(m, "%3d %-15.15s %5d\n",
742 atomic_read(&server->usage), ipaddr, server->fs_state); 742 atomic_read(&server->usage), ipaddr, server->fs_state);
743 743
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 28f2451419e..f4909951667 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -105,7 +105,7 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell,
105{ 105{
106 struct afs_server *server, *candidate; 106 struct afs_server *server, *candidate;
107 107
108 _enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr)); 108 _enter("%p,%pI4", cell, &addr->s_addr);
109 109
110 /* quick scan of the list to see if we already have the server */ 110 /* quick scan of the list to see if we already have the server */
111 read_lock(&cell->servers_lock); 111 read_lock(&cell->servers_lock);
@@ -168,9 +168,8 @@ found_server:
168server_in_two_cells: 168server_in_two_cells:
169 write_unlock(&cell->servers_lock); 169 write_unlock(&cell->servers_lock);
170 kfree(candidate); 170 kfree(candidate);
171 printk(KERN_NOTICE "kAFS:" 171 printk(KERN_NOTICE "kAFS: Server %pI4 appears to be in two cells\n",
172 " Server "NIPQUAD_FMT" appears to be in two cells\n", 172 addr);
173 NIPQUAD(*addr));
174 _leave(" = -EEXIST"); 173 _leave(" = -EEXIST");
175 return ERR_PTR(-EEXIST); 174 return ERR_PTR(-EEXIST);
176} 175}
@@ -184,7 +183,7 @@ struct afs_server *afs_find_server(const struct in_addr *_addr)
184 struct rb_node *p; 183 struct rb_node *p;
185 struct in_addr addr = *_addr; 184 struct in_addr addr = *_addr;
186 185
187 _enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr)); 186 _enter("%pI4", &addr.s_addr);
188 187
189 read_lock(&afs_servers_lock); 188 read_lock(&afs_servers_lock);
190 189
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35f..3fb36d43362 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
144 candidate->state = AFS_WBACK_PENDING; 144 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 145 init_waitqueue_head(&candidate->waitq);
146 146
147 page = __grab_cache_page(mapping, index); 147 page = grab_cache_page_write_begin(mapping, index, flags);
148 if (!page) { 148 if (!page) {
149 kfree(candidate); 149 kfree(candidate);
150 return -ENOMEM; 150 return -ENOMEM;
diff --git a/fs/aio.c b/fs/aio.c
index f658441d566..d6f89d3c15e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ 191 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
192} while(0) 192} while(0)
193 193
194static void ctx_rcu_free(struct rcu_head *head)
195{
196 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
197 unsigned nr_events = ctx->max_reqs;
198
199 kmem_cache_free(kioctx_cachep, ctx);
200
201 if (nr_events) {
202 spin_lock(&aio_nr_lock);
203 BUG_ON(aio_nr - nr_events > aio_nr);
204 aio_nr -= nr_events;
205 spin_unlock(&aio_nr_lock);
206 }
207}
194 208
195/* __put_ioctx 209/* __put_ioctx
196 * Called when the last user of an aio context has gone away, 210 * Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
198 */ 212 */
199static void __put_ioctx(struct kioctx *ctx) 213static void __put_ioctx(struct kioctx *ctx)
200{ 214{
201 unsigned nr_events = ctx->max_reqs;
202
203 BUG_ON(ctx->reqs_active); 215 BUG_ON(ctx->reqs_active);
204 216
205 cancel_delayed_work(&ctx->wq); 217 cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
208 mmdrop(ctx->mm); 220 mmdrop(ctx->mm);
209 ctx->mm = NULL; 221 ctx->mm = NULL;
210 pr_debug("__put_ioctx: freeing %p\n", ctx); 222 pr_debug("__put_ioctx: freeing %p\n", ctx);
211 kmem_cache_free(kioctx_cachep, ctx); 223 call_rcu(&ctx->rcu_head, ctx_rcu_free);
212
213 if (nr_events) {
214 spin_lock(&aio_nr_lock);
215 BUG_ON(aio_nr - nr_events > aio_nr);
216 aio_nr -= nr_events;
217 spin_unlock(&aio_nr_lock);
218 }
219} 224}
220 225
221#define get_ioctx(kioctx) do { \ 226#define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
235{ 240{
236 struct mm_struct *mm; 241 struct mm_struct *mm;
237 struct kioctx *ctx; 242 struct kioctx *ctx;
243 int did_sync = 0;
238 244
239 /* Prevent overflows */ 245 /* Prevent overflows */
240 if ((nr_events > (0x10000000U / sizeof(struct io_event))) || 246 if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
267 goto out_freectx; 273 goto out_freectx;
268 274
269 /* limit the number of system wide aios */ 275 /* limit the number of system wide aios */
270 spin_lock(&aio_nr_lock); 276 do {
271 if (aio_nr + ctx->max_reqs > aio_max_nr || 277 spin_lock_bh(&aio_nr_lock);
272 aio_nr + ctx->max_reqs < aio_nr) 278 if (aio_nr + nr_events > aio_max_nr ||
273 ctx->max_reqs = 0; 279 aio_nr + nr_events < aio_nr)
274 else 280 ctx->max_reqs = 0;
275 aio_nr += ctx->max_reqs; 281 else
276 spin_unlock(&aio_nr_lock); 282 aio_nr += ctx->max_reqs;
283 spin_unlock_bh(&aio_nr_lock);
284 if (ctx->max_reqs || did_sync)
285 break;
286
287 /* wait for rcu callbacks to have completed before giving up */
288 synchronize_rcu();
289 did_sync = 1;
290 ctx->max_reqs = nr_events;
291 } while (1);
292
277 if (ctx->max_reqs == 0) 293 if (ctx->max_reqs == 0)
278 goto out_cleanup; 294 goto out_cleanup;
279 295
280 /* now link into global list. */ 296 /* now link into global list. */
281 write_lock(&mm->ioctx_list_lock); 297 spin_lock(&mm->ioctx_lock);
282 ctx->next = mm->ioctx_list; 298 hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
283 mm->ioctx_list = ctx; 299 spin_unlock(&mm->ioctx_lock);
284 write_unlock(&mm->ioctx_list_lock);
285 300
286 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 301 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
287 ctx, ctx->user_id, current->mm, ctx->ring_info.nr); 302 ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
375 */ 390 */
376void exit_aio(struct mm_struct *mm) 391void exit_aio(struct mm_struct *mm)
377{ 392{
378 struct kioctx *ctx = mm->ioctx_list; 393 struct kioctx *ctx;
379 mm->ioctx_list = NULL; 394
380 while (ctx) { 395 while (!hlist_empty(&mm->ioctx_list)) {
381 struct kioctx *next = ctx->next; 396 ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
382 ctx->next = NULL; 397 hlist_del_rcu(&ctx->list);
398
383 aio_cancel_all(ctx); 399 aio_cancel_all(ctx);
384 400
385 wait_for_all_aios(ctx); 401 wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
394 atomic_read(&ctx->users), ctx->dead, 410 atomic_read(&ctx->users), ctx->dead,
395 ctx->reqs_active); 411 ctx->reqs_active);
396 put_ioctx(ctx); 412 put_ioctx(ctx);
397 ctx = next;
398 } 413 }
399} 414}
400 415
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
555 570
556static struct kioctx *lookup_ioctx(unsigned long ctx_id) 571static struct kioctx *lookup_ioctx(unsigned long ctx_id)
557{ 572{
558 struct kioctx *ioctx; 573 struct mm_struct *mm = current->mm;
559 struct mm_struct *mm; 574 struct kioctx *ctx = NULL;
575 struct hlist_node *n;
560 576
561 mm = current->mm; 577 rcu_read_lock();
562 read_lock(&mm->ioctx_list_lock); 578
563 for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) 579 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
564 if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { 580 if (ctx->user_id == ctx_id && !ctx->dead) {
565 get_ioctx(ioctx); 581 get_ioctx(ctx);
566 break; 582 break;
567 } 583 }
568 read_unlock(&mm->ioctx_list_lock); 584 }
569 585
570 return ioctx; 586 rcu_read_unlock();
587 return ctx;
571} 588}
572 589
573/* 590/*
@@ -1215,19 +1232,14 @@ out:
1215static void io_destroy(struct kioctx *ioctx) 1232static void io_destroy(struct kioctx *ioctx)
1216{ 1233{
1217 struct mm_struct *mm = current->mm; 1234 struct mm_struct *mm = current->mm;
1218 struct kioctx **tmp;
1219 int was_dead; 1235 int was_dead;
1220 1236
1221 /* delete the entry from the list is someone else hasn't already */ 1237 /* delete the entry from the list is someone else hasn't already */
1222 write_lock(&mm->ioctx_list_lock); 1238 spin_lock(&mm->ioctx_lock);
1223 was_dead = ioctx->dead; 1239 was_dead = ioctx->dead;
1224 ioctx->dead = 1; 1240 ioctx->dead = 1;
1225 for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx; 1241 hlist_del_rcu(&ioctx->list);
1226 tmp = &(*tmp)->next) 1242 spin_unlock(&mm->ioctx_lock);
1227 ;
1228 if (*tmp)
1229 *tmp = ioctx->next;
1230 write_unlock(&mm->ioctx_list_lock);
1231 1243
1232 dprintk("aio_release(%p)\n", ioctx); 1244 dprintk("aio_release(%p)\n", ioctx);
1233 if (likely(!was_dead)) 1245 if (likely(!was_dead))
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3662dd44896..3bbdb9d0237 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
79 if (IS_ERR(anon_inode_inode)) 79 if (IS_ERR(anon_inode_inode))
80 return -ENODEV; 80 return -ENODEV;
81 81
82 if (fops->owner && !try_module_get(fops->owner))
83 return -ENOENT;
84
82 error = get_unused_fd_flags(flags); 85 error = get_unused_fd_flags(flags);
83 if (error < 0) 86 if (error < 0)
84 return error; 87 goto err_module;
85 fd = error; 88 fd = error;
86 89
87 /* 90 /*
@@ -128,6 +131,8 @@ err_dput:
128 dput(dentry); 131 dput(dentry);
129err_put_unused_fd: 132err_put_unused_fd:
130 put_unused_fd(fd); 133 put_unused_fd(fd);
134err_module:
135 module_put(fops->owner);
131 return error; 136 return error;
132} 137}
133EXPORT_SYMBOL_GPL(anon_inode_getfd); 138EXPORT_SYMBOL_GPL(anon_inode_getfd);
@@ -154,8 +159,8 @@ static struct inode *anon_inode_mkinode(void)
154 */ 159 */
155 inode->i_state = I_DIRTY; 160 inode->i_state = I_DIRTY;
156 inode->i_mode = S_IRUSR | S_IWUSR; 161 inode->i_mode = S_IRUSR | S_IWUSR;
157 inode->i_uid = current->fsuid; 162 inode->i_uid = current_fsuid();
158 inode->i_gid = current->fsgid; 163 inode->i_gid = current_fsgid();
159 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 164 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
160 return inode; 165 return inode;
161} 166}
diff --git a/fs/attr.c b/fs/attr.c
index 7a83819f6ba..f4360192a93 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -29,13 +29,13 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
29 29
30 /* Make sure a caller can chown. */ 30 /* Make sure a caller can chown. */
31 if ((ia_valid & ATTR_UID) && 31 if ((ia_valid & ATTR_UID) &&
32 (current->fsuid != inode->i_uid || 32 (current_fsuid() != inode->i_uid ||
33 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN)) 33 attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
34 goto error; 34 goto error;
35 35
36 /* Make sure caller can chgrp. */ 36 /* Make sure caller can chgrp. */
37 if ((ia_valid & ATTR_GID) && 37 if ((ia_valid & ATTR_GID) &&
38 (current->fsuid != inode->i_uid || 38 (current_fsuid() != inode->i_uid ||
39 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) && 39 (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
40 !capable(CAP_CHOWN)) 40 !capable(CAP_CHOWN))
41 goto error; 41 goto error;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index b70eea1e8c5..c773680d5c6 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -76,8 +76,8 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
76 substring_t args[MAX_OPT_ARGS]; 76 substring_t args[MAX_OPT_ARGS];
77 int option; 77 int option;
78 78
79 *uid = current->uid; 79 *uid = current_uid();
80 *gid = current->gid; 80 *gid = current_gid();
81 *pgrp = task_pgrp_nr(current); 81 *pgrp = task_pgrp_nr(current);
82 82
83 *minproto = *maxproto = AUTOFS_PROTO_VERSION; 83 *minproto = *maxproto = AUTOFS_PROTO_VERSION;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 33bf8cbfd05..63b7c7afe8d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -308,7 +308,8 @@ static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
308 goto out; 308 goto out;
309 } 309 }
310 310
311 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY); 311 filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
312 current_cred());
312 if (IS_ERR(filp)) { 313 if (IS_ERR(filp)) {
313 err = PTR_ERR(filp); 314 err = PTR_ERR(filp);
314 goto out; 315 goto out;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index c7e65bb30ba..7b19802cfef 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -235,8 +235,8 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
235 substring_t args[MAX_OPT_ARGS]; 235 substring_t args[MAX_OPT_ARGS];
236 int option; 236 int option;
237 237
238 *uid = current->uid; 238 *uid = current_uid();
239 *gid = current->gid; 239 *gid = current_gid();
240 *pgrp = task_pgrp_nr(current); 240 *pgrp = task_pgrp_nr(current);
241 241
242 *minproto = AUTOFS_MIN_PROTO_VERSION; 242 *minproto = AUTOFS_MIN_PROTO_VERSION;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 4b67c2a2d77..e02cc8ae5eb 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -391,8 +391,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
391 memcpy(&wq->name, &qstr, sizeof(struct qstr)); 391 memcpy(&wq->name, &qstr, sizeof(struct qstr));
392 wq->dev = autofs4_get_dev(sbi); 392 wq->dev = autofs4_get_dev(sbi);
393 wq->ino = autofs4_get_ino(sbi); 393 wq->ino = autofs4_get_ino(sbi);
394 wq->uid = current->uid; 394 wq->uid = current_uid();
395 wq->gid = current->gid; 395 wq->gid = current_gid();
396 wq->pid = current->pid; 396 wq->pid = current->pid;
397 wq->tgid = current->tgid; 397 wq->tgid = current->tgid;
398 wq->status = -EINTR; /* Status return if interrupted */ 398 wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1..a05287a23f6 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
132 return -EIO; 132 return -EIO;
133} 133}
134 134
135static int bad_file_dir_notify(struct file *file, unsigned long arg)
136{
137 return -EIO;
138}
139
140static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) 135static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
141{ 136{
142 return -EIO; 137 return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
179 .sendpage = bad_file_sendpage, 174 .sendpage = bad_file_sendpage,
180 .get_unmapped_area = bad_file_get_unmapped_area, 175 .get_unmapped_area = bad_file_get_unmapped_area,
181 .check_flags = bad_file_check_flags, 176 .check_flags = bad_file_check_flags,
182 .dir_notify = bad_file_dir_notify,
183 .flock = bad_file_flock, 177 .flock = bad_file_flock,
184 .splice_write = bad_file_splice_write, 178 .splice_write = bad_file_splice_write,
185 .splice_read = bad_file_splice_read, 179 .splice_read = bad_file_splice_read,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b..d06cb023ad0 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
378 inode->i_size = 0; 378 inode->i_size = 0;
379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; 379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, 380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
381 BEFS_SYMLINK_LEN); 381 BEFS_SYMLINK_LEN - 1);
382 befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
382 } else { 383 } else {
383 int num_blks; 384 int num_blks;
384 385
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
477 kfree(link); 478 kfree(link);
478 befs_error(sb, "Failed to read entire long symlink"); 479 befs_error(sb, "Failed to read entire long symlink");
479 link = ERR_PTR(-EIO); 480 link = ERR_PTR(-EIO);
481 } else {
482 link[len - 1] = '\0';
480 } 483 }
481 } else { 484 } else {
482 link = befs_ino->i_data.symlink; 485 link = befs_ino->i_data.symlink;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index daae463068e..4dd1b623f93 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -106,8 +106,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
106 } 106 }
107 set_bit(ino, info->si_imap); 107 set_bit(ino, info->si_imap);
108 info->si_freei--; 108 info->si_freei--;
109 inode->i_uid = current->fsuid; 109 inode->i_uid = current_fsuid();
110 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; 110 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
111 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 111 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
112 inode->i_blocks = 0; 112 inode->i_blocks = 0;
113 inode->i_op = &bfs_file_inops; 113 inode->i_op = &bfs_file_inops;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 204cfd1d767..b639dcf7c77 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
95 int has_dumped = 0; 95 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 96 unsigned long dump_start, dump_size;
97 struct user dump; 97 struct user dump;
98#if defined(__alpha__) 98#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 99# define START_DATA(u) (u.start_data)
100#elif defined(__arm__) 100#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
102#elif defined(__sparc__)
103# define START_DATA(u) (u.u_tsize)
104#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
105# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
106#endif 102#endif
107#ifdef __sparc__
108# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
109#else
110# define START_STACK(u) (u.start_stack) 103# define START_STACK(u) (u.start_stack)
111#endif
112 104
113 fs = get_fs(); 105 fs = get_fs();
114 set_fs(KERNEL_DS); 106 set_fs(KERNEL_DS);
115 has_dumped = 1; 107 has_dumped = 1;
116 current->flags |= PF_DUMPCORE; 108 current->flags |= PF_DUMPCORE;
117 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
118#ifndef __sparc__
119 dump.u_ar0 = offsetof(struct user, regs); 110 dump.u_ar0 = offsetof(struct user, regs);
120#endif
121 dump.signal = signr; 111 dump.signal = signr;
122 aout_dump_thread(regs, &dump); 112 aout_dump_thread(regs, &dump);
123 113
124/* If the size of the dump file exceeds the rlimit, then see what would happen 114/* If the size of the dump file exceeds the rlimit, then see what would happen
125 if we wrote the stack, but not the data area. */ 115 if we wrote the stack, but not the data area. */
126#ifdef __sparc__
127 if ((dump.u_dsize + dump.u_ssize) > limit)
128 dump.u_dsize = 0;
129#else
130 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
131 dump.u_dsize = 0; 117 dump.u_dsize = 0;
132#endif
133 118
134/* Make sure we have enough room to write the stack and data areas. */ 119/* Make sure we have enough room to write the stack and data areas. */
135#ifdef __sparc__
136 if (dump.u_ssize > limit)
137 dump.u_ssize = 0;
138#else
139 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
140 dump.u_ssize = 0; 121 dump.u_ssize = 0;
141#endif
142 122
143/* make sure we actually have a data and stack area to dump */ 123/* make sure we actually have a data and stack area to dump */
144 set_fs(USER_DS); 124 set_fs(USER_DS);
145#ifdef __sparc__
146 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
147 dump.u_dsize = 0;
148 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
149 dump.u_ssize = 0;
150#else
151 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
152 dump.u_dsize = 0; 126 dump.u_dsize = 0;
153 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
154 dump.u_ssize = 0; 128 dump.u_ssize = 0;
155#endif
156 129
157 set_fs(KERNEL_DS); 130 set_fs(KERNEL_DS);
158/* struct user */ 131/* struct user */
159 DUMP_WRITE(&dump,sizeof(dump)); 132 DUMP_WRITE(&dump,sizeof(dump));
160/* Now dump all of the user data. Include malloced stuff as well */ 133/* Now dump all of the user data. Include malloced stuff as well */
161#ifndef __sparc__
162 DUMP_SEEK(PAGE_SIZE); 134 DUMP_SEEK(PAGE_SIZE);
163#endif
164/* now we start writing out the user space info */ 135/* now we start writing out the user space info */
165 set_fs(USER_DS); 136 set_fs(USER_DS);
166/* Dump the data area */ 137/* Dump the data area */
167 if (dump.u_dsize != 0) { 138 if (dump.u_dsize != 0) {
168 dump_start = START_DATA(dump); 139 dump_start = START_DATA(dump);
169#ifdef __sparc__
170 dump_size = dump.u_dsize;
171#else
172 dump_size = dump.u_dsize << PAGE_SHIFT; 140 dump_size = dump.u_dsize << PAGE_SHIFT;
173#endif
174 DUMP_WRITE(dump_start,dump_size); 141 DUMP_WRITE(dump_start,dump_size);
175 } 142 }
176/* Now prepare to dump the stack area */ 143/* Now prepare to dump the stack area */
177 if (dump.u_ssize != 0) { 144 if (dump.u_ssize != 0) {
178 dump_start = START_STACK(dump); 145 dump_start = START_STACK(dump);
179#ifdef __sparc__
180 dump_size = dump.u_ssize;
181#else
182 dump_size = dump.u_ssize << PAGE_SHIFT; 146 dump_size = dump.u_ssize << PAGE_SHIFT;
183#endif
184 DUMP_WRITE(dump_start,dump_size); 147 DUMP_WRITE(dump_start,dump_size);
185 } 148 }
186/* Finally dump the task struct. Not be used by gdb, but could be useful */ 149/* Finally dump the task struct. Not be used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
205 int envc = bprm->envc; 168 int envc = bprm->envc;
206 169
207 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); 170 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
208#ifdef __sparc__
209 /* This imposes the proper stack alignment for a new process. */
210 sp = (void __user *) (((unsigned long) sp) & ~7);
211 if ((envc+argc+3)&1) --sp;
212#endif
213#ifdef __alpha__ 171#ifdef __alpha__
214/* whee.. test-programs are so much fun. */ 172/* whee.. test-programs are so much fun. */
215 put_user(0, --sp); 173 put_user(0, --sp);
216 put_user(0, --sp); 174 put_user(0, --sp);
217 if (bprm->loader) { 175 if (bprm->loader) {
218 put_user(0, --sp); 176 put_user(0, --sp);
219 put_user(0x3eb, --sp); 177 put_user(1003, --sp);
220 put_user(bprm->loader, --sp); 178 put_user(bprm->loader, --sp);
221 put_user(0x3ea, --sp); 179 put_user(1002, --sp);
222 } 180 }
223 put_user(bprm->exec, --sp); 181 put_user(bprm->exec, --sp);
224 put_user(0x3e9, --sp); 182 put_user(1001, --sp);
225#endif 183#endif
226 sp -= envc+1; 184 sp -= envc+1;
227 envp = (char __user * __user *) sp; 185 envp = (char __user * __user *) sp;
228 sp -= argc+1; 186 sp -= argc+1;
229 argv = (char __user * __user *) sp; 187 argv = (char __user * __user *) sp;
230#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__) 188#ifndef __alpha__
231 put_user((unsigned long) envp,--sp); 189 put_user((unsigned long) envp,--sp);
232 put_user((unsigned long) argv,--sp); 190 put_user((unsigned long) argv,--sp);
233#endif 191#endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
300 return retval; 258 return retval;
301 259
302 /* OK, This is the point of no return */ 260 /* OK, This is the point of no return */
303#if defined(__alpha__) 261#ifdef __alpha__
304 SET_AOUT_PERSONALITY(bprm, ex); 262 SET_AOUT_PERSONALITY(bprm, ex);
305#elif defined(__sparc__)
306 set_personality(PER_SUNOS);
307#if !defined(__sparc_v9__)
308 memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
309#endif
310#else 263#else
311 set_personality(PER_LINUX); 264 set_personality(PER_LINUX);
312#endif 265#endif
@@ -320,26 +273,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
320 current->mm->free_area_cache = current->mm->mmap_base; 273 current->mm->free_area_cache = current->mm->mmap_base;
321 current->mm->cached_hole_size = 0; 274 current->mm->cached_hole_size = 0;
322 275
323 compute_creds(bprm); 276 install_exec_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 277 current->flags &= ~PF_FORKNOEXEC;
325#ifdef __sparc__
326 if (N_MAGIC(ex) == NMAGIC) {
327 loff_t pos = fd_offset;
328 /* Fuck me plenty... */
329 /* <AOL></AOL> */
330 down_write(&current->mm->mmap_sem);
331 error = do_brk(N_TXTADDR(ex), ex.a_text);
332 up_write(&current->mm->mmap_sem);
333 bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
334 ex.a_text, &pos);
335 down_write(&current->mm->mmap_sem);
336 error = do_brk(N_DATADDR(ex), ex.a_data);
337 up_write(&current->mm->mmap_sem);
338 bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
339 ex.a_data, &pos);
340 goto beyond_if;
341 }
342#endif
343 278
344 if (N_MAGIC(ex) == OMAGIC) { 279 if (N_MAGIC(ex) == OMAGIC) {
345 unsigned long text_addr, map_size; 280 unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
347 282
348 text_addr = N_TXTADDR(ex); 283 text_addr = N_TXTADDR(ex);
349 284
350#if defined(__alpha__) || defined(__sparc__) 285#ifdef __alpha__
351 pos = fd_offset; 286 pos = fd_offset;
352 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; 287 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
353#else 288#else
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8fcfa398d35..c41fa2af767 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -157,7 +157,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
157 int items; 157 int items;
158 elf_addr_t *elf_info; 158 elf_addr_t *elf_info;
159 int ei_index = 0; 159 int ei_index = 0;
160 struct task_struct *tsk = current; 160 const struct cred *cred = current_cred();
161 struct vm_area_struct *vma; 161 struct vm_area_struct *vma;
162 162
163 /* 163 /*
@@ -223,10 +223,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
223 NEW_AUX_ENT(AT_BASE, interp_load_addr); 223 NEW_AUX_ENT(AT_BASE, interp_load_addr);
224 NEW_AUX_ENT(AT_FLAGS, 0); 224 NEW_AUX_ENT(AT_FLAGS, 0);
225 NEW_AUX_ENT(AT_ENTRY, exec->e_entry); 225 NEW_AUX_ENT(AT_ENTRY, exec->e_entry);
226 NEW_AUX_ENT(AT_UID, tsk->uid); 226 NEW_AUX_ENT(AT_UID, cred->uid);
227 NEW_AUX_ENT(AT_EUID, tsk->euid); 227 NEW_AUX_ENT(AT_EUID, cred->euid);
228 NEW_AUX_ENT(AT_GID, tsk->gid); 228 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, tsk->egid); 229 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 231 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 232 if (k_platform) {
@@ -949,14 +949,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
949 set_binfmt(&elf_format); 949 set_binfmt(&elf_format);
950 950
951#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES 951#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
952 retval = arch_setup_additional_pages(bprm, executable_stack); 952 retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
953 if (retval < 0) { 953 if (retval < 0) {
954 send_sig(SIGKILL, current, 0); 954 send_sig(SIGKILL, current, 0);
955 goto out; 955 goto out;
956 } 956 }
957#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ 957#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
958 958
959 compute_creds(bprm); 959 install_exec_creds(bprm);
960 current->flags &= ~PF_FORKNOEXEC; 960 current->flags &= ~PF_FORKNOEXEC;
961 retval = create_elf_tables(bprm, &loc->elf_ex, 961 retval = create_elf_tables(bprm, &loc->elf_ex,
962 load_addr, interp_load_addr); 962 load_addr, interp_load_addr);
@@ -1361,6 +1361,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1361static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, 1361static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1362 struct mm_struct *mm) 1362 struct mm_struct *mm)
1363{ 1363{
1364 const struct cred *cred;
1364 unsigned int i, len; 1365 unsigned int i, len;
1365 1366
1366 /* first copy the parameters from user space */ 1367 /* first copy the parameters from user space */
@@ -1388,8 +1389,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1388 psinfo->pr_zomb = psinfo->pr_sname == 'Z'; 1389 psinfo->pr_zomb = psinfo->pr_sname == 'Z';
1389 psinfo->pr_nice = task_nice(p); 1390 psinfo->pr_nice = task_nice(p);
1390 psinfo->pr_flag = p->flags; 1391 psinfo->pr_flag = p->flags;
1391 SET_UID(psinfo->pr_uid, p->uid); 1392 rcu_read_lock();
1392 SET_GID(psinfo->pr_gid, p->gid); 1393 cred = __task_cred(p);
1394 SET_UID(psinfo->pr_uid, cred->uid);
1395 SET_GID(psinfo->pr_gid, cred->gid);
1396 rcu_read_unlock();
1393 strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); 1397 strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
1394 1398
1395 return 0; 1399 return 0;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 5b5424cb339..aa5b43205e3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -404,7 +404,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
404 current->mm->start_stack = current->mm->start_brk + stack_size; 404 current->mm->start_stack = current->mm->start_brk + stack_size;
405#endif 405#endif
406 406
407 compute_creds(bprm); 407 install_exec_creds(bprm);
408 current->flags &= ~PF_FORKNOEXEC; 408 current->flags &= ~PF_FORKNOEXEC;
409 if (create_elf_fdpic_tables(bprm, current->mm, 409 if (create_elf_fdpic_tables(bprm, current->mm,
410 &exec_params, &interp_params) < 0) 410 &exec_params, &interp_params) < 0)
@@ -475,6 +475,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
475 struct elf_fdpic_params *exec_params, 475 struct elf_fdpic_params *exec_params,
476 struct elf_fdpic_params *interp_params) 476 struct elf_fdpic_params *interp_params)
477{ 477{
478 const struct cred *cred = current_cred();
478 unsigned long sp, csp, nitems; 479 unsigned long sp, csp, nitems;
479 elf_caddr_t __user *argv, *envp; 480 elf_caddr_t __user *argv, *envp;
480 size_t platform_len = 0, len; 481 size_t platform_len = 0, len;
@@ -623,10 +624,10 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
623 NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr); 624 NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
624 NEW_AUX_ENT(AT_FLAGS, 0); 625 NEW_AUX_ENT(AT_FLAGS, 0);
625 NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr); 626 NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
626 NEW_AUX_ENT(AT_UID, (elf_addr_t) current->uid); 627 NEW_AUX_ENT(AT_UID, (elf_addr_t) cred->uid);
627 NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); 628 NEW_AUX_ENT(AT_EUID, (elf_addr_t) cred->euid);
628 NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); 629 NEW_AUX_ENT(AT_GID, (elf_addr_t) cred->gid);
629 NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); 630 NEW_AUX_ENT(AT_EGID, (elf_addr_t) cred->egid);
630 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 631 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
631 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 632 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
632 633
@@ -1413,6 +1414,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
1413static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, 1414static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1414 struct mm_struct *mm) 1415 struct mm_struct *mm)
1415{ 1416{
1417 const struct cred *cred;
1416 unsigned int i, len; 1418 unsigned int i, len;
1417 1419
1418 /* first copy the parameters from user space */ 1420 /* first copy the parameters from user space */
@@ -1440,8 +1442,11 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
1440 psinfo->pr_zomb = psinfo->pr_sname == 'Z'; 1442 psinfo->pr_zomb = psinfo->pr_sname == 'Z';
1441 psinfo->pr_nice = task_nice(p); 1443 psinfo->pr_nice = task_nice(p);
1442 psinfo->pr_flag = p->flags; 1444 psinfo->pr_flag = p->flags;
1443 SET_UID(psinfo->pr_uid, p->uid); 1445 rcu_read_lock();
1444 SET_GID(psinfo->pr_gid, p->gid); 1446 cred = __task_cred(p);
1447 SET_UID(psinfo->pr_uid, cred->uid);
1448 SET_GID(psinfo->pr_gid, cred->gid);
1449 rcu_read_unlock();
1445 strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); 1450 strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
1446 1451
1447 return 0; 1452 return 0;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index ccb781a6a80..7bbd5c6b372 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -880,7 +880,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
880 (libinfo.lib_list[j].loaded)? 880 (libinfo.lib_list[j].loaded)?
881 libinfo.lib_list[j].start_data:UNLOADED_LIB; 881 libinfo.lib_list[j].start_data:UNLOADED_LIB;
882 882
883 compute_creds(bprm); 883 install_exec_creds(bprm);
884 current->flags &= ~PF_FORKNOEXEC; 884 current->flags &= ~PF_FORKNOEXEC;
885 885
886 set_binfmt(&flat_format); 886 set_binfmt(&flat_format);
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 74e587a5279..08644a61616 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -255,7 +255,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
255 kfree(hpuxhdr); 255 kfree(hpuxhdr);
256 256
257 set_binfmt(&som_format); 257 set_binfmt(&som_format);
258 compute_creds(bprm); 258 install_exec_creds(bprm);
259 setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 259 setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
260 260
261 create_som_tables(bprm); 261 create_som_tables(bprm);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962a..77ebc3c263d 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
111 && bip->bip_buf != NULL) 111 && bip->bip_buf != NULL)
112 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
113 113
114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
115 mempool_free(bip, bs->bio_integrity_pool); 115 mempool_free(bip, bs->bio_integrity_pool);
116 116
117 bio->bi_integrity = NULL; 117 bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index 77a55bcceed..711cee10360 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,9 +26,16 @@
26#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <trace/block.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 30#include <scsi/sg.h> /* for struct sg_iovec */
30 31
31static struct kmem_cache *bio_slab __read_mostly; 32DEFINE_TRACE(block_split);
33
34/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio
36 * itself, to shrink a bio data allocation from two mempool calls to one
37 */
38#define BIO_INLINE_VECS 4
32 39
33static mempool_t *bio_split_pool __read_mostly; 40static mempool_t *bio_split_pool __read_mostly;
34 41
@@ -37,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly;
37 * break badly! cannot be bigger than what you can fit into an 44 * break badly! cannot be bigger than what you can fit into an
38 * unsigned short 45 * unsigned short
39 */ 46 */
40
41#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 47#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
42static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 48struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
43 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 49 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
44}; 50};
45#undef BV 51#undef BV
@@ -50,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
50 */ 56 */
51struct bio_set *fs_bio_set; 57struct bio_set *fs_bio_set;
52 58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab;
77 unsigned int i, entry = -1;
78
79 mutex_lock(&bio_slab_lock);
80
81 i = 0;
82 while (i < bio_slab_nr) {
83 struct bio_slab *bslab = &bio_slabs[i];
84
85 if (!bslab->slab && entry == -1)
86 entry = i;
87 else if (bslab->slab_size == sz) {
88 slab = bslab->slab;
89 bslab->slab_ref++;
90 break;
91 }
92 i++;
93 }
94
95 if (slab)
96 goto out_unlock;
97
98 if (bio_slab_nr == bio_slab_max && entry == -1) {
99 bio_slab_max <<= 1;
100 bio_slabs = krealloc(bio_slabs,
101 bio_slab_max * sizeof(struct bio_slab),
102 GFP_KERNEL);
103 if (!bio_slabs)
104 goto out_unlock;
105 }
106 if (entry == -1)
107 entry = bio_slab_nr++;
108
109 bslab = &bio_slabs[entry];
110
111 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
112 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
113 if (!slab)
114 goto out_unlock;
115
116 printk("bio: create slab <%s> at %d\n", bslab->name, entry);
117 bslab->slab = slab;
118 bslab->slab_ref = 1;
119 bslab->slab_size = sz;
120out_unlock:
121 mutex_unlock(&bio_slab_lock);
122 return slab;
123}
124
125static void bio_put_slab(struct bio_set *bs)
126{
127 struct bio_slab *bslab = NULL;
128 unsigned int i;
129
130 mutex_lock(&bio_slab_lock);
131
132 for (i = 0; i < bio_slab_nr; i++) {
133 if (bs->bio_slab == bio_slabs[i].slab) {
134 bslab = &bio_slabs[i];
135 break;
136 }
137 }
138
139 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
140 goto out;
141
142 WARN_ON(!bslab->slab_ref);
143
144 if (--bslab->slab_ref)
145 goto out;
146
147 kmem_cache_destroy(bslab->slab);
148 bslab->slab = NULL;
149
150out:
151 mutex_unlock(&bio_slab_lock);
152}
153
53unsigned int bvec_nr_vecs(unsigned short idx) 154unsigned int bvec_nr_vecs(unsigned short idx)
54{ 155{
55 return bvec_slabs[idx].nr_vecs; 156 return bvec_slabs[idx].nr_vecs;
56} 157}
57 158
58struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 159void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
160{
161 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
162
163 if (idx == BIOVEC_MAX_IDX)
164 mempool_free(bv, bs->bvec_pool);
165 else {
166 struct biovec_slab *bvs = bvec_slabs + idx;
167
168 kmem_cache_free(bvs->slab, bv);
169 }
170}
171
172struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
173 struct bio_set *bs)
59{ 174{
60 struct bio_vec *bvl; 175 struct bio_vec *bvl;
61 176
@@ -64,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
64 * If not, this is a bio_kmalloc() allocation and just do a 179 * If not, this is a bio_kmalloc() allocation and just do a
65 * kzalloc() for the exact number of vecs right away. 180 * kzalloc() for the exact number of vecs right away.
66 */ 181 */
67 if (bs) { 182 if (!bs)
183 bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
184
185 /*
186 * see comment near bvec_array define!
187 */
188 switch (nr) {
189 case 1:
190 *idx = 0;
191 break;
192 case 2 ... 4:
193 *idx = 1;
194 break;
195 case 5 ... 16:
196 *idx = 2;
197 break;
198 case 17 ... 64:
199 *idx = 3;
200 break;
201 case 65 ... 128:
202 *idx = 4;
203 break;
204 case 129 ... BIO_MAX_PAGES:
205 *idx = 5;
206 break;
207 default:
208 return NULL;
209 }
210
211 /*
212 * idx now points to the pool we want to allocate from. only the
213 * 1-vec entry pool is mempool backed.
214 */
215 if (*idx == BIOVEC_MAX_IDX) {
216fallback:
217 bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
218 } else {
219 struct biovec_slab *bvs = bvec_slabs + *idx;
220 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
221
68 /* 222 /*
69 * see comment near bvec_array define! 223 * Make this allocation restricted and don't dump info on
224 * allocation failures, since we'll fallback to the mempool
225 * in case of failure.
70 */ 226 */
71 switch (nr) { 227 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
72 case 1:
73 *idx = 0;
74 break;
75 case 2 ... 4:
76 *idx = 1;
77 break;
78 case 5 ... 16:
79 *idx = 2;
80 break;
81 case 17 ... 64:
82 *idx = 3;
83 break;
84 case 65 ... 128:
85 *idx = 4;
86 break;
87 case 129 ... BIO_MAX_PAGES:
88 *idx = 5;
89 break;
90 default:
91 return NULL;
92 }
93 228
94 /* 229 /*
95 * idx now points to the pool we want to allocate from 230 * Try a slab allocation. If this fails and __GFP_WAIT
231 * is set, retry with the 1-entry mempool
96 */ 232 */
97 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 233 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
98 if (bvl) 234 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
99 memset(bvl, 0, 235 *idx = BIOVEC_MAX_IDX;
100 bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 236 goto fallback;
101 } else 237 }
102 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); 238 }
103 239
104 return bvl; 240 return bvl;
105} 241}
106 242
107void bio_free(struct bio *bio, struct bio_set *bio_set) 243void bio_free(struct bio *bio, struct bio_set *bs)
108{ 244{
109 if (bio->bi_io_vec) { 245 void *p;
110 const int pool_idx = BIO_POOL_IDX(bio);
111 246
112 BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS); 247 if (bio_has_allocated_vec(bio))
113 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
114 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
115 }
116 249
117 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
118 bio_integrity_free(bio, bio_set); 251 bio_integrity_free(bio, bs);
252
253 /*
254 * If we have front padding, adjust the bio pointer before freeing
255 */
256 p = bio;
257 if (bs->front_pad)
258 p -= bs->front_pad;
119 259
120 mempool_free(bio, bio_set->bio_pool); 260 mempool_free(p, bs->bio_pool);
121} 261}
122 262
123/* 263/*
@@ -130,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio)
130 270
131static void bio_kmalloc_destructor(struct bio *bio) 271static void bio_kmalloc_destructor(struct bio *bio)
132{ 272{
133 kfree(bio->bi_io_vec); 273 if (bio_has_allocated_vec(bio))
274 kfree(bio->bi_io_vec);
134 kfree(bio); 275 kfree(bio);
135} 276}
136 277
@@ -154,16 +295,20 @@ void bio_init(struct bio *bio)
154 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 295 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
155 * fall back to just using @kmalloc to allocate the required memory. 296 * fall back to just using @kmalloc to allocate the required memory.
156 * 297 *
157 * allocate bio and iovecs from the memory pools specified by the 298 * Note that the caller must set ->bi_destructor on succesful return
158 * bio_set structure, or @kmalloc if none given. 299 * of a bio, to do the appropriate freeing of the bio once the reference
300 * count drops to zero.
159 **/ 301 **/
160struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
161{ 303{
162 struct bio *bio; 304 struct bio *bio = NULL;
305
306 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask);
163 308
164 if (bs) 309 if (p)
165 bio = mempool_alloc(bs->bio_pool, gfp_mask); 310 bio = p + bs->front_pad;
166 else 311 } else
167 bio = kmalloc(sizeof(*bio), gfp_mask); 312 bio = kmalloc(sizeof(*bio), gfp_mask);
168 313
169 if (likely(bio)) { 314 if (likely(bio)) {
@@ -173,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
173 if (likely(nr_iovecs)) { 318 if (likely(nr_iovecs)) {
174 unsigned long uninitialized_var(idx); 319 unsigned long uninitialized_var(idx);
175 320
176 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 321 if (nr_iovecs <= BIO_INLINE_VECS) {
322 idx = 0;
323 bvl = bio->bi_inline_vecs;
324 nr_iovecs = BIO_INLINE_VECS;
325 } else {
326 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
327 bs);
328 nr_iovecs = bvec_nr_vecs(idx);
329 }
177 if (unlikely(!bvl)) { 330 if (unlikely(!bvl)) {
178 if (bs) 331 if (bs)
179 mempool_free(bio, bs->bio_pool); 332 mempool_free(bio, bs->bio_pool);
@@ -183,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
183 goto out; 336 goto out;
184 } 337 }
185 bio->bi_flags |= idx << BIO_POOL_OFFSET; 338 bio->bi_flags |= idx << BIO_POOL_OFFSET;
186 bio->bi_max_vecs = bvec_nr_vecs(idx); 339 bio->bi_max_vecs = nr_iovecs;
187 } 340 }
188 bio->bi_io_vec = bvl; 341 bio->bi_io_vec = bvl;
189 } 342 }
@@ -1263,7 +1416,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1263 if (!bp) 1416 if (!bp)
1264 return bp; 1417 return bp;
1265 1418
1266 blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, 1419 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1267 bi->bi_sector + first_sectors); 1420 bi->bi_sector + first_sectors);
1268 1421
1269 BUG_ON(bi->bi_vcnt != 1); 1422 BUG_ON(bi->bi_vcnt != 1);
@@ -1343,30 +1496,18 @@ EXPORT_SYMBOL(bio_sector_offset);
1343 */ 1496 */
1344static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1497static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1345{ 1498{
1346 int i; 1499 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1347 1500
1348 for (i = 0; i < BIOVEC_NR_POOLS; i++) { 1501 bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
1349 struct biovec_slab *bp = bvec_slabs + i; 1502 if (!bs->bvec_pool)
1350 mempool_t **bvp = bs->bvec_pools + i; 1503 return -ENOMEM;
1351 1504
1352 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1353 if (!*bvp)
1354 return -ENOMEM;
1355 }
1356 return 0; 1505 return 0;
1357} 1506}
1358 1507
1359static void biovec_free_pools(struct bio_set *bs) 1508static void biovec_free_pools(struct bio_set *bs)
1360{ 1509{
1361 int i; 1510 mempool_destroy(bs->bvec_pool);
1362
1363 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
1364 mempool_t *bvp = bs->bvec_pools[i];
1365
1366 if (bvp)
1367 mempool_destroy(bvp);
1368 }
1369
1370} 1511}
1371 1512
1372void bioset_free(struct bio_set *bs) 1513void bioset_free(struct bio_set *bs)
@@ -1376,25 +1517,49 @@ void bioset_free(struct bio_set *bs)
1376 1517
1377 bioset_integrity_free(bs); 1518 bioset_integrity_free(bs);
1378 biovec_free_pools(bs); 1519 biovec_free_pools(bs);
1520 bio_put_slab(bs);
1379 1521
1380 kfree(bs); 1522 kfree(bs);
1381} 1523}
1382 1524
1383struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) 1525/**
1526 * bioset_create - Create a bio_set
1527 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1528 * @front_pad: Number of bytes to allocate in front of the returned bio
1529 *
1530 * Description:
1531 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1532 * to ask for a number of bytes to be allocated in front of the bio.
1533 * Front pad allocation is useful for embedding the bio inside
1534 * another structure, to avoid allocating extra data to go with the bio.
1535 * Note that the bio must be embedded at the END of that structure always,
1536 * or things will break badly.
1537 */
1538struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1384{ 1539{
1385 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1540 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1541 struct bio_set *bs;
1386 1542
1543 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1387 if (!bs) 1544 if (!bs)
1388 return NULL; 1545 return NULL;
1389 1546
1390 bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); 1547 bs->front_pad = front_pad;
1548
1549 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1550 if (!bs->bio_slab) {
1551 kfree(bs);
1552 return NULL;
1553 }
1554
1555 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1391 if (!bs->bio_pool) 1556 if (!bs->bio_pool)
1392 goto bad; 1557 goto bad;
1393 1558
1394 if (bioset_integrity_create(bs, bio_pool_size)) 1559 if (bioset_integrity_create(bs, pool_size))
1395 goto bad; 1560 goto bad;
1396 1561
1397 if (!biovec_create_pools(bs, bvec_pool_size)) 1562 if (!biovec_create_pools(bs, pool_size))
1398 return bs; 1563 return bs;
1399 1564
1400bad: 1565bad:
@@ -1418,12 +1583,16 @@ static void __init biovec_init_slabs(void)
1418 1583
1419static int __init init_bio(void) 1584static int __init init_bio(void)
1420{ 1585{
1421 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1586 bio_slab_max = 2;
1587 bio_slab_nr = 0;
1588 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
1589 if (!bio_slabs)
1590 panic("bio: can't allocate bios\n");
1422 1591
1423 bio_integrity_init_slab(); 1592 bio_integrity_init_slab();
1424 biovec_init_slabs(); 1593 biovec_init_slabs();
1425 1594
1426 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1595 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
1427 if (!fs_bio_set) 1596 if (!fs_bio_set)
1428 panic("bio: can't allocate bios\n"); 1597 panic("bio: can't allocate bios\n");
1429 1598
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c7..349a26c1000 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -326,12 +326,13 @@ static struct file_system_type bd_type = {
326 .kill_sb = kill_anon_super, 326 .kill_sb = kill_anon_super,
327}; 327};
328 328
329static struct vfsmount *bd_mnt __read_mostly; 329struct super_block *blockdev_superblock __read_mostly;
330struct super_block *blockdev_superblock;
331 330
332void __init bdev_cache_init(void) 331void __init bdev_cache_init(void)
333{ 332{
334 int err; 333 int err;
334 struct vfsmount *bd_mnt;
335
335 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 336 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
336 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 337 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
337 SLAB_MEM_SPREAD|SLAB_PANIC), 338 SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +374,7 @@ struct block_device *bdget(dev_t dev)
373 struct block_device *bdev; 374 struct block_device *bdev;
374 struct inode *inode; 375 struct inode *inode;
375 376
376 inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), 377 inode = iget5_locked(blockdev_superblock, hash(dev),
377 bdev_test, bdev_set, &dev); 378 bdev_test, bdev_set, &dev);
378 379
379 if (!inode) 380 if (!inode)
@@ -463,7 +464,7 @@ void bd_forget(struct inode *inode)
463 464
464 spin_lock(&bdev_lock); 465 spin_lock(&bdev_lock);
465 if (inode->i_bdev) { 466 if (inode->i_bdev) {
466 if (inode->i_sb != blockdev_superblock) 467 if (!sb_is_blkdev_sb(inode->i_sb))
467 bdev = inode->i_bdev; 468 bdev = inode->i_bdev;
468 __bd_forget(inode); 469 __bd_forget(inode);
469 } 470 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa115..a13f09b696f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
99 page_cache_release(page); 99 page_cache_release(page);
100} 100}
101 101
102
103static int quiet_error(struct buffer_head *bh)
104{
105 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
106 return 0;
107 return 1;
108}
109
110
102static void buffer_io_error(struct buffer_head *bh) 111static void buffer_io_error(struct buffer_head *bh)
103{ 112{
104 char b[BDEVNAME_SIZE]; 113 char b[BDEVNAME_SIZE];
105
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 114 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b), 115 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr); 116 (unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
144 if (uptodate) { 152 if (uptodate) {
145 set_buffer_uptodate(bh); 153 set_buffer_uptodate(bh);
146 } else { 154 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 155 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
148 buffer_io_error(bh); 156 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to " 157 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n", 158 "I/O error on %s\n",
@@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
394 set_buffer_uptodate(bh); 402 set_buffer_uptodate(bh);
395 } else { 403 } else {
396 clear_buffer_uptodate(bh); 404 clear_buffer_uptodate(bh);
397 if (printk_ratelimit()) 405 if (!quiet_error(bh))
398 buffer_io_error(bh); 406 buffer_io_error(bh);
399 SetPageError(page); 407 SetPageError(page);
400 } 408 }
@@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
455 if (uptodate) { 463 if (uptodate) {
456 set_buffer_uptodate(bh); 464 set_buffer_uptodate(bh);
457 } else { 465 } else {
458 if (printk_ratelimit()) { 466 if (!quiet_error(bh)) {
459 buffer_io_error(bh); 467 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to " 468 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n", 469 "I/O error on %s\n",
@@ -1988,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1988 page = *pagep; 1996 page = *pagep;
1989 if (page == NULL) { 1997 if (page == NULL) {
1990 ownpage = 1; 1998 ownpage = 1;
1991 page = __grab_cache_page(mapping, index); 1999 page = grab_cache_page_write_begin(mapping, index, flags);
1992 if (!page) { 2000 if (!page) {
1993 status = -ENOMEM; 2001 status = -ENOMEM;
1994 goto out; 2002 goto out;
@@ -2494,7 +2502,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2494 from = pos & (PAGE_CACHE_SIZE - 1); 2502 from = pos & (PAGE_CACHE_SIZE - 1);
2495 to = from + len; 2503 to = from + len;
2496 2504
2497 page = __grab_cache_page(mapping, index); 2505 page = grab_cache_page_write_begin(mapping, index, flags);
2498 if (!page) 2506 if (!page)
2499 return -ENOMEM; 2507 return -ENOMEM;
2500 *pagep = page; 2508 *pagep = page;
@@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2913 set_bit(BH_Eopnotsupp, &bh->b_state); 2921 set_bit(BH_Eopnotsupp, &bh->b_state);
2914 } 2922 }
2915 2923
2924 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2925 set_bit(BH_Quiet, &bh->b_state);
2926
2916 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2927 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2917 bio_put(bio); 2928 bio_put(bio);
2918} 2929}
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
index 9c136d7803d..7f7fa3c302a 100644
--- a/fs/cifs/AUTHORS
+++ b/fs/cifs/AUTHORS
@@ -36,7 +36,9 @@ Miklos Szeredi
36Kazeon team for various fixes especially for 2.4 version. 36Kazeon team for various fixes especially for 2.4 version.
37Asser Ferno (Change Notify support) 37Asser Ferno (Change Notify support)
38Shaggy (Dave Kleikamp) for inumerable small fs suggestions and some good cleanup 38Shaggy (Dave Kleikamp) for inumerable small fs suggestions and some good cleanup
39Gunter Kukkukk (testing and suggestions for support of old servers)
39Igor Mammedov (DFS support) 40Igor Mammedov (DFS support)
41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
40 42
41Test case and Bug Report contributors 43Test case and Bug Report contributors
42------------------------------------- 44-------------------------------------
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e078b7aea14..080703a15f4 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
1Version 1.56
2------------
3Add "forcemandatorylock" mount option to allow user to use mandatory
4rather than posix (advisory) byte range locks, even though server would
5support posix byte range locks. Fix query of root inode when prefixpath
6specified and user does not have access to query information about the
7top of the share. Fix problem in 2.6.28 resolving DFS paths to
8Samba servers (worked to Windows).
9
1Version 1.55 10Version 1.55
2------------ 11------------
3Various fixes to make delete of open files behavior more predictable 12Various fixes to make delete of open files behavior more predictable
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346f..9948c0030e8 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
5 5
6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ 6cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ 7 link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \ 8 md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
9 readdir.o ioctl.o sess.o export.o cifsacl.o 9 readdir.o ioctl.o sess.o export.o cifsacl.o
10 10
11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o 11cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/README b/fs/cifs/README
index a439dc1739b..da4515e3be2 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -463,9 +463,19 @@ A partial list of the supported mount options follows:
463 with cifs style mandatory byte range locks (and most 463 with cifs style mandatory byte range locks (and most
464 cifs servers do not yet support requesting advisory 464 cifs servers do not yet support requesting advisory
465 byte range locks). 465 byte range locks).
466 forcemandatorylock Even if the server supports posix (advisory) byte range
467 locking, send only mandatory lock requests. For some
468 (presumably rare) applications, originally coded for
469 DOS/Windows, which require Windows style mandatory byte range
470 locking, they may be able to take advantage of this option,
471 forcing the cifs client to only send mandatory locks
472 even if the cifs server would support posix advisory locks.
473 "forcemand" is accepted as a shorter form of this mount
474 option.
466 nodfs Disable DFS (global name space support) even if the 475 nodfs Disable DFS (global name space support) even if the
467 server claims to support it. This can help work around 476 server claims to support it. This can help work around
468 a problem with parsing of DFS paths with Samba 3.0.24 server. 477 a problem with parsing of DFS paths with Samba server
478 versions 3.0.24 and 3.0.25.
469 remount remount the share (often used to change from ro to rw mounts 479 remount remount the share (often used to change from ro to rw mounts
470 or vice versa) 480 or vice versa)
471 cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for 481 cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index e1c18362ba4..85c0a74d034 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -122,7 +122,7 @@ static char *compose_mount_options(const char *sb_mountdata,
122 char **devname) 122 char **devname)
123{ 123{
124 int rc; 124 int rc;
125 char *mountdata; 125 char *mountdata = NULL;
126 int md_len; 126 int md_len;
127 char *tkn_e; 127 char *tkn_e;
128 char *srvIP = NULL; 128 char *srvIP = NULL;
@@ -136,10 +136,9 @@ static char *compose_mount_options(const char *sb_mountdata,
136 *devname = cifs_get_share_name(ref->node_name); 136 *devname = cifs_get_share_name(ref->node_name);
137 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 137 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
138 if (rc != 0) { 138 if (rc != 0) {
139 cERROR(1, ("%s: Failed to resolve server part of %s to IP", 139 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
140 __func__, *devname)); 140 __func__, *devname, rc));;
141 mountdata = ERR_PTR(rc); 141 goto compose_mount_options_err;
142 goto compose_mount_options_out;
143 } 142 }
144 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 143 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
145 * assuming that we have 'unc=' and 'ip=' in 144 * assuming that we have 'unc=' and 'ip=' in
@@ -149,8 +148,8 @@ static char *compose_mount_options(const char *sb_mountdata,
149 strlen(ref->node_name) + 12; 148 strlen(ref->node_name) + 12;
150 mountdata = kzalloc(md_len+1, GFP_KERNEL); 149 mountdata = kzalloc(md_len+1, GFP_KERNEL);
151 if (mountdata == NULL) { 150 if (mountdata == NULL) {
152 mountdata = ERR_PTR(-ENOMEM); 151 rc = -ENOMEM;
153 goto compose_mount_options_out; 152 goto compose_mount_options_err;
154 } 153 }
155 154
156 /* copy all options except of unc,ip,prefixpath */ 155 /* copy all options except of unc,ip,prefixpath */
@@ -197,18 +196,32 @@ static char *compose_mount_options(const char *sb_mountdata,
197 196
198 /* find & copy prefixpath */ 197 /* find & copy prefixpath */
199 tkn_e = strchr(ref->node_name + 2, '\\'); 198 tkn_e = strchr(ref->node_name + 2, '\\');
200 if (tkn_e == NULL) /* invalid unc, missing share name*/ 199 if (tkn_e == NULL) {
201 goto compose_mount_options_out; 200 /* invalid unc, missing share name*/
201 rc = -EINVAL;
202 goto compose_mount_options_err;
203 }
202 204
205 /*
206 * this function gives us a path with a double backslash prefix. We
207 * require a single backslash for DFS. Temporarily increment fullpath
208 * to put it in the proper form and decrement before freeing it.
209 */
203 fullpath = build_path_from_dentry(dentry); 210 fullpath = build_path_from_dentry(dentry);
211 if (!fullpath) {
212 rc = -ENOMEM;
213 goto compose_mount_options_err;
214 }
215 ++fullpath;
204 tkn_e = strchr(tkn_e + 1, '\\'); 216 tkn_e = strchr(tkn_e + 1, '\\');
205 if (tkn_e || strlen(fullpath) - (ref->path_consumed)) { 217 if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
206 strncat(mountdata, &sep, 1); 218 strncat(mountdata, &sep, 1);
207 strcat(mountdata, "prefixpath="); 219 strcat(mountdata, "prefixpath=");
208 if (tkn_e) 220 if (tkn_e)
209 strcat(mountdata, tkn_e + 1); 221 strcat(mountdata, tkn_e + 1);
210 strcat(mountdata, fullpath + (ref->path_consumed)); 222 strcat(mountdata, fullpath + ref->path_consumed);
211 } 223 }
224 --fullpath;
212 kfree(fullpath); 225 kfree(fullpath);
213 226
214 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/ 227 /*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
@@ -217,6 +230,11 @@ static char *compose_mount_options(const char *sb_mountdata,
217compose_mount_options_out: 230compose_mount_options_out:
218 kfree(srvIP); 231 kfree(srvIP);
219 return mountdata; 232 return mountdata;
233
234compose_mount_options_err:
235 kfree(mountdata);
236 mountdata = ERR_PTR(rc);
237 goto compose_mount_options_out;
220} 238}
221 239
222 240
@@ -309,13 +327,19 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
309 goto out_err; 327 goto out_err;
310 } 328 }
311 329
330 /*
331 * The MSDFS spec states that paths in DFS referral requests and
332 * responses must be prefixed by a single '\' character instead of
333 * the double backslashes usually used in the UNC. This function
334 * gives us the latter, so we must adjust the result.
335 */
312 full_path = build_path_from_dentry(dentry); 336 full_path = build_path_from_dentry(dentry);
313 if (full_path == NULL) { 337 if (full_path == NULL) {
314 rc = -ENOMEM; 338 rc = -ENOMEM;
315 goto out_err; 339 goto out_err;
316 } 340 }
317 341
318 rc = get_dfs_path(xid, ses , full_path, cifs_sb->local_nls, 342 rc = get_dfs_path(xid, ses , full_path + 1, cifs_sb->local_nls,
319 &num_referrals, &referrals, 343 &num_referrals, &referrals,
320 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 344 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
321 345
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 877c85409f1..c4c306f7b06 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -19,8 +19,8 @@
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current->euid in create etc. */ 22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */ 24#define CIFS_MOUNT_DIRECT_IO 8 /* do not write nor read through page cache */
25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */ 25#define CIFS_MOUNT_NO_XATTR 0x10 /* if set - disable xattr support */
26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */ 26#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x20 /* remap illegal chars in filenames */
@@ -30,7 +30,8 @@
30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */ 30#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */
31#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */ 31#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */
32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */ 32#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
33#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */ 33#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
34#define CIFS_MOUNT_NOPOSIXBRL 0x2000 /* mandatory not posix byte range lock */
34 35
35struct cifs_sb_info { 36struct cifs_sb_info {
36 struct cifsTconInfo *tcon; /* primary mount */ 37 struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 0ab2fb5afef..3fd3a9df043 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -121,11 +121,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
121 121
122 /* add the server address */ 122 /* add the server address */
123 if (server->addr.sockAddr.sin_family == AF_INET) 123 if (server->addr.sockAddr.sin_family == AF_INET)
124 sprintf(dp, "ip4=" NIPQUAD_FMT, 124 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
125 NIPQUAD(server->addr.sockAddr.sin_addr));
126 else if (server->addr.sockAddr.sin_family == AF_INET6) 125 else if (server->addr.sockAddr.sin_family == AF_INET6)
127 sprintf(dp, "ip6=" NIP6_SEQFMT, 126 sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr);
128 NIP6(server->addr.sockAddr6.sin6_addr));
129 else 127 else
130 goto out; 128 goto out;
131 129
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index bd5f13d3845..d4839cf0cb2 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -37,7 +37,7 @@
37 37
38extern void mdfour(unsigned char *out, unsigned char *in, int n); 38extern void mdfour(unsigned char *out, unsigned char *in, int n);
39extern void E_md4hash(const unsigned char *passwd, unsigned char *p16); 39extern void E_md4hash(const unsigned char *passwd, unsigned char *p16);
40extern void SMBencrypt(unsigned char *passwd, unsigned char *c8, 40extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
41 unsigned char *p24); 41 unsigned char *p24);
42 42
43static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 43static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
@@ -280,25 +280,22 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
280} 280}
281 281
282#ifdef CONFIG_CIFS_WEAK_PW_HASH 282#ifdef CONFIG_CIFS_WEAK_PW_HASH
283void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) 283void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
284 char *lnm_session_key)
284{ 285{
285 int i; 286 int i;
286 char password_with_pad[CIFS_ENCPWD_SIZE]; 287 char password_with_pad[CIFS_ENCPWD_SIZE];
287 288
288 if (ses->server == NULL)
289 return;
290
291 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); 289 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
292 if (ses->password) 290 if (password)
293 strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE); 291 strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
294 292
295 if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) 293 if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) {
296 if (extended_security & CIFSSEC_MAY_PLNTXT) { 294 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
297 memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); 295 memcpy(lnm_session_key, password_with_pad,
298 memcpy(lnm_session_key, password_with_pad, 296 CIFS_ENCPWD_SIZE);
299 CIFS_ENCPWD_SIZE); 297 return;
300 return; 298 }
301 }
302 299
303 /* calculate old style session key */ 300 /* calculate old style session key */
304 /* calling toupper is less broken than repeatedly 301 /* calling toupper is less broken than repeatedly
@@ -314,7 +311,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
314 for (i = 0; i < CIFS_ENCPWD_SIZE; i++) 311 for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
315 password_with_pad[i] = toupper(password_with_pad[i]); 312 password_with_pad[i] = toupper(password_with_pad[i]);
316 313
317 SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); 314 SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
315
318 /* clear password before we return/free memory */ 316 /* clear password before we return/free memory */
319 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); 317 memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
320} 318}
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
index 152fa2dcfc6..15d2ec00647 100644
--- a/fs/cifs/cifsencrypt.h
+++ b/fs/cifs/cifsencrypt.h
@@ -26,7 +26,8 @@
26extern void mdfour(unsigned char *out, unsigned char *in, int n); 26extern void mdfour(unsigned char *out, unsigned char *in, int n);
27/* smbdes.c */ 27/* smbdes.c */
28extern void E_P16(unsigned char *p14, unsigned char *p16); 28extern void E_P16(unsigned char *p14, unsigned char *p16);
29extern void E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24); 29extern void E_P24(unsigned char *p21, const unsigned char *c8,
30 unsigned char *p24);
30 31
31 32
32 33
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d9cf467309e..13ea53251dc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -66,7 +66,9 @@ unsigned int sign_CIFS_PDUs = 1;
66extern struct task_struct *oplockThread; /* remove sparse warning */ 66extern struct task_struct *oplockThread; /* remove sparse warning */
67struct task_struct *oplockThread = NULL; 67struct task_struct *oplockThread = NULL;
68/* extern struct task_struct * dnotifyThread; remove sparse warning */ 68/* extern struct task_struct * dnotifyThread; remove sparse warning */
69#ifdef CONFIG_CIFS_EXPERIMENTAL
69static struct task_struct *dnotifyThread = NULL; 70static struct task_struct *dnotifyThread = NULL;
71#endif
70static const struct super_operations cifs_super_ops; 72static const struct super_operations cifs_super_ops;
71unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; 73unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
72module_param(CIFSMaxBufSize, int, 0); 74module_param(CIFSMaxBufSize, int, 0);
@@ -337,39 +339,58 @@ static int
337cifs_show_options(struct seq_file *s, struct vfsmount *m) 339cifs_show_options(struct seq_file *s, struct vfsmount *m)
338{ 340{
339 struct cifs_sb_info *cifs_sb; 341 struct cifs_sb_info *cifs_sb;
342 struct cifsTconInfo *tcon;
343 struct TCP_Server_Info *server;
340 344
341 cifs_sb = CIFS_SB(m->mnt_sb); 345 cifs_sb = CIFS_SB(m->mnt_sb);
342 346
343 if (cifs_sb) { 347 if (cifs_sb) {
344 if (cifs_sb->tcon) { 348 tcon = cifs_sb->tcon;
345/* BB add prepath to mount options displayed */ 349 if (tcon) {
346 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 350 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName);
347 if (cifs_sb->tcon->ses) { 351 if (tcon->ses) {
348 if (cifs_sb->tcon->ses->userName) 352 if (tcon->ses->userName)
349 seq_printf(s, ",username=%s", 353 seq_printf(s, ",username=%s",
350 cifs_sb->tcon->ses->userName); 354 tcon->ses->userName);
351 if (cifs_sb->tcon->ses->domainName) 355 if (tcon->ses->domainName)
352 seq_printf(s, ",domain=%s", 356 seq_printf(s, ",domain=%s",
353 cifs_sb->tcon->ses->domainName); 357 tcon->ses->domainName);
358 server = tcon->ses->server;
359 if (server) {
360 seq_printf(s, ",addr=");
361 switch (server->addr.sockAddr6.
362 sin6_family) {
363 case AF_INET6:
364 seq_printf(s, "%pI6",
365 &server->addr.sockAddr6.sin6_addr);
366 break;
367 case AF_INET:
368 seq_printf(s, "%pI4",
369 &server->addr.sockAddr.sin_addr.s_addr);
370 break;
371 }
372 }
354 } 373 }
355 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) || 374 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) ||
356 !(cifs_sb->tcon->unix_ext)) 375 !(tcon->unix_ext))
357 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); 376 seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
358 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) || 377 if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
359 !(cifs_sb->tcon->unix_ext)) 378 !(tcon->unix_ext))
360 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); 379 seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
361 if (!cifs_sb->tcon->unix_ext) { 380 if (!tcon->unix_ext) {
362 seq_printf(s, ",file_mode=0%o,dir_mode=0%o", 381 seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
363 cifs_sb->mnt_file_mode, 382 cifs_sb->mnt_file_mode,
364 cifs_sb->mnt_dir_mode); 383 cifs_sb->mnt_dir_mode);
365 } 384 }
366 if (cifs_sb->tcon->seal) 385 if (tcon->seal)
367 seq_printf(s, ",seal"); 386 seq_printf(s, ",seal");
368 if (cifs_sb->tcon->nocase) 387 if (tcon->nocase)
369 seq_printf(s, ",nocase"); 388 seq_printf(s, ",nocase");
370 if (cifs_sb->tcon->retry) 389 if (tcon->retry)
371 seq_printf(s, ",hard"); 390 seq_printf(s, ",hard");
372 } 391 }
392 if (cifs_sb->prepath)
393 seq_printf(s, ",prepath=%s", cifs_sb->prepath);
373 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) 394 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
374 seq_printf(s, ",posixpaths"); 395 seq_printf(s, ",posixpaths");
375 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) 396 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
@@ -417,9 +438,8 @@ int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
417 xid = GetXid(); 438 xid = GetXid();
418 if (pTcon) { 439 if (pTcon) {
419 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid)); 440 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
420 } else { 441 } else
421 rc = -EIO; 442 rc = -EIO;
422 }
423 443
424 FreeXid(xid); 444 FreeXid(xid);
425 return rc; 445 return rc;
@@ -441,9 +461,8 @@ int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
441 xid = GetXid(); 461 xid = GetXid();
442 if (pTcon) { 462 if (pTcon) {
443 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid)); 463 cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
444 } else { 464 } else
445 rc = -EIO; 465 rc = -EIO;
446 }
447 466
448 FreeXid(xid); 467 FreeXid(xid);
449 return rc; 468 return rc;
@@ -464,9 +483,8 @@ int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
464 xid = GetXid(); 483 xid = GetXid();
465 if (pTcon) { 484 if (pTcon) {
466 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation)); 485 cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
467 } else { 486 } else
468 rc = -EIO; 487 rc = -EIO;
469 }
470 488
471 FreeXid(xid); 489 FreeXid(xid);
472 return rc; 490 return rc;
@@ -479,17 +497,16 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
479 struct cifs_sb_info *cifs_sb = CIFS_SB(sb); 497 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
480 struct cifsTconInfo *pTcon; 498 struct cifsTconInfo *pTcon;
481 499
482 if (cifs_sb) { 500 if (cifs_sb)
483 pTcon = cifs_sb->tcon; 501 pTcon = cifs_sb->tcon;
484 } else { 502 else
485 return -EIO; 503 return -EIO;
486 } 504
487 xid = GetXid(); 505 xid = GetXid();
488 if (pTcon) { 506 if (pTcon) {
489 cFYI(1, ("pqstats %p", qstats)); 507 cFYI(1, ("pqstats %p", qstats));
490 } else { 508 } else
491 rc = -EIO; 509 rc = -EIO;
492 }
493 510
494 FreeXid(xid); 511 FreeXid(xid);
495 return rc; 512 return rc;
@@ -730,7 +747,6 @@ const struct file_operations cifs_file_ops = {
730#endif /* CONFIG_CIFS_POSIX */ 747#endif /* CONFIG_CIFS_POSIX */
731 748
732#ifdef CONFIG_CIFS_EXPERIMENTAL 749#ifdef CONFIG_CIFS_EXPERIMENTAL
733 .dir_notify = cifs_dir_notify,
734 .setlease = cifs_setlease, 750 .setlease = cifs_setlease,
735#endif /* CONFIG_CIFS_EXPERIMENTAL */ 751#endif /* CONFIG_CIFS_EXPERIMENTAL */
736}; 752};
@@ -751,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
751#endif /* CONFIG_CIFS_POSIX */ 767#endif /* CONFIG_CIFS_POSIX */
752 .llseek = cifs_llseek, 768 .llseek = cifs_llseek,
753#ifdef CONFIG_CIFS_EXPERIMENTAL 769#ifdef CONFIG_CIFS_EXPERIMENTAL
754 .dir_notify = cifs_dir_notify,
755 .setlease = cifs_setlease, 770 .setlease = cifs_setlease,
756#endif /* CONFIG_CIFS_EXPERIMENTAL */ 771#endif /* CONFIG_CIFS_EXPERIMENTAL */
757}; 772};
@@ -772,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
772#endif /* CONFIG_CIFS_POSIX */ 787#endif /* CONFIG_CIFS_POSIX */
773 788
774#ifdef CONFIG_CIFS_EXPERIMENTAL 789#ifdef CONFIG_CIFS_EXPERIMENTAL
775 .dir_notify = cifs_dir_notify,
776 .setlease = cifs_setlease, 790 .setlease = cifs_setlease,
777#endif /* CONFIG_CIFS_EXPERIMENTAL */ 791#endif /* CONFIG_CIFS_EXPERIMENTAL */
778}; 792};
@@ -792,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
792#endif /* CONFIG_CIFS_POSIX */ 806#endif /* CONFIG_CIFS_POSIX */
793 .llseek = cifs_llseek, 807 .llseek = cifs_llseek,
794#ifdef CONFIG_CIFS_EXPERIMENTAL 808#ifdef CONFIG_CIFS_EXPERIMENTAL
795 .dir_notify = cifs_dir_notify,
796 .setlease = cifs_setlease, 809 .setlease = cifs_setlease,
797#endif /* CONFIG_CIFS_EXPERIMENTAL */ 810#endif /* CONFIG_CIFS_EXPERIMENTAL */
798}; 811};
@@ -801,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
801 .readdir = cifs_readdir, 814 .readdir = cifs_readdir,
802 .release = cifs_closedir, 815 .release = cifs_closedir,
803 .read = generic_read_dir, 816 .read = generic_read_dir,
804#ifdef CONFIG_CIFS_EXPERIMENTAL
805 .dir_notify = cifs_dir_notify,
806#endif /* CONFIG_CIFS_EXPERIMENTAL */
807 .unlocked_ioctl = cifs_ioctl, 817 .unlocked_ioctl = cifs_ioctl,
808 .llseek = generic_file_llseek, 818 .llseek = generic_file_llseek,
809}; 819};
@@ -1029,6 +1039,7 @@ static int cifs_oplock_thread(void *dummyarg)
1029 return 0; 1039 return 0;
1030} 1040}
1031 1041
1042#ifdef CONFIG_CIFS_EXPERIMENTAL
1032static int cifs_dnotify_thread(void *dummyarg) 1043static int cifs_dnotify_thread(void *dummyarg)
1033{ 1044{
1034 struct list_head *tmp; 1045 struct list_head *tmp;
@@ -1054,6 +1065,7 @@ static int cifs_dnotify_thread(void *dummyarg)
1054 1065
1055 return 0; 1066 return 0;
1056} 1067}
1068#endif
1057 1069
1058static int __init 1070static int __init
1059init_cifs(void) 1071init_cifs(void)
@@ -1131,16 +1143,20 @@ init_cifs(void)
1131 goto out_unregister_dfs_key_type; 1143 goto out_unregister_dfs_key_type;
1132 } 1144 }
1133 1145
1146#ifdef CONFIG_CIFS_EXPERIMENTAL
1134 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); 1147 dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd");
1135 if (IS_ERR(dnotifyThread)) { 1148 if (IS_ERR(dnotifyThread)) {
1136 rc = PTR_ERR(dnotifyThread); 1149 rc = PTR_ERR(dnotifyThread);
1137 cERROR(1, ("error %d create dnotify thread", rc)); 1150 cERROR(1, ("error %d create dnotify thread", rc));
1138 goto out_stop_oplock_thread; 1151 goto out_stop_oplock_thread;
1139 } 1152 }
1153#endif
1140 1154
1141 return 0; 1155 return 0;
1142 1156
1157#ifdef CONFIG_CIFS_EXPERIMENTAL
1143 out_stop_oplock_thread: 1158 out_stop_oplock_thread:
1159#endif
1144 kthread_stop(oplockThread); 1160 kthread_stop(oplockThread);
1145 out_unregister_dfs_key_type: 1161 out_unregister_dfs_key_type:
1146#ifdef CONFIG_CIFS_DFS_UPCALL 1162#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -1179,8 +1195,10 @@ exit_cifs(void)
1179 cifs_destroy_inodecache(); 1195 cifs_destroy_inodecache();
1180 cifs_destroy_mids(); 1196 cifs_destroy_mids();
1181 cifs_destroy_request_bufs(); 1197 cifs_destroy_request_bufs();
1182 kthread_stop(oplockThread); 1198#ifdef CONFIG_CIFS_EXPERIMENTAL
1183 kthread_stop(dnotifyThread); 1199 kthread_stop(dnotifyThread);
1200#endif
1201 kthread_stop(oplockThread);
1184} 1202}
1185 1203
1186MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); 1204MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 074de0b5064..7ac481841f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
76extern const struct file_operations cifs_dir_ops; 76extern const struct file_operations cifs_dir_ops;
77extern int cifs_dir_open(struct inode *inode, struct file *file); 77extern int cifs_dir_open(struct inode *inode, struct file *file);
78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
79extern int cifs_dir_notify(struct file *, unsigned long arg);
80 79
81/* Functions related to dir entries */ 80/* Functions related to dir entries */
82extern struct dentry_operations cifs_dentry_ops; 81extern struct dentry_operations cifs_dentry_ops;
@@ -101,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
101extern const struct export_operations cifs_export_ops; 100extern const struct export_operations cifs_export_ops;
102#endif /* EXPERIMENTAL */ 101#endif /* EXPERIMENTAL */
103 102
104#define CIFS_VERSION "1.55" 103#define CIFS_VERSION "1.56"
105#endif /* _CIFSFS_H */ 104#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c57c0565547..94c1ca0ec95 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -47,7 +47,11 @@
47 */ 47 */
48#define CIFS_MAX_REQ 50 48#define CIFS_MAX_REQ 50
49 49
50#define SERVER_NAME_LENGTH 15 50#define RFC1001_NAME_LEN 15
51#define RFC1001_NAME_LEN_WITH_NULL (RFC1001_NAME_LEN + 1)
52
53/* currently length of NIP6_FMT */
54#define SERVER_NAME_LENGTH 40
51#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) 55#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
52 56
53/* used to define string lengths for reversing unicode strings */ 57/* used to define string lengths for reversing unicode strings */
@@ -125,8 +129,7 @@ struct TCP_Server_Info {
125 struct list_head smb_ses_list; 129 struct list_head smb_ses_list;
126 int srv_count; /* reference counter */ 130 int srv_count; /* reference counter */
127 /* 15 character server name + 0x20 16th byte indicating type = srv */ 131 /* 15 character server name + 0x20 16th byte indicating type = srv */
128 char server_RFC1001_name[SERVER_NAME_LEN_WITH_NULL]; 132 char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
129 char unicode_server_Name[SERVER_NAME_LEN_WITH_NULL * 2];
130 char *hostname; /* hostname portion of UNC string */ 133 char *hostname; /* hostname portion of UNC string */
131 struct socket *ssocket; 134 struct socket *ssocket;
132 union { 135 union {
@@ -151,7 +154,7 @@ struct TCP_Server_Info {
151 atomic_t num_waiters; /* blocked waiting to get in sendrecv */ 154 atomic_t num_waiters; /* blocked waiting to get in sendrecv */
152#endif 155#endif
153 enum statusEnum tcpStatus; /* what we think the status is */ 156 enum statusEnum tcpStatus; /* what we think the status is */
154 struct semaphore tcpSem; 157 struct mutex srv_mutex;
155 struct task_struct *tsk; 158 struct task_struct *tsk;
156 char server_GUID[16]; 159 char server_GUID[16];
157 char secMode; 160 char secMode;
@@ -171,7 +174,7 @@ struct TCP_Server_Info {
171 __u16 CurrentMid; /* multiplex id - rotating counter */ 174 __u16 CurrentMid; /* multiplex id - rotating counter */
172 char cryptKey[CIFS_CRYPTO_KEY_SIZE]; 175 char cryptKey[CIFS_CRYPTO_KEY_SIZE];
173 /* 16th byte of RFC1001 workstation name is always null */ 176 /* 16th byte of RFC1001 workstation name is always null */
174 char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL]; 177 char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
175 __u32 sequence_number; /* needed for CIFS PDU signature */ 178 __u32 sequence_number; /* needed for CIFS PDU signature */
176 struct mac_key mac_signing_key; 179 struct mac_key mac_signing_key;
177 char ntlmv2_hash[16]; 180 char ntlmv2_hash[16];
@@ -239,6 +242,7 @@ struct cifsTconInfo {
239 struct cifsSesInfo *ses; /* pointer to session associated with */ 242 struct cifsSesInfo *ses; /* pointer to session associated with */
240 char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ 243 char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */
241 char *nativeFileSystem; 244 char *nativeFileSystem;
245 char *password; /* for share-level security */
242 __u16 tid; /* The 2 byte tree id */ 246 __u16 tid; /* The 2 byte tree id */
243 __u16 Flags; /* optional support bits */ 247 __u16 Flags; /* optional support bits */
244 enum statusEnum tidStatus; 248 enum statusEnum tidStatus;
@@ -422,7 +426,6 @@ struct mid_q_entry {
422 unsigned long when_sent; /* time when smb send finished */ 426 unsigned long when_sent; /* time when smb send finished */
423 unsigned long when_received; /* when demux complete (taken off wire) */ 427 unsigned long when_received; /* when demux complete (taken off wire) */
424#endif 428#endif
425 struct cifsSesInfo *ses; /* smb was sent to this server */
426 struct task_struct *tsk; /* task waiting for response */ 429 struct task_struct *tsk; /* task waiting for response */
427 struct smb_hdr *resp_buf; /* response buffer */ 430 struct smb_hdr *resp_buf; /* response buffer */
428 int midState; /* wish this were enum but can not pass to wait_event */ 431 int midState; /* wish this were enum but can not pass to wait_event */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index d2a073edd1b..b4e2e9f0ee3 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1922,7 +1922,7 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
1922/* DFS server target type */ 1922/* DFS server target type */
1923#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */ 1923#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */
1924#define DFS_TYPE_ROOT 0x0001 1924#define DFS_TYPE_ROOT 0x0001
1925 1925
1926/* Referral Entry Flags */ 1926/* Referral Entry Flags */
1927#define DFS_NAME_LIST_REF 0x0200 1927#define DFS_NAME_LIST_REF 0x0200
1928 1928
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6f21ecb85ce..06f6779988b 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,7 +39,7 @@ extern int smb_send(struct socket *, struct smb_hdr *,
39 unsigned int /* length */ , struct sockaddr *, bool); 39 unsigned int /* length */ , struct sockaddr *, bool);
40extern unsigned int _GetXid(void); 40extern unsigned int _GetXid(void);
41extern void _FreeXid(unsigned int); 41extern void _FreeXid(unsigned int);
42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current->fsuid)); 42#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} 43#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
44extern char *build_path_from_dentry(struct dentry *); 44extern char *build_path_from_dentry(struct dentry *);
45extern char *build_wildcard_path_from_dentry(struct dentry *direntry); 45extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
@@ -330,7 +330,8 @@ extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
330extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, 330extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
331 const struct nls_table *); 331 const struct nls_table *);
332#ifdef CONFIG_CIFS_WEAK_PW_HASH 332#ifdef CONFIG_CIFS_WEAK_PW_HASH
333extern void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key); 333extern void calc_lanman_hash(const char *password, const char *cryptkey,
334 bool encrypt, char *lnm_session_key);
334#endif /* CIFS_WEAK_PW_HASH */ 335#endif /* CIFS_WEAK_PW_HASH */
335extern int CIFSSMBCopy(int xid, 336extern int CIFSSMBCopy(int xid,
336 struct cifsTconInfo *source_tcon, 337 struct cifsTconInfo *source_tcon,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 2af8626ced4..552642a507c 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1382,13 +1382,13 @@ openRetry:
1382 if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction) 1382 if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
1383 *pOplock |= CIFS_CREATE_ACTION; 1383 *pOplock |= CIFS_CREATE_ACTION;
1384 if (pfile_info) { 1384 if (pfile_info) {
1385 memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime, 1385 memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime,
1386 36 /* CreationTime to Attributes */); 1386 36 /* CreationTime to Attributes */);
1387 /* the file_info buf is endian converted by caller */ 1387 /* the file_info buf is endian converted by caller */
1388 pfile_info->AllocationSize = pSMBr->AllocationSize; 1388 pfile_info->AllocationSize = pSMBr->AllocationSize;
1389 pfile_info->EndOfFile = pSMBr->EndOfFile; 1389 pfile_info->EndOfFile = pSMBr->EndOfFile;
1390 pfile_info->NumberOfLinks = cpu_to_le32(1); 1390 pfile_info->NumberOfLinks = cpu_to_le32(1);
1391 pfile_info->DeletePending = 0; 1391 pfile_info->DeletePending = 0;
1392 } 1392 }
1393 } 1393 }
1394 1394
@@ -1414,8 +1414,13 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1414 cFYI(1, ("Reading %d bytes on fid %d", count, netfid)); 1414 cFYI(1, ("Reading %d bytes on fid %d", count, netfid));
1415 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1415 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1416 wct = 12; 1416 wct = 12;
1417 else 1417 else {
1418 wct = 10; /* old style read */ 1418 wct = 10; /* old style read */
1419 if ((lseek >> 32) > 0) {
1420 /* can not handle this big offset for old */
1421 return -EIO;
1422 }
1423 }
1419 1424
1420 *nbytes = 0; 1425 *nbytes = 0;
1421 rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB); 1426 rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **) &pSMB);
@@ -1431,8 +1436,6 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
1431 pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF); 1436 pSMB->OffsetLow = cpu_to_le32(lseek & 0xFFFFFFFF);
1432 if (wct == 12) 1437 if (wct == 12)
1433 pSMB->OffsetHigh = cpu_to_le32(lseek >> 32); 1438 pSMB->OffsetHigh = cpu_to_le32(lseek >> 32);
1434 else if ((lseek >> 32) > 0) /* can not handle this big offset for old */
1435 return -EIO;
1436 1439
1437 pSMB->Remaining = 0; 1440 pSMB->Remaining = 0;
1438 pSMB->MaxCount = cpu_to_le16(count & 0xFFFF); 1441 pSMB->MaxCount = cpu_to_le16(count & 0xFFFF);
@@ -1519,8 +1522,13 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1519 1522
1520 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1523 if (tcon->ses->capabilities & CAP_LARGE_FILES)
1521 wct = 14; 1524 wct = 14;
1522 else 1525 else {
1523 wct = 12; 1526 wct = 12;
1527 if ((offset >> 32) > 0) {
1528 /* can not handle big offset for old srv */
1529 return -EIO;
1530 }
1531 }
1524 1532
1525 rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB, 1533 rc = smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB,
1526 (void **) &pSMBr); 1534 (void **) &pSMBr);
@@ -1535,8 +1543,6 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1535 pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); 1543 pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
1536 if (wct == 14) 1544 if (wct == 14)
1537 pSMB->OffsetHigh = cpu_to_le32(offset >> 32); 1545 pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
1538 else if ((offset >> 32) > 0) /* can not handle big offset for old srv */
1539 return -EIO;
1540 1546
1541 pSMB->Reserved = 0xFFFFFFFF; 1547 pSMB->Reserved = 0xFFFFFFFF;
1542 pSMB->WriteMode = 0; 1548 pSMB->WriteMode = 0;
@@ -1558,7 +1564,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
1558 pSMB->DataOffset = 1564 pSMB->DataOffset =
1559 cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4); 1565 cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
1560 if (buf) 1566 if (buf)
1561 memcpy(pSMB->Data, buf, bytes_sent); 1567 memcpy(pSMB->Data, buf, bytes_sent);
1562 else if (ubuf) { 1568 else if (ubuf) {
1563 if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) { 1569 if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) {
1564 cifs_buf_release(pSMB); 1570 cifs_buf_release(pSMB);
@@ -1621,10 +1627,15 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1621 1627
1622 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count)); 1628 cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
1623 1629
1624 if (tcon->ses->capabilities & CAP_LARGE_FILES) 1630 if (tcon->ses->capabilities & CAP_LARGE_FILES) {
1625 wct = 14; 1631 wct = 14;
1626 else 1632 } else {
1627 wct = 12; 1633 wct = 12;
1634 if ((offset >> 32) > 0) {
1635 /* can not handle big offset for old srv */
1636 return -EIO;
1637 }
1638 }
1628 rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB); 1639 rc = small_smb_init(SMB_COM_WRITE_ANDX, wct, tcon, (void **) &pSMB);
1629 if (rc) 1640 if (rc)
1630 return rc; 1641 return rc;
@@ -1637,8 +1648,6 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
1637 pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF); 1648 pSMB->OffsetLow = cpu_to_le32(offset & 0xFFFFFFFF);
1638 if (wct == 14) 1649 if (wct == 14)
1639 pSMB->OffsetHigh = cpu_to_le32(offset >> 32); 1650 pSMB->OffsetHigh = cpu_to_le32(offset >> 32);
1640 else if ((offset >> 32) > 0) /* can not handle big offset for old srv */
1641 return -EIO;
1642 pSMB->Reserved = 0xFFFFFFFF; 1651 pSMB->Reserved = 0xFFFFFFFF;
1643 pSMB->WriteMode = 0; 1652 pSMB->WriteMode = 0;
1644 pSMB->Remaining = 0; 1653 pSMB->Remaining = 0;
@@ -1862,10 +1871,6 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1862 rc = -EIO; /* bad smb */ 1871 rc = -EIO; /* bad smb */
1863 goto plk_err_exit; 1872 goto plk_err_exit;
1864 } 1873 }
1865 if (pLockData == NULL) {
1866 rc = -EINVAL;
1867 goto plk_err_exit;
1868 }
1869 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); 1874 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
1870 data_count = le16_to_cpu(pSMBr->t2.DataCount); 1875 data_count = le16_to_cpu(pSMBr->t2.DataCount);
1871 if (data_count < sizeof(struct cifs_posix_lock)) { 1876 if (data_count < sizeof(struct cifs_posix_lock)) {
@@ -3983,7 +3988,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3983 3988
3984 node->flags = le16_to_cpu(pSMBr->DFSFlags); 3989 node->flags = le16_to_cpu(pSMBr->DFSFlags);
3985 if (is_unicode) { 3990 if (is_unicode) {
3986 __le16 *tmp = kmalloc(strlen(searchName)*2, GFP_KERNEL); 3991 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
3992 GFP_KERNEL);
3987 cifsConvertToUCS((__le16 *) tmp, searchName, 3993 cifsConvertToUCS((__le16 *) tmp, searchName,
3988 PATH_MAX, nls_codepage, remap); 3994 PATH_MAX, nls_codepage, remap);
3989 node->path_consumed = hostlen_fromUCS(tmp, 3995 node->path_consumed = hostlen_fromUCS(tmp,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c7d34171458..e9ea394ee07 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -89,6 +89,7 @@ struct smb_vol {
89 bool nullauth:1; /* attempt to authenticate with null user */ 89 bool nullauth:1; /* attempt to authenticate with null user */
90 bool nocase:1; /* request case insensitive filenames */ 90 bool nocase:1; /* request case insensitive filenames */
91 bool nobrl:1; /* disable sending byte range locks to srv */ 91 bool nobrl:1; /* disable sending byte range locks to srv */
92 bool mand_lock:1; /* send mandatory not posix byte range lock reqs */
92 bool seal:1; /* request transport encryption on share */ 93 bool seal:1; /* request transport encryption on share */
93 bool nodfs:1; /* Do not request DFS, even if available */ 94 bool nodfs:1; /* Do not request DFS, even if available */
94 bool local_lease:1; /* check leases only on local system, not remote */ 95 bool local_lease:1; /* check leases only on local system, not remote */
@@ -101,25 +102,17 @@ struct smb_vol {
101 char *prepath; 102 char *prepath;
102}; 103};
103 104
104static int ipv4_connect(struct sockaddr_in *psin_server, 105static int ipv4_connect(struct TCP_Server_Info *server);
105 struct socket **csocket, 106static int ipv6_connect(struct TCP_Server_Info *server);
106 char *netb_name,
107 char *server_netb_name,
108 bool noblocksnd,
109 bool nosndbuf); /* ipv6 never set sndbuf size */
110static int ipv6_connect(struct sockaddr_in6 *psin_server,
111 struct socket **csocket, bool noblocksnd);
112
113
114 /*
115 * cifs tcp session reconnection
116 *
117 * mark tcp session as reconnecting so temporarily locked
118 * mark all smb sessions as reconnecting for tcp session
119 * reconnect tcp session
120 * wake up waiters on reconnection? - (not needed currently)
121 */
122 107
108/*
109 * cifs tcp session reconnection
110 *
111 * mark tcp session as reconnecting so temporarily locked
112 * mark all smb sessions as reconnecting for tcp session
113 * reconnect tcp session
114 * wake up waiters on reconnection? - (not needed currently)
115 */
123static int 116static int
124cifs_reconnect(struct TCP_Server_Info *server) 117cifs_reconnect(struct TCP_Server_Info *server)
125{ 118{
@@ -156,7 +149,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
156 } 149 }
157 read_unlock(&cifs_tcp_ses_lock); 150 read_unlock(&cifs_tcp_ses_lock);
158 /* do not want to be sending data on a socket we are freeing */ 151 /* do not want to be sending data on a socket we are freeing */
159 down(&server->tcpSem); 152 mutex_lock(&server->srv_mutex);
160 if (server->ssocket) { 153 if (server->ssocket) {
161 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state, 154 cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
162 server->ssocket->flags)); 155 server->ssocket->flags));
@@ -182,21 +175,15 @@ cifs_reconnect(struct TCP_Server_Info *server)
182 } 175 }
183 } 176 }
184 spin_unlock(&GlobalMid_Lock); 177 spin_unlock(&GlobalMid_Lock);
185 up(&server->tcpSem); 178 mutex_unlock(&server->srv_mutex);
186 179
187 while ((server->tcpStatus != CifsExiting) && 180 while ((server->tcpStatus != CifsExiting) &&
188 (server->tcpStatus != CifsGood)) { 181 (server->tcpStatus != CifsGood)) {
189 try_to_freeze(); 182 try_to_freeze();
190 if (server->addr.sockAddr6.sin6_family == AF_INET6) { 183 if (server->addr.sockAddr6.sin6_family == AF_INET6)
191 rc = ipv6_connect(&server->addr.sockAddr6, 184 rc = ipv6_connect(server);
192 &server->ssocket, server->noautotune); 185 else
193 } else { 186 rc = ipv4_connect(server);
194 rc = ipv4_connect(&server->addr.sockAddr,
195 &server->ssocket,
196 server->workstation_RFC1001_name,
197 server->server_RFC1001_name,
198 server->noblocksnd, server->noautotune);
199 }
200 if (rc) { 187 if (rc) {
201 cFYI(1, ("reconnect error %d", rc)); 188 cFYI(1, ("reconnect error %d", rc));
202 msleep(3000); 189 msleep(3000);
@@ -776,7 +763,7 @@ multi_t2_fnd:
776 set_current_state(TASK_RUNNING); 763 set_current_state(TASK_RUNNING);
777 } 764 }
778 765
779 return 0; 766 module_put_and_exit(0);
780} 767}
781 768
782/* extract the host portion of the UNC string */ 769/* extract the host portion of the UNC string */
@@ -836,8 +823,8 @@ cifs_parse_mount_options(char *options, const char *devname,
836 /* null target name indicates to use *SMBSERVR default called name 823 /* null target name indicates to use *SMBSERVR default called name
837 if we end up sending RFC1001 session initialize */ 824 if we end up sending RFC1001 session initialize */
838 vol->target_rfc1001_name[0] = 0; 825 vol->target_rfc1001_name[0] = 0;
839 vol->linux_uid = current->uid; /* current->euid instead? */ 826 vol->linux_uid = current_uid(); /* use current_euid() instead? */
840 vol->linux_gid = current->gid; 827 vol->linux_gid = current_gid();
841 vol->dir_mode = S_IRWXUGO; 828 vol->dir_mode = S_IRWXUGO;
842 /* 2767 perms indicate mandatory locking support */ 829 /* 2767 perms indicate mandatory locking support */
843 vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); 830 vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
@@ -1260,6 +1247,17 @@ cifs_parse_mount_options(char *options, const char *devname,
1260 if (vol->file_mode == 1247 if (vol->file_mode ==
1261 (S_IALLUGO & ~(S_ISUID | S_IXGRP))) 1248 (S_IALLUGO & ~(S_ISUID | S_IXGRP)))
1262 vol->file_mode = S_IALLUGO; 1249 vol->file_mode = S_IALLUGO;
1250 } else if (strnicmp(data, "forcemandatorylock", 9) == 0) {
1251 /* will take the shorter form "forcemand" as well */
1252 /* This mount option will force use of mandatory
1253 (DOS/Windows style) byte range locks, instead of
1254 using posix advisory byte range locks, even if the
1255 Unix extensions are available and posix locks would
1256 be supported otherwise. If Unix extensions are not
1257 negotiated this has no effect since mandatory locks
1258 would be used (mandatory locks is all that those
1259 those servers support) */
1260 vol->mand_lock = 1;
1263 } else if (strnicmp(data, "setuids", 7) == 0) { 1261 } else if (strnicmp(data, "setuids", 7) == 0) {
1264 vol->setuids = 1; 1262 vol->setuids = 1;
1265 } else if (strnicmp(data, "nosetuids", 9) == 0) { 1263 } else if (strnicmp(data, "nosetuids", 9) == 0) {
@@ -1417,6 +1415,143 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
1417 force_sig(SIGKILL, task); 1415 force_sig(SIGKILL, task);
1418} 1416}
1419 1417
1418static struct TCP_Server_Info *
1419cifs_get_tcp_session(struct smb_vol *volume_info)
1420{
1421 struct TCP_Server_Info *tcp_ses = NULL;
1422 struct sockaddr addr;
1423 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
1424 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
1425 int rc;
1426
1427 memset(&addr, 0, sizeof(struct sockaddr));
1428
1429 if (volume_info->UNCip && volume_info->UNC) {
1430 rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
1431 &sin_server->sin_addr.s_addr);
1432
1433 if (rc <= 0) {
1434 /* not ipv4 address, try ipv6 */
1435 rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
1436 &sin_server6->sin6_addr.in6_u);
1437 if (rc > 0)
1438 addr.sa_family = AF_INET6;
1439 } else {
1440 addr.sa_family = AF_INET;
1441 }
1442
1443 if (rc <= 0) {
1444 /* we failed translating address */
1445 rc = -EINVAL;
1446 goto out_err;
1447 }
1448
1449 cFYI(1, ("UNC: %s ip: %s", volume_info->UNC,
1450 volume_info->UNCip));
1451 } else if (volume_info->UNCip) {
1452 /* BB using ip addr as tcp_ses name to connect to the
1453 DFS root below */
1454 cERROR(1, ("Connecting to DFS root not implemented yet"));
1455 rc = -EINVAL;
1456 goto out_err;
1457 } else /* which tcp_sess DFS root would we conect to */ {
1458 cERROR(1,
1459 ("CIFS mount error: No UNC path (e.g. -o "
1460 "unc=//192.168.1.100/public) specified"));
1461 rc = -EINVAL;
1462 goto out_err;
1463 }
1464
1465 /* see if we already have a matching tcp_ses */
1466 tcp_ses = cifs_find_tcp_session(&addr);
1467 if (tcp_ses)
1468 return tcp_ses;
1469
1470 tcp_ses = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL);
1471 if (!tcp_ses) {
1472 rc = -ENOMEM;
1473 goto out_err;
1474 }
1475
1476 tcp_ses->hostname = extract_hostname(volume_info->UNC);
1477 if (IS_ERR(tcp_ses->hostname)) {
1478 rc = PTR_ERR(tcp_ses->hostname);
1479 goto out_err;
1480 }
1481
1482 tcp_ses->noblocksnd = volume_info->noblocksnd;
1483 tcp_ses->noautotune = volume_info->noautotune;
1484 atomic_set(&tcp_ses->inFlight, 0);
1485 init_waitqueue_head(&tcp_ses->response_q);
1486 init_waitqueue_head(&tcp_ses->request_q);
1487 INIT_LIST_HEAD(&tcp_ses->pending_mid_q);
1488 mutex_init(&tcp_ses->srv_mutex);
1489 memcpy(tcp_ses->workstation_RFC1001_name,
1490 volume_info->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1491 memcpy(tcp_ses->server_RFC1001_name,
1492 volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
1493 tcp_ses->sequence_number = 0;
1494 INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
1495 INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
1496
1497 /*
1498 * at this point we are the only ones with the pointer
1499 * to the struct since the kernel thread not created yet
1500 * no need to spinlock this init of tcpStatus or srv_count
1501 */
1502 tcp_ses->tcpStatus = CifsNew;
1503 ++tcp_ses->srv_count;
1504
1505 if (addr.sa_family == AF_INET6) {
1506 cFYI(1, ("attempting ipv6 connect"));
1507 /* BB should we allow ipv6 on port 139? */
1508 /* other OS never observed in Wild doing 139 with v6 */
1509 memcpy(&tcp_ses->addr.sockAddr6, sin_server6,
1510 sizeof(struct sockaddr_in6));
1511 sin_server6->sin6_port = htons(volume_info->port);
1512 rc = ipv6_connect(tcp_ses);
1513 } else {
1514 memcpy(&tcp_ses->addr.sockAddr, sin_server,
1515 sizeof(struct sockaddr_in));
1516 sin_server->sin_port = htons(volume_info->port);
1517 rc = ipv4_connect(tcp_ses);
1518 }
1519 if (rc < 0) {
1520 cERROR(1, ("Error connecting to socket. Aborting operation"));
1521 goto out_err;
1522 }
1523
1524 /*
1525 * since we're in a cifs function already, we know that
1526 * this will succeed. No need for try_module_get().
1527 */
1528 __module_get(THIS_MODULE);
1529 tcp_ses->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread,
1530 tcp_ses, "cifsd");
1531 if (IS_ERR(tcp_ses->tsk)) {
1532 rc = PTR_ERR(tcp_ses->tsk);
1533 cERROR(1, ("error %d create cifsd thread", rc));
1534 module_put(THIS_MODULE);
1535 goto out_err;
1536 }
1537
1538 /* thread spawned, put it on the list */
1539 write_lock(&cifs_tcp_ses_lock);
1540 list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list);
1541 write_unlock(&cifs_tcp_ses_lock);
1542
1543 return tcp_ses;
1544
1545out_err:
1546 if (tcp_ses) {
1547 kfree(tcp_ses->hostname);
1548 if (tcp_ses->ssocket)
1549 sock_release(tcp_ses->ssocket);
1550 kfree(tcp_ses);
1551 }
1552 return ERR_PTR(rc);
1553}
1554
1420static struct cifsSesInfo * 1555static struct cifsSesInfo *
1421cifs_find_smb_ses(struct TCP_Server_Info *server, char *username) 1556cifs_find_smb_ses(struct TCP_Server_Info *server, char *username)
1422{ 1557{
@@ -1593,93 +1728,96 @@ static void rfc1002mangle(char *target, char *source, unsigned int length)
1593 1728
1594 1729
1595static int 1730static int
1596ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, 1731ipv4_connect(struct TCP_Server_Info *server)
1597 char *netbios_name, char *target_name,
1598 bool noblocksnd, bool noautotune)
1599{ 1732{
1600 int rc = 0; 1733 int rc = 0;
1601 int connected = 0; 1734 bool connected = false;
1602 __be16 orig_port = 0; 1735 __be16 orig_port = 0;
1736 struct socket *socket = server->ssocket;
1603 1737
1604 if (*csocket == NULL) { 1738 if (socket == NULL) {
1605 rc = sock_create_kern(PF_INET, SOCK_STREAM, 1739 rc = sock_create_kern(PF_INET, SOCK_STREAM,
1606 IPPROTO_TCP, csocket); 1740 IPPROTO_TCP, &socket);
1607 if (rc < 0) { 1741 if (rc < 0) {
1608 cERROR(1, ("Error %d creating socket", rc)); 1742 cERROR(1, ("Error %d creating socket", rc));
1609 *csocket = NULL;
1610 return rc; 1743 return rc;
1611 } else {
1612 /* BB other socket options to set KEEPALIVE, NODELAY? */
1613 cFYI(1, ("Socket created"));
1614 (*csocket)->sk->sk_allocation = GFP_NOFS;
1615 cifs_reclassify_socket4(*csocket);
1616 } 1744 }
1745
1746 /* BB other socket options to set KEEPALIVE, NODELAY? */
1747 cFYI(1, ("Socket created"));
1748 server->ssocket = socket;
1749 socket->sk->sk_allocation = GFP_NOFS;
1750 cifs_reclassify_socket4(socket);
1617 } 1751 }
1618 1752
1619 psin_server->sin_family = AF_INET; 1753 /* user overrode default port */
1620 if (psin_server->sin_port) { /* user overrode default port */ 1754 if (server->addr.sockAddr.sin_port) {
1621 rc = (*csocket)->ops->connect(*csocket, 1755 rc = socket->ops->connect(socket, (struct sockaddr *)
1622 (struct sockaddr *) psin_server, 1756 &server->addr.sockAddr,
1623 sizeof(struct sockaddr_in), 0); 1757 sizeof(struct sockaddr_in), 0);
1624 if (rc >= 0) 1758 if (rc >= 0)
1625 connected = 1; 1759 connected = true;
1626 } 1760 }
1627 1761
1628 if (!connected) { 1762 if (!connected) {
1629 /* save original port so we can retry user specified port 1763 /* save original port so we can retry user specified port
1630 later if fall back ports fail this time */ 1764 later if fall back ports fail this time */
1631 orig_port = psin_server->sin_port; 1765 orig_port = server->addr.sockAddr.sin_port;
1632 1766
1633 /* do not retry on the same port we just failed on */ 1767 /* do not retry on the same port we just failed on */
1634 if (psin_server->sin_port != htons(CIFS_PORT)) { 1768 if (server->addr.sockAddr.sin_port != htons(CIFS_PORT)) {
1635 psin_server->sin_port = htons(CIFS_PORT); 1769 server->addr.sockAddr.sin_port = htons(CIFS_PORT);
1636 1770 rc = socket->ops->connect(socket,
1637 rc = (*csocket)->ops->connect(*csocket, 1771 (struct sockaddr *)
1638 (struct sockaddr *) psin_server, 1772 &server->addr.sockAddr,
1639 sizeof(struct sockaddr_in), 0); 1773 sizeof(struct sockaddr_in), 0);
1640 if (rc >= 0) 1774 if (rc >= 0)
1641 connected = 1; 1775 connected = true;
1642 } 1776 }
1643 } 1777 }
1644 if (!connected) { 1778 if (!connected) {
1645 psin_server->sin_port = htons(RFC1001_PORT); 1779 server->addr.sockAddr.sin_port = htons(RFC1001_PORT);
1646 rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) 1780 rc = socket->ops->connect(socket, (struct sockaddr *)
1647 psin_server, 1781 &server->addr.sockAddr,
1648 sizeof(struct sockaddr_in), 0); 1782 sizeof(struct sockaddr_in), 0);
1649 if (rc >= 0) 1783 if (rc >= 0)
1650 connected = 1; 1784 connected = true;
1651 } 1785 }
1652 1786
1653 /* give up here - unless we want to retry on different 1787 /* give up here - unless we want to retry on different
1654 protocol families some day */ 1788 protocol families some day */
1655 if (!connected) { 1789 if (!connected) {
1656 if (orig_port) 1790 if (orig_port)
1657 psin_server->sin_port = orig_port; 1791 server->addr.sockAddr.sin_port = orig_port;
1658 cFYI(1, ("Error %d connecting to server via ipv4", rc)); 1792 cFYI(1, ("Error %d connecting to server via ipv4", rc));
1659 sock_release(*csocket); 1793 sock_release(socket);
1660 *csocket = NULL; 1794 server->ssocket = NULL;
1661 return rc; 1795 return rc;
1662 } 1796 }
1663 /* Eventually check for other socket options to change from 1797
1664 the default. sock_setsockopt not used because it expects 1798
1665 user space buffer */ 1799 /*
1666 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx", 1800 * Eventually check for other socket options to change from
1667 (*csocket)->sk->sk_sndbuf, 1801 * the default. sock_setsockopt not used because it expects
1668 (*csocket)->sk->sk_rcvbuf, (*csocket)->sk->sk_rcvtimeo)); 1802 * user space buffer
1669 (*csocket)->sk->sk_rcvtimeo = 7 * HZ; 1803 */
1670 if (!noblocksnd) 1804 socket->sk->sk_rcvtimeo = 7 * HZ;
1671 (*csocket)->sk->sk_sndtimeo = 3 * HZ; 1805 socket->sk->sk_sndtimeo = 3 * HZ;
1672 1806
1673 /* make the bufsizes depend on wsize/rsize and max requests */ 1807 /* make the bufsizes depend on wsize/rsize and max requests */
1674 if (noautotune) { 1808 if (server->noautotune) {
1675 if ((*csocket)->sk->sk_sndbuf < (200 * 1024)) 1809 if (socket->sk->sk_sndbuf < (200 * 1024))
1676 (*csocket)->sk->sk_sndbuf = 200 * 1024; 1810 socket->sk->sk_sndbuf = 200 * 1024;
1677 if ((*csocket)->sk->sk_rcvbuf < (140 * 1024)) 1811 if (socket->sk->sk_rcvbuf < (140 * 1024))
1678 (*csocket)->sk->sk_rcvbuf = 140 * 1024; 1812 socket->sk->sk_rcvbuf = 140 * 1024;
1679 } 1813 }
1680 1814
1815 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
1816 socket->sk->sk_sndbuf,
1817 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
1818
1681 /* send RFC1001 sessinit */ 1819 /* send RFC1001 sessinit */
1682 if (psin_server->sin_port == htons(RFC1001_PORT)) { 1820 if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
1683 /* some servers require RFC1001 sessinit before sending 1821 /* some servers require RFC1001 sessinit before sending
1684 negprot - BB check reconnection in case where second 1822 negprot - BB check reconnection in case where second
1685 sessinit is sent but no second negprot */ 1823 sessinit is sent but no second negprot */
@@ -1689,31 +1827,42 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
1689 GFP_KERNEL); 1827 GFP_KERNEL);
1690 if (ses_init_buf) { 1828 if (ses_init_buf) {
1691 ses_init_buf->trailer.session_req.called_len = 32; 1829 ses_init_buf->trailer.session_req.called_len = 32;
1692 if (target_name && (target_name[0] != 0)) { 1830 if (server->server_RFC1001_name &&
1693 rfc1002mangle(ses_init_buf->trailer.session_req.called_name, 1831 server->server_RFC1001_name[0] != 0)
1694 target_name, 16); 1832 rfc1002mangle(ses_init_buf->trailer.
1695 } else { 1833 session_req.called_name,
1696 rfc1002mangle(ses_init_buf->trailer.session_req.called_name, 1834 server->server_RFC1001_name,
1697 DEFAULT_CIFS_CALLED_NAME, 16); 1835 RFC1001_NAME_LEN_WITH_NULL);
1698 } 1836 else
1837 rfc1002mangle(ses_init_buf->trailer.
1838 session_req.called_name,
1839 DEFAULT_CIFS_CALLED_NAME,
1840 RFC1001_NAME_LEN_WITH_NULL);
1699 1841
1700 ses_init_buf->trailer.session_req.calling_len = 32; 1842 ses_init_buf->trailer.session_req.calling_len = 32;
1843
1701 /* calling name ends in null (byte 16) from old smb 1844 /* calling name ends in null (byte 16) from old smb
1702 convention. */ 1845 convention. */
1703 if (netbios_name && (netbios_name[0] != 0)) { 1846 if (server->workstation_RFC1001_name &&
1704 rfc1002mangle(ses_init_buf->trailer.session_req.calling_name, 1847 server->workstation_RFC1001_name[0] != 0)
1705 netbios_name, 16); 1848 rfc1002mangle(ses_init_buf->trailer.
1706 } else { 1849 session_req.calling_name,
1707 rfc1002mangle(ses_init_buf->trailer.session_req.calling_name, 1850 server->workstation_RFC1001_name,
1708 "LINUX_CIFS_CLNT", 16); 1851 RFC1001_NAME_LEN_WITH_NULL);
1709 } 1852 else
1853 rfc1002mangle(ses_init_buf->trailer.
1854 session_req.calling_name,
1855 "LINUX_CIFS_CLNT",
1856 RFC1001_NAME_LEN_WITH_NULL);
1857
1710 ses_init_buf->trailer.session_req.scope1 = 0; 1858 ses_init_buf->trailer.session_req.scope1 = 0;
1711 ses_init_buf->trailer.session_req.scope2 = 0; 1859 ses_init_buf->trailer.session_req.scope2 = 0;
1712 smb_buf = (struct smb_hdr *)ses_init_buf; 1860 smb_buf = (struct smb_hdr *)ses_init_buf;
1713 /* sizeof RFC1002_SESSION_REQUEST with no scope */ 1861 /* sizeof RFC1002_SESSION_REQUEST with no scope */
1714 smb_buf->smb_buf_length = 0x81000044; 1862 smb_buf->smb_buf_length = 0x81000044;
1715 rc = smb_send(*csocket, smb_buf, 0x44, 1863 rc = smb_send(socket, smb_buf, 0x44,
1716 (struct sockaddr *)psin_server, noblocksnd); 1864 (struct sockaddr *) &server->addr.sockAddr,
1865 server->noblocksnd);
1717 kfree(ses_init_buf); 1866 kfree(ses_init_buf);
1718 msleep(1); /* RFC1001 layer in at least one server 1867 msleep(1); /* RFC1001 layer in at least one server
1719 requires very short break before negprot 1868 requires very short break before negprot
@@ -1733,79 +1882,81 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
1733} 1882}
1734 1883
1735static int 1884static int
1736ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket, 1885ipv6_connect(struct TCP_Server_Info *server)
1737 bool noblocksnd)
1738{ 1886{
1739 int rc = 0; 1887 int rc = 0;
1740 int connected = 0; 1888 bool connected = false;
1741 __be16 orig_port = 0; 1889 __be16 orig_port = 0;
1890 struct socket *socket = server->ssocket;
1742 1891
1743 if (*csocket == NULL) { 1892 if (socket == NULL) {
1744 rc = sock_create_kern(PF_INET6, SOCK_STREAM, 1893 rc = sock_create_kern(PF_INET6, SOCK_STREAM,
1745 IPPROTO_TCP, csocket); 1894 IPPROTO_TCP, &socket);
1746 if (rc < 0) { 1895 if (rc < 0) {
1747 cERROR(1, ("Error %d creating ipv6 socket", rc)); 1896 cERROR(1, ("Error %d creating ipv6 socket", rc));
1748 *csocket = NULL; 1897 socket = NULL;
1749 return rc; 1898 return rc;
1750 } else {
1751 /* BB other socket options to set KEEPALIVE, NODELAY? */
1752 cFYI(1, ("ipv6 Socket created"));
1753 (*csocket)->sk->sk_allocation = GFP_NOFS;
1754 cifs_reclassify_socket6(*csocket);
1755 } 1899 }
1756 }
1757 1900
1758 psin_server->sin6_family = AF_INET6; 1901 /* BB other socket options to set KEEPALIVE, NODELAY? */
1902 cFYI(1, ("ipv6 Socket created"));
1903 server->ssocket = socket;
1904 socket->sk->sk_allocation = GFP_NOFS;
1905 cifs_reclassify_socket6(socket);
1906 }
1759 1907
1760 if (psin_server->sin6_port) { /* user overrode default port */ 1908 /* user overrode default port */
1761 rc = (*csocket)->ops->connect(*csocket, 1909 if (server->addr.sockAddr6.sin6_port) {
1762 (struct sockaddr *) psin_server, 1910 rc = socket->ops->connect(socket,
1911 (struct sockaddr *) &server->addr.sockAddr6,
1763 sizeof(struct sockaddr_in6), 0); 1912 sizeof(struct sockaddr_in6), 0);
1764 if (rc >= 0) 1913 if (rc >= 0)
1765 connected = 1; 1914 connected = true;
1766 } 1915 }
1767 1916
1768 if (!connected) { 1917 if (!connected) {
1769 /* save original port so we can retry user specified port 1918 /* save original port so we can retry user specified port
1770 later if fall back ports fail this time */ 1919 later if fall back ports fail this time */
1771 1920
1772 orig_port = psin_server->sin6_port; 1921 orig_port = server->addr.sockAddr6.sin6_port;
1773 /* do not retry on the same port we just failed on */ 1922 /* do not retry on the same port we just failed on */
1774 if (psin_server->sin6_port != htons(CIFS_PORT)) { 1923 if (server->addr.sockAddr6.sin6_port != htons(CIFS_PORT)) {
1775 psin_server->sin6_port = htons(CIFS_PORT); 1924 server->addr.sockAddr6.sin6_port = htons(CIFS_PORT);
1776 1925 rc = socket->ops->connect(socket, (struct sockaddr *)
1777 rc = (*csocket)->ops->connect(*csocket, 1926 &server->addr.sockAddr6,
1778 (struct sockaddr *) psin_server,
1779 sizeof(struct sockaddr_in6), 0); 1927 sizeof(struct sockaddr_in6), 0);
1780 if (rc >= 0) 1928 if (rc >= 0)
1781 connected = 1; 1929 connected = true;
1782 } 1930 }
1783 } 1931 }
1784 if (!connected) { 1932 if (!connected) {
1785 psin_server->sin6_port = htons(RFC1001_PORT); 1933 server->addr.sockAddr6.sin6_port = htons(RFC1001_PORT);
1786 rc = (*csocket)->ops->connect(*csocket, (struct sockaddr *) 1934 rc = socket->ops->connect(socket, (struct sockaddr *)
1787 psin_server, sizeof(struct sockaddr_in6), 0); 1935 &server->addr.sockAddr6,
1936 sizeof(struct sockaddr_in6), 0);
1788 if (rc >= 0) 1937 if (rc >= 0)
1789 connected = 1; 1938 connected = true;
1790 } 1939 }
1791 1940
1792 /* give up here - unless we want to retry on different 1941 /* give up here - unless we want to retry on different
1793 protocol families some day */ 1942 protocol families some day */
1794 if (!connected) { 1943 if (!connected) {
1795 if (orig_port) 1944 if (orig_port)
1796 psin_server->sin6_port = orig_port; 1945 server->addr.sockAddr6.sin6_port = orig_port;
1797 cFYI(1, ("Error %d connecting to server via ipv6", rc)); 1946 cFYI(1, ("Error %d connecting to server via ipv6", rc));
1798 sock_release(*csocket); 1947 sock_release(socket);
1799 *csocket = NULL; 1948 server->ssocket = NULL;
1800 return rc; 1949 return rc;
1801 } 1950 }
1802 /* Eventually check for other socket options to change from
1803 the default. sock_setsockopt not used because it expects
1804 user space buffer */
1805 (*csocket)->sk->sk_rcvtimeo = 7 * HZ;
1806 if (!noblocksnd)
1807 (*csocket)->sk->sk_sndtimeo = 3 * HZ;
1808 1951
1952 /*
1953 * Eventually check for other socket options to change from
1954 * the default. sock_setsockopt not used because it expects
1955 * user space buffer
1956 */
1957 socket->sk->sk_rcvtimeo = 7 * HZ;
1958 socket->sk->sk_sndtimeo = 3 * HZ;
1959 server->ssocket = socket;
1809 1960
1810 return rc; 1961 return rc;
1811} 1962}
@@ -2011,6 +2162,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
2011 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; 2162 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
2012 if (pvolume_info->nobrl) 2163 if (pvolume_info->nobrl)
2013 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; 2164 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
2165 if (pvolume_info->mand_lock)
2166 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
2014 if (pvolume_info->cifs_acl) 2167 if (pvolume_info->cifs_acl)
2015 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; 2168 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
2016 if (pvolume_info->override_uid) 2169 if (pvolume_info->override_uid)
@@ -2035,32 +2188,30 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2035{ 2188{
2036 int rc = 0; 2189 int rc = 0;
2037 int xid; 2190 int xid;
2038 struct socket *csocket = NULL; 2191 struct smb_vol *volume_info;
2039 struct sockaddr addr;
2040 struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
2041 struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
2042 struct smb_vol volume_info;
2043 struct cifsSesInfo *pSesInfo = NULL; 2192 struct cifsSesInfo *pSesInfo = NULL;
2044 struct cifsTconInfo *tcon = NULL; 2193 struct cifsTconInfo *tcon = NULL;
2045 struct TCP_Server_Info *srvTcp = NULL; 2194 struct TCP_Server_Info *srvTcp = NULL;
2046 2195
2047 xid = GetXid(); 2196 xid = GetXid();
2048 2197
2049/* cFYI(1, ("Entering cifs_mount. Xid: %d with: %s", xid, mount_data)); */ 2198 volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
2199 if (!volume_info) {
2200 rc = -ENOMEM;
2201 goto out;
2202 }
2050 2203
2051 memset(&addr, 0, sizeof(struct sockaddr)); 2204 if (cifs_parse_mount_options(mount_data, devname, volume_info)) {
2052 memset(&volume_info, 0, sizeof(struct smb_vol));
2053 if (cifs_parse_mount_options(mount_data, devname, &volume_info)) {
2054 rc = -EINVAL; 2205 rc = -EINVAL;
2055 goto out; 2206 goto out;
2056 } 2207 }
2057 2208
2058 if (volume_info.nullauth) { 2209 if (volume_info->nullauth) {
2059 cFYI(1, ("null user")); 2210 cFYI(1, ("null user"));
2060 volume_info.username = ""; 2211 volume_info->username = "";
2061 } else if (volume_info.username) { 2212 } else if (volume_info->username) {
2062 /* BB fixme parse for domain name here */ 2213 /* BB fixme parse for domain name here */
2063 cFYI(1, ("Username: %s", volume_info.username)); 2214 cFYI(1, ("Username: %s", volume_info->username));
2064 } else { 2215 } else {
2065 cifserror("No username specified"); 2216 cifserror("No username specified");
2066 /* In userspace mount helper we can get user name from alternate 2217 /* In userspace mount helper we can get user name from alternate
@@ -2069,139 +2220,29 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2069 goto out; 2220 goto out;
2070 } 2221 }
2071 2222
2072 if (volume_info.UNCip && volume_info.UNC) {
2073 rc = cifs_inet_pton(AF_INET, volume_info.UNCip,
2074 &sin_server->sin_addr.s_addr);
2075
2076 if (rc <= 0) {
2077 /* not ipv4 address, try ipv6 */
2078 rc = cifs_inet_pton(AF_INET6, volume_info.UNCip,
2079 &sin_server6->sin6_addr.in6_u);
2080 if (rc > 0)
2081 addr.sa_family = AF_INET6;
2082 } else {
2083 addr.sa_family = AF_INET;
2084 }
2085
2086 if (rc <= 0) {
2087 /* we failed translating address */
2088 rc = -EINVAL;
2089 goto out;
2090 }
2091
2092 cFYI(1, ("UNC: %s ip: %s", volume_info.UNC, volume_info.UNCip));
2093 /* success */
2094 rc = 0;
2095 } else if (volume_info.UNCip) {
2096 /* BB using ip addr as server name to connect to the
2097 DFS root below */
2098 cERROR(1, ("Connecting to DFS root not implemented yet"));
2099 rc = -EINVAL;
2100 goto out;
2101 } else /* which servers DFS root would we conect to */ {
2102 cERROR(1,
2103 ("CIFS mount error: No UNC path (e.g. -o "
2104 "unc=//192.168.1.100/public) specified"));
2105 rc = -EINVAL;
2106 goto out;
2107 }
2108 2223
2109 /* this is needed for ASCII cp to Unicode converts */ 2224 /* this is needed for ASCII cp to Unicode converts */
2110 if (volume_info.iocharset == NULL) { 2225 if (volume_info->iocharset == NULL) {
2111 cifs_sb->local_nls = load_nls_default(); 2226 cifs_sb->local_nls = load_nls_default();
2112 /* load_nls_default can not return null */ 2227 /* load_nls_default can not return null */
2113 } else { 2228 } else {
2114 cifs_sb->local_nls = load_nls(volume_info.iocharset); 2229 cifs_sb->local_nls = load_nls(volume_info->iocharset);
2115 if (cifs_sb->local_nls == NULL) { 2230 if (cifs_sb->local_nls == NULL) {
2116 cERROR(1, ("CIFS mount error: iocharset %s not found", 2231 cERROR(1, ("CIFS mount error: iocharset %s not found",
2117 volume_info.iocharset)); 2232 volume_info->iocharset));
2118 rc = -ELIBACC; 2233 rc = -ELIBACC;
2119 goto out; 2234 goto out;
2120 } 2235 }
2121 } 2236 }
2122 2237
2123 srvTcp = cifs_find_tcp_session(&addr); 2238 /* get a reference to a tcp session */
2124 if (!srvTcp) { /* create socket */ 2239 srvTcp = cifs_get_tcp_session(volume_info);
2125 if (addr.sa_family == AF_INET6) { 2240 if (IS_ERR(srvTcp)) {
2126 cFYI(1, ("attempting ipv6 connect")); 2241 rc = PTR_ERR(srvTcp);
2127 /* BB should we allow ipv6 on port 139? */ 2242 goto out;
2128 /* other OS never observed in Wild doing 139 with v6 */
2129 sin_server6->sin6_port = htons(volume_info.port);
2130 rc = ipv6_connect(sin_server6, &csocket,
2131 volume_info.noblocksnd);
2132 } else {
2133 sin_server->sin_port = htons(volume_info.port);
2134 rc = ipv4_connect(sin_server, &csocket,
2135 volume_info.source_rfc1001_name,
2136 volume_info.target_rfc1001_name,
2137 volume_info.noblocksnd,
2138 volume_info.noautotune);
2139 }
2140 if (rc < 0) {
2141 cERROR(1, ("Error connecting to socket. "
2142 "Aborting operation"));
2143 if (csocket != NULL)
2144 sock_release(csocket);
2145 goto out;
2146 }
2147
2148 srvTcp = kzalloc(sizeof(struct TCP_Server_Info), GFP_KERNEL);
2149 if (!srvTcp) {
2150 rc = -ENOMEM;
2151 sock_release(csocket);
2152 goto out;
2153 } else {
2154 srvTcp->noblocksnd = volume_info.noblocksnd;
2155 srvTcp->noautotune = volume_info.noautotune;
2156 if (addr.sa_family == AF_INET6)
2157 memcpy(&srvTcp->addr.sockAddr6, sin_server6,
2158 sizeof(struct sockaddr_in6));
2159 else
2160 memcpy(&srvTcp->addr.sockAddr, sin_server,
2161 sizeof(struct sockaddr_in));
2162 atomic_set(&srvTcp->inFlight, 0);
2163 /* BB Add code for ipv6 case too */
2164 srvTcp->ssocket = csocket;
2165 srvTcp->hostname = extract_hostname(volume_info.UNC);
2166 if (IS_ERR(srvTcp->hostname)) {
2167 rc = PTR_ERR(srvTcp->hostname);
2168 sock_release(csocket);
2169 goto out;
2170 }
2171 init_waitqueue_head(&srvTcp->response_q);
2172 init_waitqueue_head(&srvTcp->request_q);
2173 INIT_LIST_HEAD(&srvTcp->pending_mid_q);
2174 /* at this point we are the only ones with the pointer
2175 to the struct since the kernel thread not created yet
2176 so no need to spinlock this init of tcpStatus */
2177 srvTcp->tcpStatus = CifsNew;
2178 init_MUTEX(&srvTcp->tcpSem);
2179 srvTcp->tsk = kthread_run((void *)(void *)cifs_demultiplex_thread, srvTcp, "cifsd");
2180 if (IS_ERR(srvTcp->tsk)) {
2181 rc = PTR_ERR(srvTcp->tsk);
2182 cERROR(1, ("error %d create cifsd thread", rc));
2183 srvTcp->tsk = NULL;
2184 sock_release(csocket);
2185 kfree(srvTcp->hostname);
2186 goto out;
2187 }
2188 rc = 0;
2189 memcpy(srvTcp->workstation_RFC1001_name,
2190 volume_info.source_rfc1001_name, 16);
2191 memcpy(srvTcp->server_RFC1001_name,
2192 volume_info.target_rfc1001_name, 16);
2193 srvTcp->sequence_number = 0;
2194 INIT_LIST_HEAD(&srvTcp->tcp_ses_list);
2195 INIT_LIST_HEAD(&srvTcp->smb_ses_list);
2196 ++srvTcp->srv_count;
2197 write_lock(&cifs_tcp_ses_lock);
2198 list_add(&srvTcp->tcp_ses_list,
2199 &cifs_tcp_ses_list);
2200 write_unlock(&cifs_tcp_ses_lock);
2201 }
2202 } 2243 }
2203 2244
2204 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info.username); 2245 pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username);
2205 if (pSesInfo) { 2246 if (pSesInfo) {
2206 cFYI(1, ("Existing smb sess found (status=%d)", 2247 cFYI(1, ("Existing smb sess found (status=%d)",
2207 pSesInfo->status)); 2248 pSesInfo->status));
@@ -2228,31 +2269,38 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2228 2269
2229 /* new SMB session uses our srvTcp ref */ 2270 /* new SMB session uses our srvTcp ref */
2230 pSesInfo->server = srvTcp; 2271 pSesInfo->server = srvTcp;
2231 sprintf(pSesInfo->serverName, "%u.%u.%u.%u", 2272 if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
2232 NIPQUAD(sin_server->sin_addr.s_addr)); 2273 sprintf(pSesInfo->serverName, "%pI6",
2274 &srvTcp->addr.sockAddr6.sin6_addr);
2275 else
2276 sprintf(pSesInfo->serverName, "%pI4",
2277 &srvTcp->addr.sockAddr.sin_addr.s_addr);
2233 2278
2234 write_lock(&cifs_tcp_ses_lock); 2279 write_lock(&cifs_tcp_ses_lock);
2235 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list); 2280 list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
2236 write_unlock(&cifs_tcp_ses_lock); 2281 write_unlock(&cifs_tcp_ses_lock);
2237 2282
2238 /* volume_info.password freed at unmount */ 2283 /* volume_info->password freed at unmount */
2239 if (volume_info.password) { 2284 if (volume_info->password) {
2240 pSesInfo->password = volume_info.password; 2285 pSesInfo->password = kstrdup(volume_info->password,
2241 /* set to NULL to prevent freeing on exit */ 2286 GFP_KERNEL);
2242 volume_info.password = NULL; 2287 if (!pSesInfo->password) {
2288 rc = -ENOMEM;
2289 goto mount_fail_check;
2290 }
2243 } 2291 }
2244 if (volume_info.username) 2292 if (volume_info->username)
2245 strncpy(pSesInfo->userName, volume_info.username, 2293 strncpy(pSesInfo->userName, volume_info->username,
2246 MAX_USERNAME_SIZE); 2294 MAX_USERNAME_SIZE);
2247 if (volume_info.domainname) { 2295 if (volume_info->domainname) {
2248 int len = strlen(volume_info.domainname); 2296 int len = strlen(volume_info->domainname);
2249 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL); 2297 pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
2250 if (pSesInfo->domainName) 2298 if (pSesInfo->domainName)
2251 strcpy(pSesInfo->domainName, 2299 strcpy(pSesInfo->domainName,
2252 volume_info.domainname); 2300 volume_info->domainname);
2253 } 2301 }
2254 pSesInfo->linux_uid = volume_info.linux_uid; 2302 pSesInfo->linux_uid = volume_info->linux_uid;
2255 pSesInfo->overrideSecFlg = volume_info.secFlg; 2303 pSesInfo->overrideSecFlg = volume_info->secFlg;
2256 down(&pSesInfo->sesSem); 2304 down(&pSesInfo->sesSem);
2257 2305
2258 /* BB FIXME need to pass vol->secFlgs BB */ 2306 /* BB FIXME need to pass vol->secFlgs BB */
@@ -2263,14 +2311,14 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2263 2311
2264 /* search for existing tcon to this server share */ 2312 /* search for existing tcon to this server share */
2265 if (!rc) { 2313 if (!rc) {
2266 setup_cifs_sb(&volume_info, cifs_sb); 2314 setup_cifs_sb(volume_info, cifs_sb);
2267 2315
2268 tcon = cifs_find_tcon(pSesInfo, volume_info.UNC); 2316 tcon = cifs_find_tcon(pSesInfo, volume_info->UNC);
2269 if (tcon) { 2317 if (tcon) {
2270 cFYI(1, ("Found match on UNC path")); 2318 cFYI(1, ("Found match on UNC path"));
2271 /* existing tcon already has a reference */ 2319 /* existing tcon already has a reference */
2272 cifs_put_smb_ses(pSesInfo); 2320 cifs_put_smb_ses(pSesInfo);
2273 if (tcon->seal != volume_info.seal) 2321 if (tcon->seal != volume_info->seal)
2274 cERROR(1, ("transport encryption setting " 2322 cERROR(1, ("transport encryption setting "
2275 "conflicts with existing tid")); 2323 "conflicts with existing tid"));
2276 } else { 2324 } else {
@@ -2279,11 +2327,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2279 rc = -ENOMEM; 2327 rc = -ENOMEM;
2280 goto mount_fail_check; 2328 goto mount_fail_check;
2281 } 2329 }
2330
2282 tcon->ses = pSesInfo; 2331 tcon->ses = pSesInfo;
2332 if (volume_info->password) {
2333 tcon->password = kstrdup(volume_info->password,
2334 GFP_KERNEL);
2335 if (!tcon->password) {
2336 rc = -ENOMEM;
2337 goto mount_fail_check;
2338 }
2339 }
2283 2340
2284 /* check for null share name ie connect to dfs root */ 2341 /* check for null share name ie connect to dfs root */
2285 if ((strchr(volume_info.UNC + 3, '\\') == NULL) 2342 if ((strchr(volume_info->UNC + 3, '\\') == NULL)
2286 && (strchr(volume_info.UNC + 3, '/') == NULL)) { 2343 && (strchr(volume_info->UNC + 3, '/') == NULL)) {
2287 /* rc = connect_to_dfs_path(...) */ 2344 /* rc = connect_to_dfs_path(...) */
2288 cFYI(1, ("DFS root not supported")); 2345 cFYI(1, ("DFS root not supported"));
2289 rc = -ENODEV; 2346 rc = -ENODEV;
@@ -2292,10 +2349,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2292 /* BB Do we need to wrap sesSem around 2349 /* BB Do we need to wrap sesSem around
2293 * this TCon call and Unix SetFS as 2350 * this TCon call and Unix SetFS as
2294 * we do on SessSetup and reconnect? */ 2351 * we do on SessSetup and reconnect? */
2295 rc = CIFSTCon(xid, pSesInfo, volume_info.UNC, 2352 rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
2296 tcon, cifs_sb->local_nls); 2353 tcon, cifs_sb->local_nls);
2297 cFYI(1, ("CIFS Tcon rc = %d", rc)); 2354 cFYI(1, ("CIFS Tcon rc = %d", rc));
2298 if (volume_info.nodfs) { 2355 if (volume_info->nodfs) {
2299 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; 2356 tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
2300 cFYI(1, ("DFS disabled (%d)", 2357 cFYI(1, ("DFS disabled (%d)",
2301 tcon->Flags)); 2358 tcon->Flags));
@@ -2303,7 +2360,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2303 } 2360 }
2304 if (rc) 2361 if (rc)
2305 goto mount_fail_check; 2362 goto mount_fail_check;
2306 tcon->seal = volume_info.seal; 2363 tcon->seal = volume_info->seal;
2307 write_lock(&cifs_tcp_ses_lock); 2364 write_lock(&cifs_tcp_ses_lock);
2308 list_add(&tcon->tcon_list, &pSesInfo->tcon_list); 2365 list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
2309 write_unlock(&cifs_tcp_ses_lock); 2366 write_unlock(&cifs_tcp_ses_lock);
@@ -2313,9 +2370,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
2313 to a share so for resources mounted more than once 2370 to a share so for resources mounted more than once
2314 to the same server share the last value passed in 2371 to the same server share the last value passed in
2315 for the retry flag is used */ 2372 for the retry flag is used */
2316 tcon->retry = volume_info.retry; 2373 tcon->retry = volume_info->retry;
2317 tcon->nocase = volume_info.nocase; 2374 tcon->nocase = volume_info->nocase;
2318 tcon->local_lease = volume_info.local_lease; 2375 tcon->local_lease = volume_info->local_lease;
2319 } 2376 }
2320 if (pSesInfo) { 2377 if (pSesInfo) {
2321 if (pSesInfo->capabilities & CAP_LARGE_FILES) { 2378 if (pSesInfo->capabilities & CAP_LARGE_FILES) {
@@ -2352,7 +2409,7 @@ mount_fail_check:
2352 if (tcon->ses->capabilities & CAP_UNIX) 2409 if (tcon->ses->capabilities & CAP_UNIX)
2353 /* reset of caps checks mount to see if unix extensions 2410 /* reset of caps checks mount to see if unix extensions
2354 disabled for just this mount */ 2411 disabled for just this mount */
2355 reset_cifs_unix_caps(xid, tcon, sb, &volume_info); 2412 reset_cifs_unix_caps(xid, tcon, sb, volume_info);
2356 else 2413 else
2357 tcon->unix_ext = 0; /* server does not support them */ 2414 tcon->unix_ext = 0; /* server does not support them */
2358 2415
@@ -2371,18 +2428,22 @@ mount_fail_check:
2371 cifs_sb->rsize = min(cifs_sb->rsize, 2428 cifs_sb->rsize = min(cifs_sb->rsize,
2372 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); 2429 (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
2373 2430
2374 /* volume_info.password is freed above when existing session found 2431 /* volume_info->password is freed above when existing session found
2375 (in which case it is not needed anymore) but when new sesion is created 2432 (in which case it is not needed anymore) but when new sesion is created
2376 the password ptr is put in the new session structure (in which case the 2433 the password ptr is put in the new session structure (in which case the
2377 password will be freed at unmount time) */ 2434 password will be freed at unmount time) */
2378out: 2435out:
2379 /* zero out password before freeing */ 2436 /* zero out password before freeing */
2380 if (volume_info.password != NULL) { 2437 if (volume_info) {
2381 memset(volume_info.password, 0, strlen(volume_info.password)); 2438 if (volume_info->password != NULL) {
2382 kfree(volume_info.password); 2439 memset(volume_info->password, 0,
2440 strlen(volume_info->password));
2441 kfree(volume_info->password);
2442 }
2443 kfree(volume_info->UNC);
2444 kfree(volume_info->prepath);
2445 kfree(volume_info);
2383 } 2446 }
2384 kfree(volume_info.UNC);
2385 kfree(volume_info.prepath);
2386 FreeXid(xid); 2447 FreeXid(xid);
2387 return rc; 2448 return rc;
2388} 2449}
@@ -2533,7 +2594,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2533 __u16 action = le16_to_cpu(pSMBr->resp.Action); 2594 __u16 action = le16_to_cpu(pSMBr->resp.Action);
2534 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); 2595 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
2535 if (action & GUEST_LOGIN) 2596 if (action & GUEST_LOGIN)
2536 cFYI(1, (" Guest login")); /* BB mark SesInfo struct? */ 2597 cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
2537 ses->Suid = smb_buffer_response->Uid; /* UID left in wire format 2598 ses->Suid = smb_buffer_response->Uid; /* UID left in wire format
2538 (little endian) */ 2599 (little endian) */
2539 cFYI(1, ("UID = %d ", ses->Suid)); 2600 cFYI(1, ("UID = %d ", ses->Suid));
@@ -2679,13 +2740,11 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
2679 len)); 2740 len));
2680 } 2741 }
2681 } else { 2742 } else {
2682 cERROR(1, 2743 cERROR(1, ("Security Blob Length extends beyond "
2683 (" Security Blob Length extends beyond "
2684 "end of SMB")); 2744 "end of SMB"));
2685 } 2745 }
2686 } else { 2746 } else {
2687 cERROR(1, 2747 cERROR(1, ("Invalid Word count %d: ",
2688 (" Invalid Word count %d: ",
2689 smb_buffer_response->WordCount)); 2748 smb_buffer_response->WordCount));
2690 rc = -EIO; 2749 rc = -EIO;
2691 } 2750 }
@@ -2843,7 +2902,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2843 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); 2902 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
2844 2903
2845 if (action & GUEST_LOGIN) 2904 if (action & GUEST_LOGIN)
2846 cFYI(1, (" Guest login")); 2905 cFYI(1, ("Guest login"));
2847 /* Do we want to set anything in SesInfo struct when guest login? */ 2906 /* Do we want to set anything in SesInfo struct when guest login? */
2848 2907
2849 bcc_ptr = pByteArea(smb_buffer_response); 2908 bcc_ptr = pByteArea(smb_buffer_response);
@@ -2851,8 +2910,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
2851 2910
2852 SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr; 2911 SecurityBlob2 = (PCHALLENGE_MESSAGE) bcc_ptr;
2853 if (SecurityBlob2->MessageType != NtLmChallenge) { 2912 if (SecurityBlob2->MessageType != NtLmChallenge) {
2854 cFYI(1, 2913 cFYI(1, ("Unexpected NTLMSSP message type received %d",
2855 ("Unexpected NTLMSSP message type received %d",
2856 SecurityBlob2->MessageType)); 2914 SecurityBlob2->MessageType));
2857 } else if (ses) { 2915 } else if (ses) {
2858 ses->Suid = smb_buffer_response->Uid; /* UID left in le format */ 2916 ses->Suid = smb_buffer_response->Uid; /* UID left in le format */
@@ -3024,8 +3082,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
3024 cERROR(1, ("No session structure passed in.")); 3082 cERROR(1, ("No session structure passed in."));
3025 } 3083 }
3026 } else { 3084 } else {
3027 cERROR(1, 3085 cERROR(1, ("Invalid Word count %d:",
3028 (" Invalid Word count %d:",
3029 smb_buffer_response->WordCount)); 3086 smb_buffer_response->WordCount));
3030 rc = -EIO; 3087 rc = -EIO;
3031 } 3088 }
@@ -3264,7 +3321,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
3264 __u16 action = le16_to_cpu(pSMBr->resp.Action); 3321 __u16 action = le16_to_cpu(pSMBr->resp.Action);
3265 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength); 3322 __u16 blob_len = le16_to_cpu(pSMBr->resp.SecurityBlobLength);
3266 if (action & GUEST_LOGIN) 3323 if (action & GUEST_LOGIN)
3267 cFYI(1, (" Guest login")); /* BB Should we set anything 3324 cFYI(1, ("Guest login")); /* BB Should we set anything
3268 in SesInfo struct ? */ 3325 in SesInfo struct ? */
3269/* if (SecurityBlob2->MessageType != NtLm??) { 3326/* if (SecurityBlob2->MessageType != NtLm??) {
3270 cFYI("Unexpected message type on auth response is %d")); 3327 cFYI("Unexpected message type on auth response is %d"));
@@ -3487,12 +3544,14 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
3487 NTLMv2 password here) */ 3544 NTLMv2 password here) */
3488#ifdef CONFIG_CIFS_WEAK_PW_HASH 3545#ifdef CONFIG_CIFS_WEAK_PW_HASH
3489 if ((extended_security & CIFSSEC_MAY_LANMAN) && 3546 if ((extended_security & CIFSSEC_MAY_LANMAN) &&
3490 (ses->server->secType == LANMAN)) 3547 (ses->server->secType == LANMAN))
3491 calc_lanman_hash(ses, bcc_ptr); 3548 calc_lanman_hash(tcon->password, ses->server->cryptKey,
3549 ses->server->secMode &
3550 SECMODE_PW_ENCRYPT ? true : false,
3551 bcc_ptr);
3492 else 3552 else
3493#endif /* CIFS_WEAK_PW_HASH */ 3553#endif /* CIFS_WEAK_PW_HASH */
3494 SMBNTencrypt(ses->password, 3554 SMBNTencrypt(tcon->password, ses->server->cryptKey,
3495 ses->server->cryptKey,
3496 bcc_ptr); 3555 bcc_ptr);
3497 3556
3498 bcc_ptr += CIFS_SESS_KEY_SIZE; 3557 bcc_ptr += CIFS_SESS_KEY_SIZE;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e962e75e6f7..838d9c720a5 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -235,11 +235,11 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
235 }; 235 };
236 236
237 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 237 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
238 args.uid = (__u64) current->fsuid; 238 args.uid = (__u64) current_fsuid();
239 if (inode->i_mode & S_ISGID) 239 if (inode->i_mode & S_ISGID)
240 args.gid = (__u64) inode->i_gid; 240 args.gid = (__u64) inode->i_gid;
241 else 241 else
242 args.gid = (__u64) current->fsgid; 242 args.gid = (__u64) current_fsgid();
243 } else { 243 } else {
244 args.uid = NO_CHANGE_64; 244 args.uid = NO_CHANGE_64;
245 args.gid = NO_CHANGE_64; 245 args.gid = NO_CHANGE_64;
@@ -271,13 +271,13 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
271 if ((oplock & CIFS_CREATE_ACTION) && 271 if ((oplock & CIFS_CREATE_ACTION) &&
272 (cifs_sb->mnt_cifs_flags & 272 (cifs_sb->mnt_cifs_flags &
273 CIFS_MOUNT_SET_UID)) { 273 CIFS_MOUNT_SET_UID)) {
274 newinode->i_uid = current->fsuid; 274 newinode->i_uid = current_fsuid();
275 if (inode->i_mode & S_ISGID) 275 if (inode->i_mode & S_ISGID)
276 newinode->i_gid = 276 newinode->i_gid =
277 inode->i_gid; 277 inode->i_gid;
278 else 278 else
279 newinode->i_gid = 279 newinode->i_gid =
280 current->fsgid; 280 current_fsgid();
281 } 281 }
282 } 282 }
283 } 283 }
@@ -375,8 +375,8 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
375 .device = device_number, 375 .device = device_number,
376 }; 376 };
377 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 377 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
378 args.uid = (__u64) current->fsuid; 378 args.uid = (__u64) current_fsuid();
379 args.gid = (__u64) current->fsgid; 379 args.gid = (__u64) current_fsgid();
380 } else { 380 } else {
381 args.uid = NO_CHANGE_64; 381 args.uid = NO_CHANGE_64;
382 args.gid = NO_CHANGE_64; 382 args.gid = NO_CHANGE_64;
@@ -483,7 +483,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
483 483
484 xid = GetXid(); 484 xid = GetXid();
485 485
486 cFYI(1, (" parent inode = 0x%p name is: %s and dentry = 0x%p", 486 cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p",
487 parent_dir_inode, direntry->d_name.name, direntry)); 487 parent_dir_inode, direntry->d_name.name, direntry));
488 488
489 /* check whether path exists */ 489 /* check whether path exists */
@@ -515,12 +515,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
515 } 515 }
516 516
517 if (direntry->d_inode != NULL) { 517 if (direntry->d_inode != NULL) {
518 cFYI(1, (" non-NULL inode in lookup")); 518 cFYI(1, ("non-NULL inode in lookup"));
519 } else { 519 } else {
520 cFYI(1, (" NULL inode in lookup")); 520 cFYI(1, ("NULL inode in lookup"));
521 } 521 }
522 cFYI(1, 522 cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
523 (" Full path: %s inode = 0x%p", full_path, direntry->d_inode));
524 523
525 if (pTcon->unix_ext) 524 if (pTcon->unix_ext)
526 rc = cifs_get_inode_info_unix(&newInode, full_path, 525 rc = cifs_get_inode_info_unix(&newInode, full_path,
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b..00000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * fs/cifs/fcntl.c
3 *
4 * vfs operations that deal with the file control API
5 *
6 * Copyright (C) International Business Machines Corp., 2003,2004
7 * Author(s): Steve French (sfrench@us.ibm.com)
8 *
9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23#include <linux/fs.h>
24#include <linux/stat.h>
25#include <linux/fcntl.h>
26#include "cifsglob.h"
27#include "cifsproto.h"
28#include "cifs_unicode.h"
29#include "cifs_debug.h"
30#include "cifsfs.h"
31
32static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
33{
34 __u32 cifs_ntfy_flags = 0;
35
36 /* No way on Linux VFS to ask to monitor xattr
37 changes (and no stream support either */
38 if (fcntl_notify_flags & DN_ACCESS)
39 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
40 if (fcntl_notify_flags & DN_MODIFY) {
41 /* What does this mean on directories? */
42 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
43 FILE_NOTIFY_CHANGE_SIZE;
44 }
45 if (fcntl_notify_flags & DN_CREATE) {
46 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
47 FILE_NOTIFY_CHANGE_LAST_WRITE;
48 }
49 if (fcntl_notify_flags & DN_DELETE)
50 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
51 if (fcntl_notify_flags & DN_RENAME) {
52 /* BB review this - checking various server behaviors */
53 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
54 FILE_NOTIFY_CHANGE_FILE_NAME;
55 }
56 if (fcntl_notify_flags & DN_ATTRIB) {
57 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
58 FILE_NOTIFY_CHANGE_ATTRIBUTES;
59 }
60/* if (fcntl_notify_flags & DN_MULTISHOT) {
61 cifs_ntfy_flags |= ;
62 } */ /* BB fixme - not sure how to handle this with CIFS yet */
63
64 return cifs_ntfy_flags;
65}
66
67int cifs_dir_notify(struct file *file, unsigned long arg)
68{
69 int xid;
70 int rc = -EINVAL;
71 int oplock = 0;
72 struct cifs_sb_info *cifs_sb;
73 struct cifsTconInfo *pTcon;
74 char *full_path = NULL;
75 __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
76 __u16 netfid;
77
78 if (experimEnabled == 0)
79 return 0;
80
81 xid = GetXid();
82 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
83 pTcon = cifs_sb->tcon;
84
85 full_path = build_path_from_dentry(file->f_path.dentry);
86
87 if (full_path == NULL) {
88 rc = -ENOMEM;
89 } else {
90 cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
91 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
92 GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
93 &netfid, &oplock, NULL, cifs_sb->local_nls,
94 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
95 /* BB fixme - add this handle to a notify handle list */
96 if (rc) {
97 cFYI(1, ("Could not open directory for notify"));
98 } else {
99 filter = convert_to_cifs_notify_flags(arg);
100 if (filter != 0) {
101 rc = CIFSSMBNotify(xid, pTcon,
102 0 /* no subdirs */, netfid,
103 filter, file, arg & DN_MULTISHOT,
104 cifs_sb->local_nls);
105 } else {
106 rc = -EINVAL;
107 }
108 /* BB add code to close file eventually (at unmount
109 it would close automatically but may be a way
110 to do it easily when inode freed or when
111 notify info is cleared/changed */
112 cFYI(1, ("notify rc %d", rc));
113 }
114 }
115
116 FreeXid(xid);
117 return rc;
118}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index f0a81e631ae..12bb656fbe7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -644,10 +644,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
644 __u64 length; 644 __u64 length;
645 bool wait_flag = false; 645 bool wait_flag = false;
646 struct cifs_sb_info *cifs_sb; 646 struct cifs_sb_info *cifs_sb;
647 struct cifsTconInfo *pTcon; 647 struct cifsTconInfo *tcon;
648 __u16 netfid; 648 __u16 netfid;
649 __u8 lockType = LOCKING_ANDX_LARGE_FILES; 649 __u8 lockType = LOCKING_ANDX_LARGE_FILES;
650 bool posix_locking; 650 bool posix_locking = 0;
651 651
652 length = 1 + pfLock->fl_end - pfLock->fl_start; 652 length = 1 + pfLock->fl_end - pfLock->fl_start;
653 rc = -EACCES; 653 rc = -EACCES;
@@ -698,7 +698,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
698 cFYI(1, ("Unknown type of lock")); 698 cFYI(1, ("Unknown type of lock"));
699 699
700 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 700 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
701 pTcon = cifs_sb->tcon; 701 tcon = cifs_sb->tcon;
702 702
703 if (file->private_data == NULL) { 703 if (file->private_data == NULL) {
704 FreeXid(xid); 704 FreeXid(xid);
@@ -706,9 +706,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
706 } 706 }
707 netfid = ((struct cifsFileInfo *)file->private_data)->netfid; 707 netfid = ((struct cifsFileInfo *)file->private_data)->netfid;
708 708
709 posix_locking = (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && 709 if ((tcon->ses->capabilities & CAP_UNIX) &&
710 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability)); 710 (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
711 711 ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
712 posix_locking = 1;
712 /* BB add code here to normalize offset and length to 713 /* BB add code here to normalize offset and length to
713 account for negative length which we can not accept over the 714 account for negative length which we can not accept over the
714 wire */ 715 wire */
@@ -719,7 +720,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
719 posix_lock_type = CIFS_RDLCK; 720 posix_lock_type = CIFS_RDLCK;
720 else 721 else
721 posix_lock_type = CIFS_WRLCK; 722 posix_lock_type = CIFS_WRLCK;
722 rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */, 723 rc = CIFSSMBPosixLock(xid, tcon, netfid, 1 /* get */,
723 length, pfLock, 724 length, pfLock,
724 posix_lock_type, wait_flag); 725 posix_lock_type, wait_flag);
725 FreeXid(xid); 726 FreeXid(xid);
@@ -727,10 +728,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
727 } 728 }
728 729
729 /* BB we could chain these into one lock request BB */ 730 /* BB we could chain these into one lock request BB */
730 rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, 731 rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start,
731 0, 1, lockType, 0 /* wait flag */ ); 732 0, 1, lockType, 0 /* wait flag */ );
732 if (rc == 0) { 733 if (rc == 0) {
733 rc = CIFSSMBLock(xid, pTcon, netfid, length, 734 rc = CIFSSMBLock(xid, tcon, netfid, length,
734 pfLock->fl_start, 1 /* numUnlock */ , 735 pfLock->fl_start, 1 /* numUnlock */ ,
735 0 /* numLock */ , lockType, 736 0 /* numLock */ , lockType,
736 0 /* wait flag */ ); 737 0 /* wait flag */ );
@@ -767,7 +768,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
767 if (numUnlock == 1) 768 if (numUnlock == 1)
768 posix_lock_type = CIFS_UNLCK; 769 posix_lock_type = CIFS_UNLCK;
769 770
770 rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, 771 rc = CIFSSMBPosixLock(xid, tcon, netfid, 0 /* set */,
771 length, pfLock, 772 length, pfLock,
772 posix_lock_type, wait_flag); 773 posix_lock_type, wait_flag);
773 } else { 774 } else {
@@ -775,7 +776,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
775 (struct cifsFileInfo *)file->private_data; 776 (struct cifsFileInfo *)file->private_data;
776 777
777 if (numLock) { 778 if (numLock) {
778 rc = CIFSSMBLock(xid, pTcon, netfid, length, 779 rc = CIFSSMBLock(xid, tcon, netfid, length,
779 pfLock->fl_start, 780 pfLock->fl_start,
780 0, numLock, lockType, wait_flag); 781 0, numLock, lockType, wait_flag);
781 782
@@ -796,7 +797,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
796 if (pfLock->fl_start <= li->offset && 797 if (pfLock->fl_start <= li->offset &&
797 (pfLock->fl_start + length) >= 798 (pfLock->fl_start + length) >=
798 (li->offset + li->length)) { 799 (li->offset + li->length)) {
799 stored_rc = CIFSSMBLock(xid, pTcon, 800 stored_rc = CIFSSMBLock(xid, tcon,
800 netfid, 801 netfid,
801 li->length, li->offset, 802 li->length, li->offset,
802 1, 0, li->type, false); 803 1, 0, li->type, false);
@@ -2073,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2073 2074
2074 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2075 2076
2076 page = __grab_cache_page(mapping, index); 2077 page = grab_cache_page_write_begin(mapping, index, flags);
2077 if (!page) { 2078 if (!page) {
2078 rc = -ENOMEM; 2079 rc = -ENOMEM;
2079 goto out; 2080 goto out;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index ff8c68de4a9..f247da9f4ed 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * fs/cifs/inode.c 2 * fs/cifs/inode.c
3 * 3 *
4 * Copyright (C) International Business Machines Corp., 2002,2007 4 * Copyright (C) International Business Machines Corp., 2002,2008
5 * Author(s): Steve French (sfrench@us.ibm.com) 5 * Author(s): Steve French (sfrench@us.ibm.com)
6 * 6 *
7 * This library is free software; you can redistribute it and/or modify 7 * This library is free software; you can redistribute it and/or modify
@@ -621,6 +621,47 @@ static const struct inode_operations cifs_ipc_inode_ops = {
621 .lookup = cifs_lookup, 621 .lookup = cifs_lookup,
622}; 622};
623 623
624static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
625{
626 int pplen = cifs_sb->prepathlen;
627 int dfsplen;
628 char *full_path = NULL;
629
630 /* if no prefix path, simply set path to the root of share to "" */
631 if (pplen == 0) {
632 full_path = kmalloc(1, GFP_KERNEL);
633 if (full_path)
634 full_path[0] = 0;
635 return full_path;
636 }
637
638 if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
639 dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
640 else
641 dfsplen = 0;
642
643 full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
644 if (full_path == NULL)
645 return full_path;
646
647 if (dfsplen) {
648 strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
649 /* switch slash direction in prepath depending on whether
650 * windows or posix style path names
651 */
652 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
653 int i;
654 for (i = 0; i < dfsplen; i++) {
655 if (full_path[i] == '\\')
656 full_path[i] = '/';
657 }
658 }
659 }
660 strncpy(full_path + dfsplen, cifs_sb->prepath, pplen);
661 full_path[dfsplen + pplen] = 0; /* add trailing null */
662 return full_path;
663}
664
624/* gets root inode */ 665/* gets root inode */
625struct inode *cifs_iget(struct super_block *sb, unsigned long ino) 666struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
626{ 667{
@@ -628,6 +669,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
628 struct cifs_sb_info *cifs_sb; 669 struct cifs_sb_info *cifs_sb;
629 struct inode *inode; 670 struct inode *inode;
630 long rc; 671 long rc;
672 char *full_path;
631 673
632 inode = iget_locked(sb, ino); 674 inode = iget_locked(sb, ino);
633 if (!inode) 675 if (!inode)
@@ -636,13 +678,17 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
636 return inode; 678 return inode;
637 679
638 cifs_sb = CIFS_SB(inode->i_sb); 680 cifs_sb = CIFS_SB(inode->i_sb);
639 xid = GetXid(); 681 full_path = build_path_to_root(cifs_sb);
682 if (full_path == NULL)
683 return ERR_PTR(-ENOMEM);
640 684
685 xid = GetXid();
641 if (cifs_sb->tcon->unix_ext) 686 if (cifs_sb->tcon->unix_ext)
642 rc = cifs_get_inode_info_unix(&inode, "", inode->i_sb, xid); 687 rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb,
688 xid);
643 else 689 else
644 rc = cifs_get_inode_info(&inode, "", NULL, inode->i_sb, xid, 690 rc = cifs_get_inode_info(&inode, full_path, NULL, inode->i_sb,
645 NULL); 691 xid, NULL);
646 if (rc && cifs_sb->tcon->ipc) { 692 if (rc && cifs_sb->tcon->ipc) {
647 cFYI(1, ("ipc connection - fake read inode")); 693 cFYI(1, ("ipc connection - fake read inode"));
648 inode->i_mode |= S_IFDIR; 694 inode->i_mode |= S_IFDIR;
@@ -652,6 +698,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
652 inode->i_uid = cifs_sb->mnt_uid; 698 inode->i_uid = cifs_sb->mnt_uid;
653 inode->i_gid = cifs_sb->mnt_gid; 699 inode->i_gid = cifs_sb->mnt_gid;
654 } else if (rc) { 700 } else if (rc) {
701 kfree(full_path);
655 _FreeXid(xid); 702 _FreeXid(xid);
656 iget_failed(inode); 703 iget_failed(inode);
657 return ERR_PTR(rc); 704 return ERR_PTR(rc);
@@ -659,6 +706,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
659 706
660 unlock_new_inode(inode); 707 unlock_new_inode(inode);
661 708
709 kfree(full_path);
662 /* can not call macro FreeXid here since in a void func 710 /* can not call macro FreeXid here since in a void func
663 * TODO: This is no longer true 711 * TODO: This is no longer true
664 */ 712 */
@@ -1143,11 +1191,11 @@ mkdir_get_info:
1143 .device = 0, 1191 .device = 0,
1144 }; 1192 };
1145 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { 1193 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
1146 args.uid = (__u64)current->fsuid; 1194 args.uid = (__u64)current_fsuid();
1147 if (inode->i_mode & S_ISGID) 1195 if (inode->i_mode & S_ISGID)
1148 args.gid = (__u64)inode->i_gid; 1196 args.gid = (__u64)inode->i_gid;
1149 else 1197 else
1150 args.gid = (__u64)current->fsgid; 1198 args.gid = (__u64)current_fsgid();
1151 } else { 1199 } else {
1152 args.uid = NO_CHANGE_64; 1200 args.uid = NO_CHANGE_64;
1153 args.gid = NO_CHANGE_64; 1201 args.gid = NO_CHANGE_64;
@@ -1184,13 +1232,13 @@ mkdir_get_info:
1184 if (cifs_sb->mnt_cifs_flags & 1232 if (cifs_sb->mnt_cifs_flags &
1185 CIFS_MOUNT_SET_UID) { 1233 CIFS_MOUNT_SET_UID) {
1186 direntry->d_inode->i_uid = 1234 direntry->d_inode->i_uid =
1187 current->fsuid; 1235 current_fsuid();
1188 if (inode->i_mode & S_ISGID) 1236 if (inode->i_mode & S_ISGID)
1189 direntry->d_inode->i_gid = 1237 direntry->d_inode->i_gid =
1190 inode->i_gid; 1238 inode->i_gid;
1191 else 1239 else
1192 direntry->d_inode->i_gid = 1240 direntry->d_inode->i_gid =
1193 current->fsgid; 1241 current_fsgid();
1194 } 1242 }
1195 } 1243 }
1196 } 1244 }
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 0088a5b5256..f94650683a0 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -65,7 +65,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
65 switch (command) { 65 switch (command) {
66 case CIFS_IOC_CHECKUMOUNT: 66 case CIFS_IOC_CHECKUMOUNT:
67 cFYI(1, ("User unmount attempted")); 67 cFYI(1, ("User unmount attempted"));
68 if (cifs_sb->mnt_uid == current->uid) 68 if (cifs_sb->mnt_uid == current_uid())
69 rc = 0; 69 rc = 0;
70 else { 70 else {
71 rc = -EACCES; 71 rc = -EACCES;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 9ee3f689c2b..4c89c572891 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -97,7 +97,10 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
97 kfree(buf_to_free->serverOS); 97 kfree(buf_to_free->serverOS);
98 kfree(buf_to_free->serverDomain); 98 kfree(buf_to_free->serverDomain);
99 kfree(buf_to_free->serverNOS); 99 kfree(buf_to_free->serverNOS);
100 kfree(buf_to_free->password); 100 if (buf_to_free->password) {
101 memset(buf_to_free->password, 0, strlen(buf_to_free->password));
102 kfree(buf_to_free->password);
103 }
101 kfree(buf_to_free->domainName); 104 kfree(buf_to_free->domainName);
102 kfree(buf_to_free); 105 kfree(buf_to_free);
103} 106}
@@ -129,6 +132,10 @@ tconInfoFree(struct cifsTconInfo *buf_to_free)
129 } 132 }
130 atomic_dec(&tconInfoAllocCount); 133 atomic_dec(&tconInfoAllocCount);
131 kfree(buf_to_free->nativeFileSystem); 134 kfree(buf_to_free->nativeFileSystem);
135 if (buf_to_free->password) {
136 memset(buf_to_free->password, 0, strlen(buf_to_free->password));
137 kfree(buf_to_free->password);
138 }
132 kfree(buf_to_free); 139 kfree(buf_to_free);
133} 140}
134 141
@@ -338,13 +345,13 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
338 /* BB Add support for establishing new tCon and SMB Session */ 345 /* BB Add support for establishing new tCon and SMB Session */
339 /* with userid/password pairs found on the smb session */ 346 /* with userid/password pairs found on the smb session */
340 /* for other target tcp/ip addresses BB */ 347 /* for other target tcp/ip addresses BB */
341 if (current->fsuid != treeCon->ses->linux_uid) { 348 if (current_fsuid() != treeCon->ses->linux_uid) {
342 cFYI(1, ("Multiuser mode and UID " 349 cFYI(1, ("Multiuser mode and UID "
343 "did not match tcon uid")); 350 "did not match tcon uid"));
344 read_lock(&cifs_tcp_ses_lock); 351 read_lock(&cifs_tcp_ses_lock);
345 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) { 352 list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
346 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list); 353 ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
347 if (ses->linux_uid == current->fsuid) { 354 if (ses->linux_uid == current_fsuid()) {
348 if (ses->server == treeCon->ses->server) { 355 if (ses->server == treeCon->ses->server) {
349 cFYI(1, ("found matching uid substitute right smb_uid")); 356 cFYI(1, ("found matching uid substitute right smb_uid"));
350 buffer->Uid = ses->Suid; 357 buffer->Uid = ses->Suid;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 2851d5da0c8..5f22de7b79a 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -417,7 +417,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
417 /* BB calculate hash with password */ 417 /* BB calculate hash with password */
418 /* and copy into bcc */ 418 /* and copy into bcc */
419 419
420 calc_lanman_hash(ses, lnm_session_key); 420 calc_lanman_hash(ses->password, ses->server->cryptKey,
421 ses->server->secMode & SECMODE_PW_ENCRYPT ?
422 true : false, lnm_session_key);
423
421 ses->flags |= CIFS_SES_LANMAN; 424 ses->flags |= CIFS_SES_LANMAN;
422 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); 425 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
423 bcc_ptr += CIFS_SESS_KEY_SIZE; 426 bcc_ptr += CIFS_SESS_KEY_SIZE;
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 04943c976f9..224a1f47896 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -318,7 +318,8 @@ str_to_key(unsigned char *str, unsigned char *key)
318} 318}
319 319
320static void 320static void
321smbhash(unsigned char *out, unsigned char *in, unsigned char *key, int forw) 321smbhash(unsigned char *out, const unsigned char *in, unsigned char *key,
322 int forw)
322{ 323{
323 int i; 324 int i;
324 char *outb; /* outb[64] */ 325 char *outb; /* outb[64] */
@@ -363,7 +364,7 @@ E_P16(unsigned char *p14, unsigned char *p16)
363} 364}
364 365
365void 366void
366E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24) 367E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
367{ 368{
368 smbhash(p24, c8, p21, 1); 369 smbhash(p24, c8, p21, 1);
369 smbhash(p24 + 8, c8, p21 + 7, 1); 370 smbhash(p24 + 8, c8, p21 + 7, 1);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index ff3232fa101..93fb09a99c6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -49,9 +49,10 @@
49 49
50/*The following definitions come from libsmb/smbencrypt.c */ 50/*The following definitions come from libsmb/smbencrypt.c */
51 51
52void SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 52void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
53 unsigned char *p24);
53void E_md4hash(const unsigned char *passwd, unsigned char *p16); 54void E_md4hash(const unsigned char *passwd, unsigned char *p16);
54static void SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8, 55static void SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
55 unsigned char p24[24]); 56 unsigned char p24[24]);
56void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); 57void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
57 58
@@ -61,7 +62,7 @@ void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
61 encrypted password into p24 */ 62 encrypted password into p24 */
62/* Note that password must be uppercased and null terminated */ 63/* Note that password must be uppercased and null terminated */
63void 64void
64SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) 65SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
65{ 66{
66 unsigned char p14[15], p21[21]; 67 unsigned char p14[15], p21[21];
67 68
@@ -212,7 +213,7 @@ ntv2_owf_gen(const unsigned char owf[16], const char *user_n,
212 213
213/* Does the des encryption from the NT or LM MD4 hash. */ 214/* Does the des encryption from the NT or LM MD4 hash. */
214static void 215static void
215SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8, 216SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
216 unsigned char p24[24]) 217 unsigned char p24[24])
217{ 218{
218 unsigned char p21[21]; 219 unsigned char p21[21];
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ff8243a8fe3..7ebe6599ed3 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -37,15 +37,11 @@ extern mempool_t *cifs_mid_poolp;
37extern struct kmem_cache *cifs_oplock_cachep; 37extern struct kmem_cache *cifs_oplock_cachep;
38 38
39static struct mid_q_entry * 39static struct mid_q_entry *
40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) 40AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
41{ 41{
42 struct mid_q_entry *temp; 42 struct mid_q_entry *temp;
43 43
44 if (ses == NULL) { 44 if (server == NULL) {
45 cERROR(1, ("Null session passed in to AllocMidQEntry"));
46 return NULL;
47 }
48 if (ses->server == NULL) {
49 cERROR(1, ("Null TCP session in AllocMidQEntry")); 45 cERROR(1, ("Null TCP session in AllocMidQEntry"));
50 return NULL; 46 return NULL;
51 } 47 }
@@ -62,12 +58,11 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
62 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ 58 /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
63 /* when mid allocated can be before when sent */ 59 /* when mid allocated can be before when sent */
64 temp->when_alloc = jiffies; 60 temp->when_alloc = jiffies;
65 temp->ses = ses;
66 temp->tsk = current; 61 temp->tsk = current;
67 } 62 }
68 63
69 spin_lock(&GlobalMid_Lock); 64 spin_lock(&GlobalMid_Lock);
70 list_add_tail(&temp->qhead, &ses->server->pending_mid_q); 65 list_add_tail(&temp->qhead, &server->pending_mid_q);
71 atomic_inc(&midCount); 66 atomic_inc(&midCount);
72 temp->midState = MID_REQUEST_ALLOCATED; 67 temp->midState = MID_REQUEST_ALLOCATED;
73 spin_unlock(&GlobalMid_Lock); 68 spin_unlock(&GlobalMid_Lock);
@@ -349,37 +344,38 @@ static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
349 if (long_op == CIFS_ASYNC_OP) { 344 if (long_op == CIFS_ASYNC_OP) {
350 /* oplock breaks must not be held up */ 345 /* oplock breaks must not be held up */
351 atomic_inc(&ses->server->inFlight); 346 atomic_inc(&ses->server->inFlight);
352 } else { 347 return 0;
353 spin_lock(&GlobalMid_Lock); 348 }
354 while (1) { 349
355 if (atomic_read(&ses->server->inFlight) >= 350 spin_lock(&GlobalMid_Lock);
356 cifs_max_pending){ 351 while (1) {
357 spin_unlock(&GlobalMid_Lock); 352 if (atomic_read(&ses->server->inFlight) >=
353 cifs_max_pending){
354 spin_unlock(&GlobalMid_Lock);
358#ifdef CONFIG_CIFS_STATS2 355#ifdef CONFIG_CIFS_STATS2
359 atomic_inc(&ses->server->num_waiters); 356 atomic_inc(&ses->server->num_waiters);
360#endif 357#endif
361 wait_event(ses->server->request_q, 358 wait_event(ses->server->request_q,
362 atomic_read(&ses->server->inFlight) 359 atomic_read(&ses->server->inFlight)
363 < cifs_max_pending); 360 < cifs_max_pending);
364#ifdef CONFIG_CIFS_STATS2 361#ifdef CONFIG_CIFS_STATS2
365 atomic_dec(&ses->server->num_waiters); 362 atomic_dec(&ses->server->num_waiters);
366#endif 363#endif
367 spin_lock(&GlobalMid_Lock); 364 spin_lock(&GlobalMid_Lock);
368 } else { 365 } else {
369 if (ses->server->tcpStatus == CifsExiting) { 366 if (ses->server->tcpStatus == CifsExiting) {
370 spin_unlock(&GlobalMid_Lock);
371 return -ENOENT;
372 }
373
374 /* can not count locking commands against total
375 as they are allowed to block on server */
376
377 /* update # of requests on the wire to server */
378 if (long_op != CIFS_BLOCKING_OP)
379 atomic_inc(&ses->server->inFlight);
380 spin_unlock(&GlobalMid_Lock); 367 spin_unlock(&GlobalMid_Lock);
381 break; 368 return -ENOENT;
382 } 369 }
370
371 /* can not count locking commands against total
372 as they are allowed to block on server */
373
374 /* update # of requests on the wire to server */
375 if (long_op != CIFS_BLOCKING_OP)
376 atomic_inc(&ses->server->inFlight);
377 spin_unlock(&GlobalMid_Lock);
378 break;
383 } 379 }
384 } 380 }
385 return 0; 381 return 0;
@@ -390,17 +386,21 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
390{ 386{
391 if (ses->server->tcpStatus == CifsExiting) { 387 if (ses->server->tcpStatus == CifsExiting) {
392 return -ENOENT; 388 return -ENOENT;
393 } else if (ses->server->tcpStatus == CifsNeedReconnect) { 389 }
390
391 if (ses->server->tcpStatus == CifsNeedReconnect) {
394 cFYI(1, ("tcp session dead - return to caller to retry")); 392 cFYI(1, ("tcp session dead - return to caller to retry"));
395 return -EAGAIN; 393 return -EAGAIN;
396 } else if (ses->status != CifsGood) { 394 }
395
396 if (ses->status != CifsGood) {
397 /* check if SMB session is bad because we are setting it up */ 397 /* check if SMB session is bad because we are setting it up */
398 if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && 398 if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
399 (in_buf->Command != SMB_COM_NEGOTIATE)) 399 (in_buf->Command != SMB_COM_NEGOTIATE))
400 return -EAGAIN; 400 return -EAGAIN;
401 /* else ok - we are setting up session */ 401 /* else ok - we are setting up session */
402 } 402 }
403 *ppmidQ = AllocMidQEntry(in_buf, ses); 403 *ppmidQ = AllocMidQEntry(in_buf, ses->server);
404 if (*ppmidQ == NULL) 404 if (*ppmidQ == NULL)
405 return -ENOMEM; 405 return -ENOMEM;
406 return 0; 406 return 0;
@@ -415,11 +415,8 @@ static int wait_for_response(struct cifsSesInfo *ses,
415 415
416 for (;;) { 416 for (;;) {
417 curr_timeout = timeout + jiffies; 417 curr_timeout = timeout + jiffies;
418 wait_event(ses->server->response_q, 418 wait_event_timeout(ses->server->response_q,
419 (!(midQ->midState == MID_REQUEST_SUBMITTED)) || 419 midQ->midState != MID_REQUEST_SUBMITTED, timeout);
420 time_after(jiffies, curr_timeout) ||
421 ((ses->server->tcpStatus != CifsGood) &&
422 (ses->server->tcpStatus != CifsNew)));
423 420
424 if (time_after(jiffies, curr_timeout) && 421 if (time_after(jiffies, curr_timeout) &&
425 (midQ->midState == MID_REQUEST_SUBMITTED) && 422 (midQ->midState == MID_REQUEST_SUBMITTED) &&
@@ -521,11 +518,11 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
521 and avoid races inside tcp sendmsg code that could cause corruption 518 and avoid races inside tcp sendmsg code that could cause corruption
522 of smb data */ 519 of smb data */
523 520
524 down(&ses->server->tcpSem); 521 mutex_lock(&ses->server->srv_mutex);
525 522
526 rc = allocate_mid(ses, in_buf, &midQ); 523 rc = allocate_mid(ses, in_buf, &midQ);
527 if (rc) { 524 if (rc) {
528 up(&ses->server->tcpSem); 525 mutex_unlock(&ses->server->srv_mutex);
529 cifs_small_buf_release(in_buf); 526 cifs_small_buf_release(in_buf);
530 /* Update # of requests on wire to server */ 527 /* Update # of requests on wire to server */
531 atomic_dec(&ses->server->inFlight); 528 atomic_dec(&ses->server->inFlight);
@@ -533,6 +530,11 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
533 return rc; 530 return rc;
534 } 531 }
535 rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number); 532 rc = cifs_sign_smb2(iov, n_vec, ses->server, &midQ->sequence_number);
533 if (rc) {
534 mutex_unlock(&ses->server->srv_mutex);
535 cifs_small_buf_release(in_buf);
536 goto out;
537 }
536 538
537 midQ->midState = MID_REQUEST_SUBMITTED; 539 midQ->midState = MID_REQUEST_SUBMITTED;
538#ifdef CONFIG_CIFS_STATS2 540#ifdef CONFIG_CIFS_STATS2
@@ -546,7 +548,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
546 midQ->when_sent = jiffies; 548 midQ->when_sent = jiffies;
547#endif 549#endif
548 550
549 up(&ses->server->tcpSem); 551 mutex_unlock(&ses->server->srv_mutex);
550 cifs_small_buf_release(in_buf); 552 cifs_small_buf_release(in_buf);
551 553
552 if (rc < 0) 554 if (rc < 0)
@@ -581,10 +583,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
581 wait_for_response(ses, midQ, timeout, 10 * HZ); 583 wait_for_response(ses, midQ, timeout, 10 * HZ);
582 584
583 spin_lock(&GlobalMid_Lock); 585 spin_lock(&GlobalMid_Lock);
584 if (midQ->resp_buf) { 586
585 spin_unlock(&GlobalMid_Lock); 587 if (midQ->resp_buf == NULL) {
586 receive_len = midQ->resp_buf->smb_buf_length;
587 } else {
588 cERROR(1, ("No response to cmd %d mid %d", 588 cERROR(1, ("No response to cmd %d mid %d",
589 midQ->command, midQ->mid)); 589 midQ->command, midQ->mid));
590 if (midQ->midState == MID_REQUEST_SUBMITTED) { 590 if (midQ->midState == MID_REQUEST_SUBMITTED) {
@@ -612,53 +612,59 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
612 return rc; 612 return rc;
613 } 613 }
614 614
615 spin_unlock(&GlobalMid_Lock);
616 receive_len = midQ->resp_buf->smb_buf_length;
617
615 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 618 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
616 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 619 cERROR(1, ("Frame too large received. Length: %d Xid: %d",
617 receive_len, xid)); 620 receive_len, xid));
618 rc = -EIO; 621 rc = -EIO;
619 } else { /* rcvd frame is ok */ 622 goto out;
620 if (midQ->resp_buf && 623 }
621 (midQ->midState == MID_RESPONSE_RECEIVED)) { 624
622 625 /* rcvd frame is ok */
623 iov[0].iov_base = (char *)midQ->resp_buf; 626
624 if (midQ->largeBuf) 627 if (midQ->resp_buf &&
625 *pRespBufType = CIFS_LARGE_BUFFER; 628 (midQ->midState == MID_RESPONSE_RECEIVED)) {
626 else 629
627 *pRespBufType = CIFS_SMALL_BUFFER; 630 iov[0].iov_base = (char *)midQ->resp_buf;
628 iov[0].iov_len = receive_len + 4; 631 if (midQ->largeBuf)
629 632 *pRespBufType = CIFS_LARGE_BUFFER;
630 dump_smb(midQ->resp_buf, 80); 633 else
631 /* convert the length into a more usable form */ 634 *pRespBufType = CIFS_SMALL_BUFFER;
632 if ((receive_len > 24) && 635 iov[0].iov_len = receive_len + 4;
633 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 636
634 SECMODE_SIGN_ENABLED))) { 637 dump_smb(midQ->resp_buf, 80);
635 rc = cifs_verify_signature(midQ->resp_buf, 638 /* convert the length into a more usable form */
639 if ((receive_len > 24) &&
640 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
641 SECMODE_SIGN_ENABLED))) {
642 rc = cifs_verify_signature(midQ->resp_buf,
636 &ses->server->mac_signing_key, 643 &ses->server->mac_signing_key,
637 midQ->sequence_number+1); 644 midQ->sequence_number+1);
638 if (rc) { 645 if (rc) {
639 cERROR(1, ("Unexpected SMB signature")); 646 cERROR(1, ("Unexpected SMB signature"));
640 /* BB FIXME add code to kill session */ 647 /* BB FIXME add code to kill session */
641 }
642 } 648 }
643
644 /* BB special case reconnect tid and uid here? */
645 rc = map_smb_to_linux_error(midQ->resp_buf,
646 flags & CIFS_LOG_ERROR);
647
648 /* convert ByteCount if necessary */
649 if (receive_len >= sizeof(struct smb_hdr) - 4
650 /* do not count RFC1001 header */ +
651 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
652 BCC(midQ->resp_buf) =
653 le16_to_cpu(BCC_LE(midQ->resp_buf));
654 if ((flags & CIFS_NO_RESP) == 0)
655 midQ->resp_buf = NULL; /* mark it so buf will
656 not be freed by
657 DeleteMidQEntry */
658 } else {
659 rc = -EIO;
660 cFYI(1, ("Bad MID state?"));
661 } 649 }
650
651 /* BB special case reconnect tid and uid here? */
652 rc = map_smb_to_linux_error(midQ->resp_buf,
653 flags & CIFS_LOG_ERROR);
654
655 /* convert ByteCount if necessary */
656 if (receive_len >= sizeof(struct smb_hdr) - 4
657 /* do not count RFC1001 header */ +
658 (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
659 BCC(midQ->resp_buf) =
660 le16_to_cpu(BCC_LE(midQ->resp_buf));
661 if ((flags & CIFS_NO_RESP) == 0)
662 midQ->resp_buf = NULL; /* mark it so buf will
663 not be freed by
664 DeleteMidQEntry */
665 } else {
666 rc = -EIO;
667 cFYI(1, ("Bad MID state?"));
662 } 668 }
663 669
664out: 670out:
@@ -695,6 +701,12 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
695 to the same server. We may make this configurable later or 701 to the same server. We may make this configurable later or
696 use ses->maxReq */ 702 use ses->maxReq */
697 703
704 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
705 cERROR(1, ("Illegal length, greater than maximum frame, %d",
706 in_buf->smb_buf_length));
707 return -EIO;
708 }
709
698 rc = wait_for_free_request(ses, long_op); 710 rc = wait_for_free_request(ses, long_op);
699 if (rc) 711 if (rc)
700 return rc; 712 return rc;
@@ -703,29 +715,22 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
703 and avoid races inside tcp sendmsg code that could cause corruption 715 and avoid races inside tcp sendmsg code that could cause corruption
704 of smb data */ 716 of smb data */
705 717
706 down(&ses->server->tcpSem); 718 mutex_lock(&ses->server->srv_mutex);
707 719
708 rc = allocate_mid(ses, in_buf, &midQ); 720 rc = allocate_mid(ses, in_buf, &midQ);
709 if (rc) { 721 if (rc) {
710 up(&ses->server->tcpSem); 722 mutex_unlock(&ses->server->srv_mutex);
711 /* Update # of requests on wire to server */ 723 /* Update # of requests on wire to server */
712 atomic_dec(&ses->server->inFlight); 724 atomic_dec(&ses->server->inFlight);
713 wake_up(&ses->server->request_q); 725 wake_up(&ses->server->request_q);
714 return rc; 726 return rc;
715 } 727 }
716 728
717 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
718 cERROR(1, ("Illegal length, greater than maximum frame, %d",
719 in_buf->smb_buf_length));
720 DeleteMidQEntry(midQ);
721 up(&ses->server->tcpSem);
722 /* Update # of requests on wire to server */
723 atomic_dec(&ses->server->inFlight);
724 wake_up(&ses->server->request_q);
725 return -EIO;
726 }
727
728 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 729 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
730 if (rc) {
731 mutex_unlock(&ses->server->srv_mutex);
732 goto out;
733 }
729 734
730 midQ->midState = MID_REQUEST_SUBMITTED; 735 midQ->midState = MID_REQUEST_SUBMITTED;
731#ifdef CONFIG_CIFS_STATS2 736#ifdef CONFIG_CIFS_STATS2
@@ -738,7 +743,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
738 atomic_dec(&ses->server->inSend); 743 atomic_dec(&ses->server->inSend);
739 midQ->when_sent = jiffies; 744 midQ->when_sent = jiffies;
740#endif 745#endif
741 up(&ses->server->tcpSem); 746 mutex_unlock(&ses->server->srv_mutex);
742 747
743 if (rc < 0) 748 if (rc < 0)
744 goto out; 749 goto out;
@@ -772,10 +777,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
772 wait_for_response(ses, midQ, timeout, 10 * HZ); 777 wait_for_response(ses, midQ, timeout, 10 * HZ);
773 778
774 spin_lock(&GlobalMid_Lock); 779 spin_lock(&GlobalMid_Lock);
775 if (midQ->resp_buf) { 780 if (midQ->resp_buf == NULL) {
776 spin_unlock(&GlobalMid_Lock);
777 receive_len = midQ->resp_buf->smb_buf_length;
778 } else {
779 cERROR(1, ("No response for cmd %d mid %d", 781 cERROR(1, ("No response for cmd %d mid %d",
780 midQ->command, midQ->mid)); 782 midQ->command, midQ->mid));
781 if (midQ->midState == MID_REQUEST_SUBMITTED) { 783 if (midQ->midState == MID_REQUEST_SUBMITTED) {
@@ -803,47 +805,52 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
803 return rc; 805 return rc;
804 } 806 }
805 807
808 spin_unlock(&GlobalMid_Lock);
809 receive_len = midQ->resp_buf->smb_buf_length;
810
806 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { 811 if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
807 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 812 cERROR(1, ("Frame too large received. Length: %d Xid: %d",
808 receive_len, xid)); 813 receive_len, xid));
809 rc = -EIO; 814 rc = -EIO;
810 } else { /* rcvd frame is ok */ 815 goto out;
811 816 }
812 if (midQ->resp_buf && out_buf 817
813 && (midQ->midState == MID_RESPONSE_RECEIVED)) { 818 /* rcvd frame is ok */
814 out_buf->smb_buf_length = receive_len; 819
815 memcpy((char *)out_buf + 4, 820 if (midQ->resp_buf && out_buf
816 (char *)midQ->resp_buf + 4, 821 && (midQ->midState == MID_RESPONSE_RECEIVED)) {
817 receive_len); 822 out_buf->smb_buf_length = receive_len;
818 823 memcpy((char *)out_buf + 4,
819 dump_smb(out_buf, 92); 824 (char *)midQ->resp_buf + 4,
820 /* convert the length into a more usable form */ 825 receive_len);
821 if ((receive_len > 24) && 826
822 (ses->server->secMode & (SECMODE_SIGN_REQUIRED | 827 dump_smb(out_buf, 92);
823 SECMODE_SIGN_ENABLED))) { 828 /* convert the length into a more usable form */
824 rc = cifs_verify_signature(out_buf, 829 if ((receive_len > 24) &&
830 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
831 SECMODE_SIGN_ENABLED))) {
832 rc = cifs_verify_signature(out_buf,
825 &ses->server->mac_signing_key, 833 &ses->server->mac_signing_key,
826 midQ->sequence_number+1); 834 midQ->sequence_number+1);
827 if (rc) { 835 if (rc) {
828 cERROR(1, ("Unexpected SMB signature")); 836 cERROR(1, ("Unexpected SMB signature"));
829 /* BB FIXME add code to kill session */ 837 /* BB FIXME add code to kill session */
830 }
831 } 838 }
839 }
832 840
833 *pbytes_returned = out_buf->smb_buf_length; 841 *pbytes_returned = out_buf->smb_buf_length;
834 842
835 /* BB special case reconnect tid and uid here? */ 843 /* BB special case reconnect tid and uid here? */
836 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ ); 844 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
837 845
838 /* convert ByteCount if necessary */ 846 /* convert ByteCount if necessary */
839 if (receive_len >= sizeof(struct smb_hdr) - 4 847 if (receive_len >= sizeof(struct smb_hdr) - 4
840 /* do not count RFC1001 header */ + 848 /* do not count RFC1001 header */ +
841 (2 * out_buf->WordCount) + 2 /* bcc */ ) 849 (2 * out_buf->WordCount) + 2 /* bcc */ )
842 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 850 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
843 } else { 851 } else {
844 rc = -EIO; 852 rc = -EIO;
845 cERROR(1, ("Bad MID state?")); 853 cERROR(1, ("Bad MID state?"));
846 }
847 } 854 }
848 855
849out: 856out:
@@ -866,16 +873,16 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
866 873
867 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0); 874 header_assemble(in_buf, SMB_COM_NT_CANCEL, tcon, 0);
868 in_buf->Mid = mid; 875 in_buf->Mid = mid;
869 down(&ses->server->tcpSem); 876 mutex_lock(&ses->server->srv_mutex);
870 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number); 877 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
871 if (rc) { 878 if (rc) {
872 up(&ses->server->tcpSem); 879 mutex_unlock(&ses->server->srv_mutex);
873 return rc; 880 return rc;
874 } 881 }
875 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length, 882 rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
876 (struct sockaddr *) &(ses->server->addr.sockAddr), 883 (struct sockaddr *) &(ses->server->addr.sockAddr),
877 ses->server->noblocksnd); 884 ses->server->noblocksnd);
878 up(&ses->server->tcpSem); 885 mutex_unlock(&ses->server->srv_mutex);
879 return rc; 886 return rc;
880} 887}
881 888
@@ -933,6 +940,12 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
933 to the same server. We may make this configurable later or 940 to the same server. We may make this configurable later or
934 use ses->maxReq */ 941 use ses->maxReq */
935 942
943 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
944 cERROR(1, ("Illegal length, greater than maximum frame, %d",
945 in_buf->smb_buf_length));
946 return -EIO;
947 }
948
936 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP); 949 rc = wait_for_free_request(ses, CIFS_BLOCKING_OP);
937 if (rc) 950 if (rc)
938 return rc; 951 return rc;
@@ -941,24 +954,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
941 and avoid races inside tcp sendmsg code that could cause corruption 954 and avoid races inside tcp sendmsg code that could cause corruption
942 of smb data */ 955 of smb data */
943 956
944 down(&ses->server->tcpSem); 957 mutex_lock(&ses->server->srv_mutex);
945 958
946 rc = allocate_mid(ses, in_buf, &midQ); 959 rc = allocate_mid(ses, in_buf, &midQ);
947 if (rc) { 960 if (rc) {
948 up(&ses->server->tcpSem); 961 mutex_unlock(&ses->server->srv_mutex);
949 return rc; 962 return rc;
950 } 963 }
951 964
952 if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { 965 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
953 up(&ses->server->tcpSem); 966 if (rc) {
954 cERROR(1, ("Illegal length, greater than maximum frame, %d",
955 in_buf->smb_buf_length));
956 DeleteMidQEntry(midQ); 967 DeleteMidQEntry(midQ);
957 return -EIO; 968 mutex_unlock(&ses->server->srv_mutex);
969 return rc;
958 } 970 }
959 971
960 rc = cifs_sign_smb(in_buf, ses->server, &midQ->sequence_number);
961
962 midQ->midState = MID_REQUEST_SUBMITTED; 972 midQ->midState = MID_REQUEST_SUBMITTED;
963#ifdef CONFIG_CIFS_STATS2 973#ifdef CONFIG_CIFS_STATS2
964 atomic_inc(&ses->server->inSend); 974 atomic_inc(&ses->server->inSend);
@@ -970,7 +980,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
970 atomic_dec(&ses->server->inSend); 980 atomic_dec(&ses->server->inSend);
971 midQ->when_sent = jiffies; 981 midQ->when_sent = jiffies;
972#endif 982#endif
973 up(&ses->server->tcpSem); 983 mutex_unlock(&ses->server->srv_mutex);
974 984
975 if (rc < 0) { 985 if (rc < 0) {
976 DeleteMidQEntry(midQ); 986 DeleteMidQEntry(midQ);
@@ -1052,44 +1062,48 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
1052 cERROR(1, ("Frame too large received. Length: %d Xid: %d", 1062 cERROR(1, ("Frame too large received. Length: %d Xid: %d",
1053 receive_len, xid)); 1063 receive_len, xid));
1054 rc = -EIO; 1064 rc = -EIO;
1055 } else { /* rcvd frame is ok */ 1065 goto out;
1056 1066 }
1057 if (midQ->resp_buf && out_buf
1058 && (midQ->midState == MID_RESPONSE_RECEIVED)) {
1059 out_buf->smb_buf_length = receive_len;
1060 memcpy((char *)out_buf + 4,
1061 (char *)midQ->resp_buf + 4,
1062 receive_len);
1063
1064 dump_smb(out_buf, 92);
1065 /* convert the length into a more usable form */
1066 if ((receive_len > 24) &&
1067 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
1068 SECMODE_SIGN_ENABLED))) {
1069 rc = cifs_verify_signature(out_buf,
1070 &ses->server->mac_signing_key,
1071 midQ->sequence_number+1);
1072 if (rc) {
1073 cERROR(1, ("Unexpected SMB signature"));
1074 /* BB FIXME add code to kill session */
1075 }
1076 }
1077 1067
1078 *pbytes_returned = out_buf->smb_buf_length; 1068 /* rcvd frame is ok */
1079 1069
1080 /* BB special case reconnect tid and uid here? */ 1070 if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
1081 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ ); 1071 rc = -EIO;
1072 cERROR(1, ("Bad MID state?"));
1073 goto out;
1074 }
1082 1075
1083 /* convert ByteCount if necessary */ 1076 out_buf->smb_buf_length = receive_len;
1084 if (receive_len >= sizeof(struct smb_hdr) - 4 1077 memcpy((char *)out_buf + 4,
1085 /* do not count RFC1001 header */ + 1078 (char *)midQ->resp_buf + 4,
1086 (2 * out_buf->WordCount) + 2 /* bcc */ ) 1079 receive_len);
1087 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf)); 1080
1088 } else { 1081 dump_smb(out_buf, 92);
1089 rc = -EIO; 1082 /* convert the length into a more usable form */
1090 cERROR(1, ("Bad MID state?")); 1083 if ((receive_len > 24) &&
1084 (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
1085 SECMODE_SIGN_ENABLED))) {
1086 rc = cifs_verify_signature(out_buf,
1087 &ses->server->mac_signing_key,
1088 midQ->sequence_number+1);
1089 if (rc) {
1090 cERROR(1, ("Unexpected SMB signature"));
1091 /* BB FIXME add code to kill session */
1091 } 1092 }
1092 } 1093 }
1094
1095 *pbytes_returned = out_buf->smb_buf_length;
1096
1097 /* BB special case reconnect tid and uid here? */
1098 rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
1099
1100 /* convert ByteCount if necessary */
1101 if (receive_len >= sizeof(struct smb_hdr) - 4
1102 /* do not count RFC1001 header */ +
1103 (2 * out_buf->WordCount) + 2 /* bcc */ )
1104 BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
1105
1106out:
1093 DeleteMidQEntry(midQ); 1107 DeleteMidQEntry(midQ);
1094 if (rstart && rc == -EACCES) 1108 if (rstart && rc == -EACCES)
1095 return -ERESTARTSYS; 1109 return -ERESTARTSYS;
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 8a2370341c7..a5bf5771a22 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -32,8 +32,8 @@ void coda_cache_enter(struct inode *inode, int mask)
32 struct coda_inode_info *cii = ITOC(inode); 32 struct coda_inode_info *cii = ITOC(inode);
33 33
34 cii->c_cached_epoch = atomic_read(&permission_epoch); 34 cii->c_cached_epoch = atomic_read(&permission_epoch);
35 if (cii->c_uid != current->fsuid) { 35 if (cii->c_uid != current_fsuid()) {
36 cii->c_uid = current->fsuid; 36 cii->c_uid = current_fsuid();
37 cii->c_cached_perm = mask; 37 cii->c_cached_perm = mask;
38 } else 38 } else
39 cii->c_cached_perm |= mask; 39 cii->c_cached_perm |= mask;
@@ -60,7 +60,7 @@ int coda_cache_check(struct inode *inode, int mask)
60 int hit; 60 int hit;
61 61
62 hit = (mask & cii->c_cached_perm) == mask && 62 hit = (mask & cii->c_cached_perm) == mask &&
63 cii->c_uid == current->fsuid && 63 cii->c_uid == current_fsuid() &&
64 cii->c_cached_epoch == atomic_read(&permission_epoch); 64 cii->c_cached_epoch == atomic_read(&permission_epoch);
65 65
66 return hit; 66 return hit;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 29137ff3ca6..466303db2df 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/stat.h> 15#include <linux/stat.h>
16#include <linux/cred.h>
16#include <linux/errno.h> 17#include <linux/errno.h>
17#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
18#include <linux/string.h> 19#include <linux/string.h>
@@ -174,7 +175,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
174 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 175 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
175 176
176 err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), 177 err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode),
177 coda_flags, coda_file->f_uid); 178 coda_flags, coda_file->f_cred->fsuid);
178 179
179 host_inode = cfi->cfi_container->f_path.dentry->d_inode; 180 host_inode = cfi->cfi_container->f_path.dentry->d_inode;
180 cii = ITOC(coda_inode); 181 cii = ITOC(coda_inode);
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index ce432bca95d..c274d949179 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -52,7 +52,7 @@ static void *alloc_upcall(int opcode, int size)
52 inp->ih.opcode = opcode; 52 inp->ih.opcode = opcode;
53 inp->ih.pid = current->pid; 53 inp->ih.pid = current->pid;
54 inp->ih.pgid = task_pgrp_nr(current); 54 inp->ih.pgid = task_pgrp_nr(current);
55 inp->ih.uid = current->fsuid; 55 inp->ih.uid = current_fsuid();
56 56
57 return (void*)inp; 57 return (void*)inp;
58} 58}
diff --git a/fs/compat.c b/fs/compat.c
index e5f49f53850..d1ece79b641 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1393,10 +1393,20 @@ int compat_do_execve(char * filename,
1393 if (!bprm) 1393 if (!bprm)
1394 goto out_ret; 1394 goto out_ret;
1395 1395
1396 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1397 if (retval < 0)
1398 goto out_free;
1399
1400 retval = -ENOMEM;
1401 bprm->cred = prepare_exec_creds();
1402 if (!bprm->cred)
1403 goto out_unlock;
1404 check_unsafe_exec(bprm);
1405
1396 file = open_exec(filename); 1406 file = open_exec(filename);
1397 retval = PTR_ERR(file); 1407 retval = PTR_ERR(file);
1398 if (IS_ERR(file)) 1408 if (IS_ERR(file))
1399 goto out_kfree; 1409 goto out_unlock;
1400 1410
1401 sched_exec(); 1411 sched_exec();
1402 1412
@@ -1410,14 +1420,10 @@ int compat_do_execve(char * filename,
1410 1420
1411 bprm->argc = compat_count(argv, MAX_ARG_STRINGS); 1421 bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
1412 if ((retval = bprm->argc) < 0) 1422 if ((retval = bprm->argc) < 0)
1413 goto out_mm; 1423 goto out;
1414 1424
1415 bprm->envc = compat_count(envp, MAX_ARG_STRINGS); 1425 bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
1416 if ((retval = bprm->envc) < 0) 1426 if ((retval = bprm->envc) < 0)
1417 goto out_mm;
1418
1419 retval = security_bprm_alloc(bprm);
1420 if (retval)
1421 goto out; 1427 goto out;
1422 1428
1423 retval = prepare_binprm(bprm); 1429 retval = prepare_binprm(bprm);
@@ -1438,19 +1444,16 @@ int compat_do_execve(char * filename,
1438 goto out; 1444 goto out;
1439 1445
1440 retval = search_binary_handler(bprm, regs); 1446 retval = search_binary_handler(bprm, regs);
1441 if (retval >= 0) { 1447 if (retval < 0)
1442 /* execve success */ 1448 goto out;
1443 security_bprm_free(bprm);
1444 acct_update_integrals(current);
1445 free_bprm(bprm);
1446 return retval;
1447 }
1448 1449
1449out: 1450 /* execve succeeded */
1450 if (bprm->security) 1451 mutex_unlock(&current->cred_exec_mutex);
1451 security_bprm_free(bprm); 1452 acct_update_integrals(current);
1453 free_bprm(bprm);
1454 return retval;
1452 1455
1453out_mm: 1456out:
1454 if (bprm->mm) 1457 if (bprm->mm)
1455 mmput(bprm->mm); 1458 mmput(bprm->mm);
1456 1459
@@ -1460,7 +1463,10 @@ out_file:
1460 fput(bprm->file); 1463 fput(bprm->file);
1461 } 1464 }
1462 1465
1463out_kfree: 1466out_unlock:
1467 mutex_unlock(&current->cred_exec_mutex);
1468
1469out_free:
1464 free_bprm(bprm); 1470 free_bprm(bprm);
1465 1471
1466out_ret: 1472out_ret:
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e6..e88c23b85a3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include "internal.h" 35#include "internal.h"
36 36
37
38int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 39
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 dentry->d_op = NULL; 947 dentry->d_op = NULL;
949 dentry->d_fsdata = NULL; 948 dentry->d_fsdata = NULL;
950 dentry->d_mounted = 0; 949 dentry->d_mounted = 0;
951#ifdef CONFIG_PROFILING
952 dentry->d_cookie = NULL;
953#endif
954 INIT_HLIST_NODE(&dentry->d_hash); 950 INIT_HLIST_NODE(&dentry->d_hash);
955 INIT_LIST_HEAD(&dentry->d_lru); 951 INIT_LIST_HEAD(&dentry->d_lru);
956 INIT_LIST_HEAD(&dentry->d_subdirs); 952 INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
1336 * 1332 *
1337 * Searches the children of the parent dentry for the name in question. If 1333 * Searches the children of the parent dentry for the name in question. If
1338 * the dentry is found its reference count is incremented and the dentry 1334 * the dentry is found its reference count is incremented and the dentry
1339 * is returned. The caller must use d_put to free the entry when it has 1335 * is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned on failure. 1336 * finished using it. %NULL is returned on failure.
1341 * 1337 *
1342 * __d_lookup is dcache_lock free. The hash list is protected using RCU. 1338 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1620,8 +1616,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 */ 1616 */
1621 memcpy(dentry->d_iname, target->d_name.name, 1617 memcpy(dentry->d_iname, target->d_name.name,
1622 target->d_name.len + 1); 1618 target->d_name.len + 1);
1619 dentry->d_name.len = target->d_name.len;
1620 return;
1623 } 1621 }
1624 } 1622 }
1623 do_switch(dentry->d_name.len, target->d_name.len);
1625} 1624}
1626 1625
1627/* 1626/*
@@ -1681,7 +1680,6 @@ already_unhashed:
1681 1680
1682 /* Switch the names.. */ 1681 /* Switch the names.. */
1683 switch_names(dentry, target); 1682 switch_names(dentry, target);
1684 do_switch(dentry->d_name.len, target->d_name.len);
1685 do_switch(dentry->d_name.hash, target->d_name.hash); 1683 do_switch(dentry->d_name.hash, target->d_name.hash);
1686 1684
1687 /* ... and switch the parents */ 1685 /* ... and switch the parents */
@@ -1791,7 +1789,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1791 struct dentry *dparent, *aparent; 1789 struct dentry *dparent, *aparent;
1792 1790
1793 switch_names(dentry, anon); 1791 switch_names(dentry, anon);
1794 do_switch(dentry->d_name.len, anon->d_name.len);
1795 do_switch(dentry->d_name.hash, anon->d_name.hash); 1792 do_switch(dentry->d_name.hash, anon->d_name.hash);
1796 1793
1797 dparent = dentry->d_parent; 1794 dparent = dentry->d_parent;
@@ -1911,7 +1908,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1911 * Convert a dentry into an ASCII path name. If the entry has been deleted 1908 * Convert a dentry into an ASCII path name. If the entry has been deleted
1912 * the string " (deleted)" is appended. Note that this is ambiguous. 1909 * the string " (deleted)" is appended. Note that this is ambiguous.
1913 * 1910 *
1914 * Returns the buffer or an error code if the path was too long. 1911 * Returns a pointer into the buffer or an error code if the
1912 * path was too long.
1915 * 1913 *
1916 * "buflen" should be positive. Caller holds the dcache_lock. 1914 * "buflen" should be positive. Caller holds the dcache_lock.
1917 * 1915 *
@@ -1987,7 +1985,10 @@ Elong:
1987 * Convert a dentry into an ASCII path name. If the entry has been deleted 1985 * Convert a dentry into an ASCII path name. If the entry has been deleted
1988 * the string " (deleted)" is appended. Note that this is ambiguous. 1986 * the string " (deleted)" is appended. Note that this is ambiguous.
1989 * 1987 *
1990 * Returns the buffer or an error code if the path was too long. 1988 * Returns a pointer into the buffer or an error code if the path was
1989 * too long. Note: Callers should use the returned pointer, not the passed
1990 * in buffer, to use the name! The implementation often starts at an offset
1991 * into the buffer, and may leave 0 bytes at the start.
1991 * 1992 *
1992 * "buflen" should be positive. 1993 * "buflen" should be positive.
1993 */ 1994 */
@@ -2313,9 +2314,6 @@ static void __init dcache_init(void)
2313/* SLAB cache for __getname() consumers */ 2314/* SLAB cache for __getname() consumers */
2314struct kmem_cache *names_cachep __read_mostly; 2315struct kmem_cache *names_cachep __read_mostly;
2315 2316
2316/* SLAB cache for file structures */
2317struct kmem_cache *filp_cachep __read_mostly;
2318
2319EXPORT_SYMBOL(d_genocide); 2317EXPORT_SYMBOL(d_genocide);
2320 2318
2321void __init vfs_caches_init_early(void) 2319void __init vfs_caches_init_early(void)
@@ -2337,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages)
2337 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, 2335 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
2338 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 2336 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2339 2337
2340 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
2341 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2342
2343 dcache_init(); 2338 dcache_init();
2344 inode_init(); 2339 inode_init();
2345 files_init(mempages); 2340 files_init(mempages);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619..180e9fec4ad 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
93{ 93{
94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, 94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
95 GFP_KERNEL); 95 GFP_KERNEL);
96 struct dentry *d;
96 if (!dcs) 97 if (!dcs)
97 return NULL; 98 return NULL;
98 99
99 path->dentry->d_cookie = dcs; 100 d = path->dentry;
101 spin_lock(&d->d_lock);
102 d->d_flags |= DCACHE_COOKIE;
103 spin_unlock(&d->d_lock);
104
100 dcs->path = *path; 105 dcs->path = *path;
101 path_get(path); 106 path_get(path);
102 hash_dcookie(dcs); 107 hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
119 goto out; 124 goto out;
120 } 125 }
121 126
122 dcs = path->dentry->d_cookie; 127 if (path->dentry->d_flags & DCACHE_COOKIE) {
123 128 dcs = find_dcookie((unsigned long)path->dentry);
124 if (!dcs) 129 } else {
125 dcs = alloc_dcookie(path); 130 dcs = alloc_dcookie(path);
126 131 if (!dcs) {
127 if (!dcs) { 132 err = -ENOMEM;
128 err = -ENOMEM; 133 goto out;
129 goto out; 134 }
130 } 135 }
131 136
132 *cookie = dcookie_value(dcs); 137 *cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
251 256
252static void free_dcookie(struct dcookie_struct * dcs) 257static void free_dcookie(struct dcookie_struct * dcs)
253{ 258{
254 dcs->path.dentry->d_cookie = NULL; 259 struct dentry *d = dcs->path.dentry;
260
261 spin_lock(&d->d_lock);
262 d->d_flags &= ~DCACHE_COOKIE;
263 spin_unlock(&d->d_lock);
264
255 path_put(&dcs->path); 265 path_put(&dcs->path);
256 kmem_cache_free(dcookie_cache, dcs); 266 kmem_cache_free(dcookie_cache, dcs);
257} 267}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 4a714f6c1be..fff96e152c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
32 * instance) mode. To prevent surprises in user space, set permissions of
33 * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
34 * permissions.
35 */
36#define DEVPTS_DEFAULT_PTMX_MODE 0000
30#define PTMX_MINOR 2 37#define PTMX_MINOR 2
31 38
32extern int pty_limit; /* Config limit on Unix98 ptys */ 39extern int pty_limit; /* Config limit on Unix98 ptys */
33static DEFINE_IDA(allocated_ptys);
34static DEFINE_MUTEX(allocated_ptys_lock); 40static DEFINE_MUTEX(allocated_ptys_lock);
35 41
36static struct vfsmount *devpts_mnt; 42static struct vfsmount *devpts_mnt;
37static struct dentry *devpts_root;
38 43
39static struct { 44struct pts_mount_opts {
40 int setuid; 45 int setuid;
41 int setgid; 46 int setgid;
42 uid_t uid; 47 uid_t uid;
43 gid_t gid; 48 gid_t gid;
44 umode_t mode; 49 umode_t mode;
45} config = {.mode = DEVPTS_DEFAULT_MODE}; 50 umode_t ptmxmode;
51 int newinstance;
52};
46 53
47enum { 54enum {
48 Opt_uid, Opt_gid, Opt_mode, 55 Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
49 Opt_err 56 Opt_err
50}; 57};
51 58
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
53 {Opt_uid, "uid=%u"}, 60 {Opt_uid, "uid=%u"},
54 {Opt_gid, "gid=%u"}, 61 {Opt_gid, "gid=%u"},
55 {Opt_mode, "mode=%o"}, 62 {Opt_mode, "mode=%o"},
63#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
64 {Opt_ptmxmode, "ptmxmode=%o"},
65 {Opt_newinstance, "newinstance"},
66#endif
56 {Opt_err, NULL} 67 {Opt_err, NULL}
57}; 68};
58 69
59static int devpts_remount(struct super_block *sb, int *flags, char *data) 70struct pts_fs_info {
71 struct ida allocated_ptys;
72 struct pts_mount_opts mount_opts;
73 struct dentry *ptmx_dentry;
74};
75
76static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
77{
78 return sb->s_fs_info;
79}
80
81static inline struct super_block *pts_sb_from_inode(struct inode *inode)
82{
83#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
84 if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
85 return inode->i_sb;
86#endif
87 return devpts_mnt->mnt_sb;
88}
89
90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1
92
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
60{ 94{
61 char *p; 95 char *p;
62 96
63 config.setuid = 0; 97 opts->setuid = 0;
64 config.setgid = 0; 98 opts->setgid = 0;
65 config.uid = 0; 99 opts->uid = 0;
66 config.gid = 0; 100 opts->gid = 0;
67 config.mode = DEVPTS_DEFAULT_MODE; 101 opts->mode = DEVPTS_DEFAULT_MODE;
102 opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
103
104 /* newinstance makes sense only on initial mount */
105 if (op == PARSE_MOUNT)
106 opts->newinstance = 0;
68 107
69 while ((p = strsep(&data, ",")) != NULL) { 108 while ((p = strsep(&data, ",")) != NULL) {
70 substring_t args[MAX_OPT_ARGS]; 109 substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
79 case Opt_uid: 118 case Opt_uid:
80 if (match_int(&args[0], &option)) 119 if (match_int(&args[0], &option))
81 return -EINVAL; 120 return -EINVAL;
82 config.uid = option; 121 opts->uid = option;
83 config.setuid = 1; 122 opts->setuid = 1;
84 break; 123 break;
85 case Opt_gid: 124 case Opt_gid:
86 if (match_int(&args[0], &option)) 125 if (match_int(&args[0], &option))
87 return -EINVAL; 126 return -EINVAL;
88 config.gid = option; 127 opts->gid = option;
89 config.setgid = 1; 128 opts->setgid = 1;
90 break; 129 break;
91 case Opt_mode: 130 case Opt_mode:
92 if (match_octal(&args[0], &option)) 131 if (match_octal(&args[0], &option))
93 return -EINVAL; 132 return -EINVAL;
94 config.mode = option & S_IALLUGO; 133 opts->mode = option & S_IALLUGO;
134 break;
135#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
136 case Opt_ptmxmode:
137 if (match_octal(&args[0], &option))
138 return -EINVAL;
139 opts->ptmxmode = option & S_IALLUGO;
140 break;
141 case Opt_newinstance:
142 /* newinstance makes sense only on initial mount */
143 if (op == PARSE_MOUNT)
144 opts->newinstance = 1;
95 break; 145 break;
146#endif
96 default: 147 default:
97 printk(KERN_ERR "devpts: called with bogus options\n"); 148 printk(KERN_ERR "devpts: called with bogus options\n");
98 return -EINVAL; 149 return -EINVAL;
@@ -102,13 +153,108 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
102 return 0; 153 return 0;
103} 154}
104 155
156#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
157static int mknod_ptmx(struct super_block *sb)
158{
159 int mode;
160 int rc = -ENOMEM;
161 struct dentry *dentry;
162 struct inode *inode;
163 struct dentry *root = sb->s_root;
164 struct pts_fs_info *fsi = DEVPTS_SB(sb);
165 struct pts_mount_opts *opts = &fsi->mount_opts;
166
167 mutex_lock(&root->d_inode->i_mutex);
168
169 /* If we have already created ptmx node, return */
170 if (fsi->ptmx_dentry) {
171 rc = 0;
172 goto out;
173 }
174
175 dentry = d_alloc_name(root, "ptmx");
176 if (!dentry) {
177 printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
178 goto out;
179 }
180
181 /*
182 * Create a new 'ptmx' node in this mount of devpts.
183 */
184 inode = new_inode(sb);
185 if (!inode) {
186 printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
187 dput(dentry);
188 goto out;
189 }
190
191 inode->i_ino = 2;
192 inode->i_uid = inode->i_gid = 0;
193 inode->i_blocks = 0;
194 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
195
196 mode = S_IFCHR|opts->ptmxmode;
197 init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
198
199 d_add(dentry, inode);
200
201 fsi->ptmx_dentry = dentry;
202 rc = 0;
203
204 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
205 inode->i_ino);
206out:
207 mutex_unlock(&root->d_inode->i_mutex);
208 return rc;
209}
210
211static void update_ptmx_mode(struct pts_fs_info *fsi)
212{
213 struct inode *inode;
214 if (fsi->ptmx_dentry) {
215 inode = fsi->ptmx_dentry->d_inode;
216 inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
217 }
218}
219#else
220static inline void update_ptmx_mode(struct pts_fs_info *fsi)
221{
222 return;
223}
224#endif
225
226static int devpts_remount(struct super_block *sb, int *flags, char *data)
227{
228 int err;
229 struct pts_fs_info *fsi = DEVPTS_SB(sb);
230 struct pts_mount_opts *opts = &fsi->mount_opts;
231
232 err = parse_mount_options(data, PARSE_REMOUNT, opts);
233
234 /*
235 * parse_mount_options() restores options to default values
236 * before parsing and may have changed ptmxmode. So, update the
237 * mode in the inode too. Bogus options don't fail the remount,
238 * so do this even on error return.
239 */
240 update_ptmx_mode(fsi);
241
242 return err;
243}
244
105static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) 245static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
106{ 246{
107 if (config.setuid) 247 struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
108 seq_printf(seq, ",uid=%u", config.uid); 248 struct pts_mount_opts *opts = &fsi->mount_opts;
109 if (config.setgid) 249
110 seq_printf(seq, ",gid=%u", config.gid); 250 if (opts->setuid)
111 seq_printf(seq, ",mode=%03o", config.mode); 251 seq_printf(seq, ",uid=%u", opts->uid);
252 if (opts->setgid)
253 seq_printf(seq, ",gid=%u", opts->gid);
254 seq_printf(seq, ",mode=%03o", opts->mode);
255#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
256 seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
257#endif
112 258
113 return 0; 259 return 0;
114} 260}
@@ -119,10 +265,25 @@ static const struct super_operations devpts_sops = {
119 .show_options = devpts_show_options, 265 .show_options = devpts_show_options,
120}; 266};
121 267
268static void *new_pts_fs_info(void)
269{
270 struct pts_fs_info *fsi;
271
272 fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
273 if (!fsi)
274 return NULL;
275
276 ida_init(&fsi->allocated_ptys);
277 fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
278 fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
279
280 return fsi;
281}
282
122static int 283static int
123devpts_fill_super(struct super_block *s, void *data, int silent) 284devpts_fill_super(struct super_block *s, void *data, int silent)
124{ 285{
125 struct inode * inode; 286 struct inode *inode;
126 287
127 s->s_blocksize = 1024; 288 s->s_blocksize = 1024;
128 s->s_blocksize_bits = 10; 289 s->s_blocksize_bits = 10;
@@ -130,9 +291,13 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
130 s->s_op = &devpts_sops; 291 s->s_op = &devpts_sops;
131 s->s_time_gran = 1; 292 s->s_time_gran = 1;
132 293
294 s->s_fs_info = new_pts_fs_info();
295 if (!s->s_fs_info)
296 goto fail;
297
133 inode = new_inode(s); 298 inode = new_inode(s);
134 if (!inode) 299 if (!inode)
135 goto fail; 300 goto free_fsi;
136 inode->i_ino = 1; 301 inode->i_ino = 1;
137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 302 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
138 inode->i_blocks = 0; 303 inode->i_blocks = 0;
@@ -142,27 +307,226 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
142 inode->i_fop = &simple_dir_operations; 307 inode->i_fop = &simple_dir_operations;
143 inode->i_nlink = 2; 308 inode->i_nlink = 2;
144 309
145 devpts_root = s->s_root = d_alloc_root(inode); 310 s->s_root = d_alloc_root(inode);
146 if (s->s_root) 311 if (s->s_root)
147 return 0; 312 return 0;
148 313
149 printk("devpts: get root dentry failed\n"); 314 printk(KERN_ERR "devpts: get root dentry failed\n");
150 iput(inode); 315 iput(inode);
316
317free_fsi:
318 kfree(s->s_fs_info);
151fail: 319fail:
152 return -ENOMEM; 320 return -ENOMEM;
153} 321}
154 322
323#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
324static int compare_init_pts_sb(struct super_block *s, void *p)
325{
326 if (devpts_mnt)
327 return devpts_mnt->mnt_sb == s;
328 return 0;
329}
330
331/*
332 * Safely parse the mount options in @data and update @opts.
333 *
334 * devpts ends up parsing options two times during mount, due to the
335 * two modes of operation it supports. The first parse occurs in
336 * devpts_get_sb() when determining the mode (single-instance or
337 * multi-instance mode). The second parse happens in devpts_remount()
338 * or new_pts_mount() depending on the mode.
339 *
340 * Parsing of options modifies the @data making subsequent parsing
341 * incorrect. So make a local copy of @data and parse it.
342 *
343 * Return: 0 On success, -errno on error
344 */
345static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
346{
347 int rc;
348 void *datacp;
349
350 if (!data)
351 return 0;
352
353 /* Use kstrdup() ? */
354 datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
355 if (!datacp)
356 return -ENOMEM;
357
358 memcpy(datacp, data, PAGE_SIZE);
359 rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
360 kfree(datacp);
361
362 return rc;
363}
364
365/*
366 * Mount a new (private) instance of devpts. PTYs created in this
367 * instance are independent of the PTYs in other devpts instances.
368 */
369static int new_pts_mount(struct file_system_type *fs_type, int flags,
370 void *data, struct vfsmount *mnt)
371{
372 int err;
373 struct pts_fs_info *fsi;
374 struct pts_mount_opts *opts;
375
376 printk(KERN_NOTICE "devpts: newinstance mount\n");
377
378 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
379 if (err)
380 return err;
381
382 fsi = DEVPTS_SB(mnt->mnt_sb);
383 opts = &fsi->mount_opts;
384
385 err = parse_mount_options(data, PARSE_MOUNT, opts);
386 if (err)
387 goto fail;
388
389 err = mknod_ptmx(mnt->mnt_sb);
390 if (err)
391 goto fail;
392
393 return 0;
394
395fail:
396 dput(mnt->mnt_sb->s_root);
397 deactivate_super(mnt->mnt_sb);
398 return err;
399}
400
401/*
402 * Check if 'newinstance' mount option was specified in @data.
403 *
404 * Return: -errno on error (eg: invalid mount options specified)
405 * : 1 if 'newinstance' mount option was specified
406 * : 0 if 'newinstance' mount option was NOT specified
407 */
408static int is_new_instance_mount(void *data)
409{
410 int rc;
411 struct pts_mount_opts opts;
412
413 if (!data)
414 return 0;
415
416 rc = safe_parse_mount_options(data, &opts);
417 if (!rc)
418 rc = opts.newinstance;
419
420 return rc;
421}
422
423/*
424 * get_init_pts_sb()
425 *
426 * This interface is needed to support multiple namespace semantics in
427 * devpts while preserving backward compatibility of the current 'single-
428 * namespace' semantics. i.e all mounts of devpts without the 'newinstance'
429 * mount option should bind to the initial kernel mount, like
430 * get_sb_single().
431 *
432 * Mounts with 'newinstance' option create a new private namespace.
433 *
434 * But for single-mount semantics, devpts cannot use get_sb_single(),
435 * because get_sb_single()/sget() find and use the super-block from
436 * the most recent mount of devpts. But that recent mount may be a
437 * 'newinstance' mount and get_sb_single() would pick the newinstance
438 * super-block instead of the initial super-block.
439 *
440 * This interface is identical to get_sb_single() except that it
441 * consistently selects the 'single-namespace' superblock even in the
442 * presence of the private namespace (i.e 'newinstance') super-blocks.
443 */
444static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
445 void *data, struct vfsmount *mnt)
446{
447 struct super_block *s;
448 int error;
449
450 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
451 if (IS_ERR(s))
452 return PTR_ERR(s);
453
454 if (!s->s_root) {
455 s->s_flags = flags;
456 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
457 if (error) {
458 up_write(&s->s_umount);
459 deactivate_super(s);
460 return error;
461 }
462 s->s_flags |= MS_ACTIVE;
463 }
464 do_remount_sb(s, flags, data, 0);
465 return simple_set_mnt(mnt, s);
466}
467
468/*
469 * Mount or remount the initial kernel mount of devpts. This type of
470 * mount maintains the legacy, single-instance semantics, while the
471 * kernel still allows multiple-instances.
472 */
473static int init_pts_mount(struct file_system_type *fs_type, int flags,
474 void *data, struct vfsmount *mnt)
475{
476 int err;
477
478 err = get_init_pts_sb(fs_type, flags, data, mnt);
479 if (err)
480 return err;
481
482 err = mknod_ptmx(mnt->mnt_sb);
483 if (err) {
484 dput(mnt->mnt_sb->s_root);
485 deactivate_super(mnt->mnt_sb);
486 }
487
488 return err;
489}
490
155static int devpts_get_sb(struct file_system_type *fs_type, 491static int devpts_get_sb(struct file_system_type *fs_type,
156 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 492 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
157{ 493{
494 int new;
495
496 new = is_new_instance_mount(data);
497 if (new < 0)
498 return new;
499
500 if (new)
501 return new_pts_mount(fs_type, flags, data, mnt);
502
503 return init_pts_mount(fs_type, flags, data, mnt);
504}
505#else
506/*
507 * This supports only the legacy single-instance semantics (no
508 * multiple-instance semantics)
509 */
510static int devpts_get_sb(struct file_system_type *fs_type, int flags,
511 const char *dev_name, void *data, struct vfsmount *mnt)
512{
158 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 513 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
159} 514}
515#endif
516
517static void devpts_kill_sb(struct super_block *sb)
518{
519 struct pts_fs_info *fsi = DEVPTS_SB(sb);
520
521 kfree(fsi);
522 kill_litter_super(sb);
523}
160 524
161static struct file_system_type devpts_fs_type = { 525static struct file_system_type devpts_fs_type = {
162 .owner = THIS_MODULE, 526 .owner = THIS_MODULE,
163 .name = "devpts", 527 .name = "devpts",
164 .get_sb = devpts_get_sb, 528 .get_sb = devpts_get_sb,
165 .kill_sb = kill_anon_super, 529 .kill_sb = devpts_kill_sb,
166}; 530};
167 531
168/* 532/*
@@ -172,16 +536,17 @@ static struct file_system_type devpts_fs_type = {
172 536
173int devpts_new_index(struct inode *ptmx_inode) 537int devpts_new_index(struct inode *ptmx_inode)
174{ 538{
539 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
540 struct pts_fs_info *fsi = DEVPTS_SB(sb);
175 int index; 541 int index;
176 int ida_ret; 542 int ida_ret;
177 543
178retry: 544retry:
179 if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { 545 if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
180 return -ENOMEM; 546 return -ENOMEM;
181 }
182 547
183 mutex_lock(&allocated_ptys_lock); 548 mutex_lock(&allocated_ptys_lock);
184 ida_ret = ida_get_new(&allocated_ptys, &index); 549 ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
185 if (ida_ret < 0) { 550 if (ida_ret < 0) {
186 mutex_unlock(&allocated_ptys_lock); 551 mutex_unlock(&allocated_ptys_lock);
187 if (ida_ret == -EAGAIN) 552 if (ida_ret == -EAGAIN)
@@ -190,7 +555,7 @@ retry:
190 } 555 }
191 556
192 if (index >= pty_limit) { 557 if (index >= pty_limit) {
193 ida_remove(&allocated_ptys, index); 558 ida_remove(&fsi->allocated_ptys, index);
194 mutex_unlock(&allocated_ptys_lock); 559 mutex_unlock(&allocated_ptys_lock);
195 return -EIO; 560 return -EIO;
196 } 561 }
@@ -200,18 +565,26 @@ retry:
200 565
201void devpts_kill_index(struct inode *ptmx_inode, int idx) 566void devpts_kill_index(struct inode *ptmx_inode, int idx)
202{ 567{
568 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
569 struct pts_fs_info *fsi = DEVPTS_SB(sb);
570
203 mutex_lock(&allocated_ptys_lock); 571 mutex_lock(&allocated_ptys_lock);
204 ida_remove(&allocated_ptys, idx); 572 ida_remove(&fsi->allocated_ptys, idx);
205 mutex_unlock(&allocated_ptys_lock); 573 mutex_unlock(&allocated_ptys_lock);
206} 574}
207 575
208int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 576int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
209{ 577{
210 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ 578 /* tty layer puts index from devpts_new_index() in here */
579 int number = tty->index;
211 struct tty_driver *driver = tty->driver; 580 struct tty_driver *driver = tty->driver;
212 dev_t device = MKDEV(driver->major, driver->minor_start+number); 581 dev_t device = MKDEV(driver->major, driver->minor_start+number);
213 struct dentry *dentry; 582 struct dentry *dentry;
214 struct inode *inode = new_inode(devpts_mnt->mnt_sb); 583 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
584 struct inode *inode = new_inode(sb);
585 struct dentry *root = sb->s_root;
586 struct pts_fs_info *fsi = DEVPTS_SB(sb);
587 struct pts_mount_opts *opts = &fsi->mount_opts;
215 char s[12]; 588 char s[12];
216 589
217 /* We're supposed to be given the slave end of a pty */ 590 /* We're supposed to be given the slave end of a pty */
@@ -221,25 +594,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
221 if (!inode) 594 if (!inode)
222 return -ENOMEM; 595 return -ENOMEM;
223 596
224 inode->i_ino = number+2; 597 inode->i_ino = number + 3;
225 inode->i_uid = config.setuid ? config.uid : current->fsuid; 598 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
226 inode->i_gid = config.setgid ? config.gid : current->fsgid; 599 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
227 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 600 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
228 init_special_inode(inode, S_IFCHR|config.mode, device); 601 init_special_inode(inode, S_IFCHR|opts->mode, device);
229 inode->i_private = tty; 602 inode->i_private = tty;
230 tty->driver_data = inode; 603 tty->driver_data = inode;
231 604
232 sprintf(s, "%d", number); 605 sprintf(s, "%d", number);
233 606
234 mutex_lock(&devpts_root->d_inode->i_mutex); 607 mutex_lock(&root->d_inode->i_mutex);
235 608
236 dentry = d_alloc_name(devpts_root, s); 609 dentry = d_alloc_name(root, s);
237 if (!IS_ERR(dentry)) { 610 if (!IS_ERR(dentry)) {
238 d_add(dentry, inode); 611 d_add(dentry, inode);
239 fsnotify_create(devpts_root->d_inode, dentry); 612 fsnotify_create(root->d_inode, dentry);
240 } 613 }
241 614
242 mutex_unlock(&devpts_root->d_inode->i_mutex); 615 mutex_unlock(&root->d_inode->i_mutex);
243 616
244 return 0; 617 return 0;
245} 618}
@@ -256,20 +629,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
256void devpts_pty_kill(struct tty_struct *tty) 629void devpts_pty_kill(struct tty_struct *tty)
257{ 630{
258 struct inode *inode = tty->driver_data; 631 struct inode *inode = tty->driver_data;
632 struct super_block *sb = pts_sb_from_inode(inode);
633 struct dentry *root = sb->s_root;
259 struct dentry *dentry; 634 struct dentry *dentry;
260 635
261 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 636 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
262 637
263 mutex_lock(&devpts_root->d_inode->i_mutex); 638 mutex_lock(&root->d_inode->i_mutex);
264 639
265 dentry = d_find_alias(inode); 640 dentry = d_find_alias(inode);
266 if (dentry && !IS_ERR(dentry)) { 641 if (IS_ERR(dentry))
642 goto out;
643
644 if (dentry) {
267 inode->i_nlink--; 645 inode->i_nlink--;
268 d_delete(dentry); 646 d_delete(dentry);
269 dput(dentry); 647 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
270 } 648 }
271 649
272 mutex_unlock(&devpts_root->d_inode->i_mutex); 650 dput(dentry); /* d_find_alias above */
651out:
652 mutex_unlock(&root->d_inode->i_mutex);
273} 653}
274 654
275static int __init init_devpts_fs(void) 655static int __init init_devpts_fs(void)
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 18bda83cc89..aa2a5775a02 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -127,8 +127,8 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
127 127
128void dlm_timeout_warn(struct dlm_lkb *lkb) 128void dlm_timeout_warn(struct dlm_lkb *lkb)
129{ 129{
130 struct sk_buff *uninitialized_var(send_skb);
130 struct dlm_lock_data *data; 131 struct dlm_lock_data *data;
131 struct sk_buff *send_skb;
132 size_t size; 132 size_t size;
133 int rv; 133 int rv;
134 134
diff --git a/fs/dquot.c b/fs/dquot.c
index 5e95261005b..c237ccc8581 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -874,7 +874,7 @@ static inline int need_print_warning(struct dquot *dquot)
874 874
875 switch (dquot->dq_type) { 875 switch (dquot->dq_type) {
876 case USRQUOTA: 876 case USRQUOTA:
877 return current->fsuid == dquot->dq_id; 877 return current_fsuid() == dquot->dq_id;
878 case GRPQUOTA: 878 case GRPQUOTA:
879 return in_group_p(dquot->dq_id); 879 return in_group_p(dquot->dq_id);
880 } 880 }
@@ -981,7 +981,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
981 MINOR(dquot->dq_sb->s_dev)); 981 MINOR(dquot->dq_sb->s_dev));
982 if (ret) 982 if (ret)
983 goto attr_err_out; 983 goto attr_err_out;
984 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current->user->uid); 984 ret = nla_put_u64(skb, QUOTA_NL_A_CAUSED_ID, current_uid());
985 if (ret) 985 if (ret)
986 goto attr_err_out; 986 goto attr_err_out;
987 genlmsg_end(skb, msg_head); 987 genlmsg_end(skb, msg_head);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 3504cf9df35..a75026d35d1 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -691,7 +691,8 @@ int ecryptfs_init_kthread(void);
691void ecryptfs_destroy_kthread(void); 691void ecryptfs_destroy_kthread(void);
692int ecryptfs_privileged_open(struct file **lower_file, 692int ecryptfs_privileged_open(struct file **lower_file,
693 struct dentry *lower_dentry, 693 struct dentry *lower_dentry,
694 struct vfsmount *lower_mnt); 694 struct vfsmount *lower_mnt,
695 const struct cred *cred);
695int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); 696int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
696 697
697#endif /* #ifndef ECRYPTFS_KERNEL_H */ 698#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c..5e78fc17988 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -673,10 +673,11 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " 673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
674 "dentry->d_name.name = [%s]\n", dentry->d_name.name); 674 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
676 buf[rc] = '\0';
677 set_fs(old_fs); 676 set_fs(old_fs);
678 if (rc < 0) 677 if (rc < 0)
679 goto out_free; 678 goto out_free;
679 else
680 buf[rc] = '\0';
680 rc = 0; 681 rc = 0;
681 nd_set_link(nd, buf); 682 nd_set_link(nd, buf);
682 goto out; 683 goto out;
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c440c6b58b2..c6d7a4d748a 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -73,7 +73,7 @@ static int ecryptfs_threadfn(void *ignored)
73 mntget(req->lower_mnt); 73 mntget(req->lower_mnt);
74 (*req->lower_file) = dentry_open( 74 (*req->lower_file) = dentry_open(
75 req->lower_dentry, req->lower_mnt, 75 req->lower_dentry, req->lower_mnt,
76 (O_RDWR | O_LARGEFILE)); 76 (O_RDWR | O_LARGEFILE), current_cred());
77 req->flags |= ECRYPTFS_REQ_PROCESSED; 77 req->flags |= ECRYPTFS_REQ_PROCESSED;
78 } 78 }
79 wake_up(&req->wait); 79 wake_up(&req->wait);
@@ -132,7 +132,8 @@ void ecryptfs_destroy_kthread(void)
132 */ 132 */
133int ecryptfs_privileged_open(struct file **lower_file, 133int ecryptfs_privileged_open(struct file **lower_file,
134 struct dentry *lower_dentry, 134 struct dentry *lower_dentry,
135 struct vfsmount *lower_mnt) 135 struct vfsmount *lower_mnt,
136 const struct cred *cred)
136{ 137{
137 struct ecryptfs_open_req *req; 138 struct ecryptfs_open_req *req;
138 int rc = 0; 139 int rc = 0;
@@ -143,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
143 dget(lower_dentry); 144 dget(lower_dentry);
144 mntget(lower_mnt); 145 mntget(lower_mnt);
145 (*lower_file) = dentry_open(lower_dentry, lower_mnt, 146 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
146 (O_RDWR | O_LARGEFILE)); 147 (O_RDWR | O_LARGEFILE), cred);
147 if (!IS_ERR(*lower_file)) 148 if (!IS_ERR(*lower_file))
148 goto out; 149 goto out;
149 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL); 150 req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
@@ -184,7 +185,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
184 dget(lower_dentry); 185 dget(lower_dentry);
185 mntget(lower_mnt); 186 mntget(lower_mnt);
186 (*lower_file) = dentry_open(lower_dentry, lower_mnt, 187 (*lower_file) = dentry_open(lower_dentry, lower_mnt,
187 (O_RDONLY | O_LARGEFILE)); 188 (O_RDONLY | O_LARGEFILE), cred);
188 if (IS_ERR(*lower_file)) { 189 if (IS_ERR(*lower_file)) {
189 rc = PTR_ERR(*req->lower_file); 190 rc = PTR_ERR(*req->lower_file);
190 (*lower_file) = NULL; 191 (*lower_file) = NULL;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 64d2ba980df..fd630713c5c 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -115,6 +115,7 @@ void __ecryptfs_printk(const char *fmt, ...)
115 */ 115 */
116int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry) 116int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
117{ 117{
118 const struct cred *cred = current_cred();
118 struct ecryptfs_inode_info *inode_info = 119 struct ecryptfs_inode_info *inode_info =
119 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 120 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
120 int rc = 0; 121 int rc = 0;
@@ -127,7 +128,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
127 128
128 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 129 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
129 rc = ecryptfs_privileged_open(&inode_info->lower_file, 130 rc = ecryptfs_privileged_open(&inode_info->lower_file,
130 lower_dentry, lower_mnt); 131 lower_dentry, lower_mnt, cred);
131 if (rc || IS_ERR(inode_info->lower_file)) { 132 if (rc || IS_ERR(inode_info->lower_file)) {
132 printk(KERN_ERR "Error opening lower persistent file " 133 printk(KERN_ERR "Error opening lower persistent file "
133 "for lower_dentry [0x%p] and lower_mnt [0x%p]; " 134 "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index c6983978a31..6913f727624 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -360,7 +360,8 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
360 struct ecryptfs_msg_ctx *msg_ctx; 360 struct ecryptfs_msg_ctx *msg_ctx;
361 size_t msg_size; 361 size_t msg_size;
362 struct nsproxy *nsproxy; 362 struct nsproxy *nsproxy;
363 struct user_namespace *current_user_ns; 363 struct user_namespace *tsk_user_ns;
364 uid_t ctx_euid;
364 int rc; 365 int rc;
365 366
366 if (msg->index >= ecryptfs_message_buf_len) { 367 if (msg->index >= ecryptfs_message_buf_len) {
@@ -384,9 +385,9 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
384 mutex_unlock(&ecryptfs_daemon_hash_mux); 385 mutex_unlock(&ecryptfs_daemon_hash_mux);
385 goto wake_up; 386 goto wake_up;
386 } 387 }
387 current_user_ns = nsproxy->user_ns; 388 tsk_user_ns = __task_cred(msg_ctx->task)->user->user_ns;
388 rc = ecryptfs_find_daemon_by_euid(&daemon, msg_ctx->task->euid, 389 ctx_euid = task_euid(msg_ctx->task);
389 current_user_ns); 390 rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
390 rcu_read_unlock(); 391 rcu_read_unlock();
391 mutex_unlock(&ecryptfs_daemon_hash_mux); 392 mutex_unlock(&ecryptfs_daemon_hash_mux);
392 if (rc) { 393 if (rc) {
@@ -394,28 +395,28 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
394 printk(KERN_WARNING "%s: User [%d] received a " 395 printk(KERN_WARNING "%s: User [%d] received a "
395 "message response from process [0x%p] but does " 396 "message response from process [0x%p] but does "
396 "not have a registered daemon\n", __func__, 397 "not have a registered daemon\n", __func__,
397 msg_ctx->task->euid, pid); 398 ctx_euid, pid);
398 goto wake_up; 399 goto wake_up;
399 } 400 }
400 if (msg_ctx->task->euid != euid) { 401 if (ctx_euid != euid) {
401 rc = -EBADMSG; 402 rc = -EBADMSG;
402 printk(KERN_WARNING "%s: Received message from user " 403 printk(KERN_WARNING "%s: Received message from user "
403 "[%d]; expected message from user [%d]\n", __func__, 404 "[%d]; expected message from user [%d]\n", __func__,
404 euid, msg_ctx->task->euid); 405 euid, ctx_euid);
405 goto unlock; 406 goto unlock;
406 } 407 }
407 if (current_user_ns != user_ns) { 408 if (tsk_user_ns != user_ns) {
408 rc = -EBADMSG; 409 rc = -EBADMSG;
409 printk(KERN_WARNING "%s: Received message from user_ns " 410 printk(KERN_WARNING "%s: Received message from user_ns "
410 "[0x%p]; expected message from user_ns [0x%p]\n", 411 "[0x%p]; expected message from user_ns [0x%p]\n",
411 __func__, user_ns, nsproxy->user_ns); 412 __func__, user_ns, tsk_user_ns);
412 goto unlock; 413 goto unlock;
413 } 414 }
414 if (daemon->pid != pid) { 415 if (daemon->pid != pid) {
415 rc = -EBADMSG; 416 rc = -EBADMSG;
416 printk(KERN_ERR "%s: User [%d] sent a message response " 417 printk(KERN_ERR "%s: User [%d] sent a message response "
417 "from an unrecognized process [0x%p]\n", 418 "from an unrecognized process [0x%p]\n",
418 __func__, msg_ctx->task->euid, pid); 419 __func__, ctx_euid, pid);
419 goto unlock; 420 goto unlock;
420 } 421 }
421 if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { 422 if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) {
@@ -464,14 +465,14 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
464 struct ecryptfs_msg_ctx **msg_ctx) 465 struct ecryptfs_msg_ctx **msg_ctx)
465{ 466{
466 struct ecryptfs_daemon *daemon; 467 struct ecryptfs_daemon *daemon;
468 uid_t euid = current_euid();
467 int rc; 469 int rc;
468 470
469 rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, 471 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
470 current->nsproxy->user_ns);
471 if (rc || !daemon) { 472 if (rc || !daemon) {
472 rc = -ENOTCONN; 473 rc = -ENOTCONN;
473 printk(KERN_ERR "%s: User [%d] does not have a daemon " 474 printk(KERN_ERR "%s: User [%d] does not have a daemon "
474 "registered\n", __func__, current->euid); 475 "registered\n", __func__, euid);
475 goto out; 476 goto out;
476 } 477 }
477 mutex_lock(&ecryptfs_msg_ctx_lists_mux); 478 mutex_lock(&ecryptfs_msg_ctx_lists_mux);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index b484792a099..efd95a0ed1e 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -42,12 +42,12 @@ ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
42{ 42{
43 struct ecryptfs_daemon *daemon; 43 struct ecryptfs_daemon *daemon;
44 unsigned int mask = 0; 44 unsigned int mask = 0;
45 uid_t euid = current_euid();
45 int rc; 46 int rc;
46 47
47 mutex_lock(&ecryptfs_daemon_hash_mux); 48 mutex_lock(&ecryptfs_daemon_hash_mux);
48 /* TODO: Just use file->private_data? */ 49 /* TODO: Just use file->private_data? */
49 rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, 50 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
50 current->nsproxy->user_ns);
51 BUG_ON(rc || !daemon); 51 BUG_ON(rc || !daemon);
52 mutex_lock(&daemon->mux); 52 mutex_lock(&daemon->mux);
53 mutex_unlock(&ecryptfs_daemon_hash_mux); 53 mutex_unlock(&ecryptfs_daemon_hash_mux);
@@ -83,6 +83,7 @@ static int
83ecryptfs_miscdev_open(struct inode *inode, struct file *file) 83ecryptfs_miscdev_open(struct inode *inode, struct file *file)
84{ 84{
85 struct ecryptfs_daemon *daemon = NULL; 85 struct ecryptfs_daemon *daemon = NULL;
86 uid_t euid = current_euid();
86 int rc; 87 int rc;
87 88
88 mutex_lock(&ecryptfs_daemon_hash_mux); 89 mutex_lock(&ecryptfs_daemon_hash_mux);
@@ -93,11 +94,9 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
93 "count; rc = [%d]\n", __func__, rc); 94 "count; rc = [%d]\n", __func__, rc);
94 goto out_unlock_daemon_list; 95 goto out_unlock_daemon_list;
95 } 96 }
96 rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, 97 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
97 current->nsproxy->user_ns);
98 if (rc || !daemon) { 98 if (rc || !daemon) {
99 rc = ecryptfs_spawn_daemon(&daemon, current->euid, 99 rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(),
100 current->nsproxy->user_ns,
101 task_pid(current)); 100 task_pid(current));
102 if (rc) { 101 if (rc) {
103 printk(KERN_ERR "%s: Error attempting to spawn daemon; " 102 printk(KERN_ERR "%s: Error attempting to spawn daemon; "
@@ -147,11 +146,11 @@ static int
147ecryptfs_miscdev_release(struct inode *inode, struct file *file) 146ecryptfs_miscdev_release(struct inode *inode, struct file *file)
148{ 147{
149 struct ecryptfs_daemon *daemon = NULL; 148 struct ecryptfs_daemon *daemon = NULL;
149 uid_t euid = current_euid();
150 int rc; 150 int rc;
151 151
152 mutex_lock(&ecryptfs_daemon_hash_mux); 152 mutex_lock(&ecryptfs_daemon_hash_mux);
153 rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, 153 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
154 current->nsproxy->user_ns);
155 BUG_ON(rc || !daemon); 154 BUG_ON(rc || !daemon);
156 mutex_lock(&daemon->mux); 155 mutex_lock(&daemon->mux);
157 BUG_ON(daemon->pid != task_pid(current)); 156 BUG_ON(daemon->pid != task_pid(current));
@@ -246,12 +245,12 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
246 char packet_length[3]; 245 char packet_length[3];
247 size_t i; 246 size_t i;
248 size_t total_length; 247 size_t total_length;
248 uid_t euid = current_euid();
249 int rc; 249 int rc;
250 250
251 mutex_lock(&ecryptfs_daemon_hash_mux); 251 mutex_lock(&ecryptfs_daemon_hash_mux);
252 /* TODO: Just use file->private_data? */ 252 /* TODO: Just use file->private_data? */
253 rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, 253 rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
254 current->nsproxy->user_ns);
255 BUG_ON(rc || !daemon); 254 BUG_ON(rc || !daemon);
256 mutex_lock(&daemon->mux); 255 mutex_lock(&daemon->mux);
257 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { 256 if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
@@ -290,8 +289,8 @@ check_list:
290 * message from the queue; try again */ 289 * message from the queue; try again */
291 goto check_list; 290 goto check_list;
292 } 291 }
293 BUG_ON(current->euid != daemon->euid); 292 BUG_ON(euid != daemon->euid);
294 BUG_ON(current->nsproxy->user_ns != daemon->user_ns); 293 BUG_ON(current_user_ns() != daemon->user_ns);
295 BUG_ON(task_pid(current) != daemon->pid); 294 BUG_ON(task_pid(current) != daemon->pid);
296 msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, 295 msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue,
297 struct ecryptfs_msg_ctx, daemon_out_list); 296 struct ecryptfs_msg_ctx, daemon_out_list);
@@ -414,6 +413,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
414 size_t packet_size, packet_size_length, i; 413 size_t packet_size, packet_size_length, i;
415 ssize_t sz = 0; 414 ssize_t sz = 0;
416 char *data; 415 char *data;
416 uid_t euid = current_euid();
417 int rc; 417 int rc;
418 418
419 if (count == 0) 419 if (count == 0)
@@ -463,8 +463,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
463 goto out_free; 463 goto out_free;
464 } 464 }
465 rc = ecryptfs_miscdev_response(&data[i], packet_size, 465 rc = ecryptfs_miscdev_response(&data[i], packet_size,
466 current->euid, 466 euid, current_user_ns(),
467 current->nsproxy->user_ns,
468 task_pid(current), seq); 467 task_pid(current), seq);
469 if (rc) 468 if (rc)
470 printk(KERN_WARNING "%s: Failed to deliver miscdev " 469 printk(KERN_WARNING "%s: Failed to deliver miscdev "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac..46cec2b6979 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
288 loff_t prev_page_end_size; 288 loff_t prev_page_end_size;
289 int rc = 0; 289 int rc = 0;
290 290
291 page = __grab_cache_page(mapping, index); 291 page = grab_cache_page_write_begin(mapping, index, flags);
292 if (!page) 292 if (!page)
293 return -ENOMEM; 293 return -ENOMEM;
294 *pagep = page; 294 *pagep = page;
diff --git a/fs/exec.c b/fs/exec.c
index 4e834f16d9d..3ef9cf9b187 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,11 +55,7 @@
55#include <asm/uaccess.h> 55#include <asm/uaccess.h>
56#include <asm/mmu_context.h> 56#include <asm/mmu_context.h>
57#include <asm/tlb.h> 57#include <asm/tlb.h>
58 58#include "internal.h"
59#ifdef __alpha__
60/* for /sbin/loader handling in search_binary_handler() */
61#include <linux/a.out.h>
62#endif
63 59
64int core_uses_pid; 60int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core"; 61char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -126,7 +122,8 @@ asmlinkage long sys_uselib(const char __user * library)
126 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 122 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
127 goto exit; 123 goto exit;
128 124
129 error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); 125 error = inode_permission(nd.path.dentry->d_inode,
126 MAY_READ | MAY_EXEC | MAY_OPEN);
130 if (error) 127 if (error)
131 goto exit; 128 goto exit;
132 129
@@ -679,7 +676,7 @@ struct file *open_exec(const char *name)
679 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 676 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
680 goto out_path_put; 677 goto out_path_put;
681 678
682 err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); 679 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
683 if (err) 680 if (err)
684 goto out_path_put; 681 goto out_path_put;
685 682
@@ -772,7 +769,6 @@ static int de_thread(struct task_struct *tsk)
772 struct signal_struct *sig = tsk->signal; 769 struct signal_struct *sig = tsk->signal;
773 struct sighand_struct *oldsighand = tsk->sighand; 770 struct sighand_struct *oldsighand = tsk->sighand;
774 spinlock_t *lock = &oldsighand->siglock; 771 spinlock_t *lock = &oldsighand->siglock;
775 struct task_struct *leader = NULL;
776 int count; 772 int count;
777 773
778 if (thread_group_empty(tsk)) 774 if (thread_group_empty(tsk))
@@ -810,7 +806,7 @@ static int de_thread(struct task_struct *tsk)
810 * and to assume its PID: 806 * and to assume its PID:
811 */ 807 */
812 if (!thread_group_leader(tsk)) { 808 if (!thread_group_leader(tsk)) {
813 leader = tsk->group_leader; 809 struct task_struct *leader = tsk->group_leader;
814 810
815 sig->notify_count = -1; /* for exit_notify() */ 811 sig->notify_count = -1; /* for exit_notify() */
816 for (;;) { 812 for (;;) {
@@ -862,8 +858,9 @@ static int de_thread(struct task_struct *tsk)
862 858
863 BUG_ON(leader->exit_state != EXIT_ZOMBIE); 859 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
864 leader->exit_state = EXIT_DEAD; 860 leader->exit_state = EXIT_DEAD;
865
866 write_unlock_irq(&tasklist_lock); 861 write_unlock_irq(&tasklist_lock);
862
863 release_task(leader);
867 } 864 }
868 865
869 sig->group_exit_task = NULL; 866 sig->group_exit_task = NULL;
@@ -872,8 +869,6 @@ static int de_thread(struct task_struct *tsk)
872no_thread_group: 869no_thread_group:
873 exit_itimers(sig); 870 exit_itimers(sig);
874 flush_itimer_signals(); 871 flush_itimer_signals();
875 if (leader)
876 release_task(leader);
877 872
878 if (atomic_read(&oldsighand->count) != 1) { 873 if (atomic_read(&oldsighand->count) != 1) {
879 struct sighand_struct *newsighand; 874 struct sighand_struct *newsighand;
@@ -980,7 +975,7 @@ int flush_old_exec(struct linux_binprm * bprm)
980 /* This is the point of no return */ 975 /* This is the point of no return */
981 current->sas_ss_sp = current->sas_ss_size = 0; 976 current->sas_ss_sp = current->sas_ss_size = 0;
982 977
983 if (current->euid == current->uid && current->egid == current->gid) 978 if (current_euid() == current_uid() && current_egid() == current_gid())
984 set_dumpable(current->mm, 1); 979 set_dumpable(current->mm, 1);
985 else 980 else
986 set_dumpable(current->mm, suid_dumpable); 981 set_dumpable(current->mm, suid_dumpable);
@@ -1007,16 +1002,17 @@ int flush_old_exec(struct linux_binprm * bprm)
1007 */ 1002 */
1008 current->mm->task_size = TASK_SIZE; 1003 current->mm->task_size = TASK_SIZE;
1009 1004
1010 if (bprm->e_uid != current->euid || bprm->e_gid != current->egid) { 1005 /* install the new credentials */
1011 suid_keys(current); 1006 if (bprm->cred->uid != current_euid() ||
1012 set_dumpable(current->mm, suid_dumpable); 1007 bprm->cred->gid != current_egid()) {
1013 current->pdeath_signal = 0; 1008 current->pdeath_signal = 0;
1014 } else if (file_permission(bprm->file, MAY_READ) || 1009 } else if (file_permission(bprm->file, MAY_READ) ||
1015 (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { 1010 bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
1016 suid_keys(current);
1017 set_dumpable(current->mm, suid_dumpable); 1011 set_dumpable(current->mm, suid_dumpable);
1018 } 1012 }
1019 1013
1014 current->personality &= ~bprm->per_clear;
1015
1020 /* An exec changes our domain. We are no longer part of the thread 1016 /* An exec changes our domain. We are no longer part of the thread
1021 group */ 1017 group */
1022 1018
@@ -1033,13 +1029,50 @@ out:
1033 1029
1034EXPORT_SYMBOL(flush_old_exec); 1030EXPORT_SYMBOL(flush_old_exec);
1035 1031
1032/*
1033 * install the new credentials for this executable
1034 */
1035void install_exec_creds(struct linux_binprm *bprm)
1036{
1037 security_bprm_committing_creds(bprm);
1038
1039 commit_creds(bprm->cred);
1040 bprm->cred = NULL;
1041
1042 /* cred_exec_mutex must be held at least to this point to prevent
1043 * ptrace_attach() from altering our determination of the task's
1044 * credentials; any time after this it may be unlocked */
1045
1046 security_bprm_committed_creds(bprm);
1047}
1048EXPORT_SYMBOL(install_exec_creds);
1049
1050/*
1051 * determine how safe it is to execute the proposed program
1052 * - the caller must hold current->cred_exec_mutex to protect against
1053 * PTRACE_ATTACH
1054 */
1055void check_unsafe_exec(struct linux_binprm *bprm)
1056{
1057 struct task_struct *p = current;
1058
1059 bprm->unsafe = tracehook_unsafe_exec(p);
1060
1061 if (atomic_read(&p->fs->count) > 1 ||
1062 atomic_read(&p->files->count) > 1 ||
1063 atomic_read(&p->sighand->count) > 1)
1064 bprm->unsafe |= LSM_UNSAFE_SHARE;
1065}
1066
1036/* 1067/*
1037 * Fill the binprm structure from the inode. 1068 * Fill the binprm structure from the inode.
1038 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 1069 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1070 *
1071 * This may be called multiple times for binary chains (scripts for example).
1039 */ 1072 */
1040int prepare_binprm(struct linux_binprm *bprm) 1073int prepare_binprm(struct linux_binprm *bprm)
1041{ 1074{
1042 int mode; 1075 umode_t mode;
1043 struct inode * inode = bprm->file->f_path.dentry->d_inode; 1076 struct inode * inode = bprm->file->f_path.dentry->d_inode;
1044 int retval; 1077 int retval;
1045 1078
@@ -1047,14 +1080,15 @@ int prepare_binprm(struct linux_binprm *bprm)
1047 if (bprm->file->f_op == NULL) 1080 if (bprm->file->f_op == NULL)
1048 return -EACCES; 1081 return -EACCES;
1049 1082
1050 bprm->e_uid = current->euid; 1083 /* clear any previous set[ug]id data from a previous binary */
1051 bprm->e_gid = current->egid; 1084 bprm->cred->euid = current_euid();
1085 bprm->cred->egid = current_egid();
1052 1086
1053 if(!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) { 1087 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
1054 /* Set-uid? */ 1088 /* Set-uid? */
1055 if (mode & S_ISUID) { 1089 if (mode & S_ISUID) {
1056 current->personality &= ~PER_CLEAR_ON_SETID; 1090 bprm->per_clear |= PER_CLEAR_ON_SETID;
1057 bprm->e_uid = inode->i_uid; 1091 bprm->cred->euid = inode->i_uid;
1058 } 1092 }
1059 1093
1060 /* Set-gid? */ 1094 /* Set-gid? */
@@ -1064,52 +1098,23 @@ int prepare_binprm(struct linux_binprm *bprm)
1064 * executable. 1098 * executable.
1065 */ 1099 */
1066 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { 1100 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1067 current->personality &= ~PER_CLEAR_ON_SETID; 1101 bprm->per_clear |= PER_CLEAR_ON_SETID;
1068 bprm->e_gid = inode->i_gid; 1102 bprm->cred->egid = inode->i_gid;
1069 } 1103 }
1070 } 1104 }
1071 1105
1072 /* fill in binprm security blob */ 1106 /* fill in binprm security blob */
1073 retval = security_bprm_set(bprm); 1107 retval = security_bprm_set_creds(bprm);
1074 if (retval) 1108 if (retval)
1075 return retval; 1109 return retval;
1110 bprm->cred_prepared = 1;
1076 1111
1077 memset(bprm->buf,0,BINPRM_BUF_SIZE); 1112 memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1078 return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); 1113 return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
1079} 1114}
1080 1115
1081EXPORT_SYMBOL(prepare_binprm); 1116EXPORT_SYMBOL(prepare_binprm);
1082 1117
1083static int unsafe_exec(struct task_struct *p)
1084{
1085 int unsafe = tracehook_unsafe_exec(p);
1086
1087 if (atomic_read(&p->fs->count) > 1 ||
1088 atomic_read(&p->files->count) > 1 ||
1089 atomic_read(&p->sighand->count) > 1)
1090 unsafe |= LSM_UNSAFE_SHARE;
1091
1092 return unsafe;
1093}
1094
1095void compute_creds(struct linux_binprm *bprm)
1096{
1097 int unsafe;
1098
1099 if (bprm->e_uid != current->uid) {
1100 suid_keys(current);
1101 current->pdeath_signal = 0;
1102 }
1103 exec_keys(current);
1104
1105 task_lock(current);
1106 unsafe = unsafe_exec(current);
1107 security_bprm_apply_creds(bprm, unsafe);
1108 task_unlock(current);
1109 security_bprm_post_apply_creds(bprm);
1110}
1111EXPORT_SYMBOL(compute_creds);
1112
1113/* 1118/*
1114 * Arguments are '\0' separated strings found at the location bprm->p 1119 * Arguments are '\0' separated strings found at the location bprm->p
1115 * points to; chop off the first by relocating brpm->p to right after 1120 * points to; chop off the first by relocating brpm->p to right after
@@ -1159,43 +1164,10 @@ EXPORT_SYMBOL(remove_arg_zero);
1159 */ 1164 */
1160int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) 1165int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1161{ 1166{
1167 unsigned int depth = bprm->recursion_depth;
1162 int try,retval; 1168 int try,retval;
1163 struct linux_binfmt *fmt; 1169 struct linux_binfmt *fmt;
1164#ifdef __alpha__
1165 /* handle /sbin/loader.. */
1166 {
1167 struct exec * eh = (struct exec *) bprm->buf;
1168
1169 if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1170 (eh->fh.f_flags & 0x3000) == 0x3000)
1171 {
1172 struct file * file;
1173 unsigned long loader;
1174 1170
1175 allow_write_access(bprm->file);
1176 fput(bprm->file);
1177 bprm->file = NULL;
1178
1179 loader = bprm->vma->vm_end - sizeof(void *);
1180
1181 file = open_exec("/sbin/loader");
1182 retval = PTR_ERR(file);
1183 if (IS_ERR(file))
1184 return retval;
1185
1186 /* Remember if the application is TASO. */
1187 bprm->taso = eh->ah.entry < 0x100000000UL;
1188
1189 bprm->file = file;
1190 bprm->loader = loader;
1191 retval = prepare_binprm(bprm);
1192 if (retval<0)
1193 return retval;
1194 /* should call search_binary_handler recursively here,
1195 but it does not matter */
1196 }
1197 }
1198#endif
1199 retval = security_bprm_check(bprm); 1171 retval = security_bprm_check(bprm);
1200 if (retval) 1172 if (retval)
1201 return retval; 1173 return retval;
@@ -1219,8 +1191,15 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1219 continue; 1191 continue;
1220 read_unlock(&binfmt_lock); 1192 read_unlock(&binfmt_lock);
1221 retval = fn(bprm, regs); 1193 retval = fn(bprm, regs);
1194 /*
1195 * Restore the depth counter to its starting value
1196 * in this call, so we don't have to rely on every
1197 * load_binary function to restore it on return.
1198 */
1199 bprm->recursion_depth = depth;
1222 if (retval >= 0) { 1200 if (retval >= 0) {
1223 tracehook_report_exec(fmt, bprm, regs); 1201 if (depth == 0)
1202 tracehook_report_exec(fmt, bprm, regs);
1224 put_binfmt(fmt); 1203 put_binfmt(fmt);
1225 allow_write_access(bprm->file); 1204 allow_write_access(bprm->file);
1226 if (bprm->file) 1205 if (bprm->file)
@@ -1262,6 +1241,8 @@ EXPORT_SYMBOL(search_binary_handler);
1262void free_bprm(struct linux_binprm *bprm) 1241void free_bprm(struct linux_binprm *bprm)
1263{ 1242{
1264 free_arg_pages(bprm); 1243 free_arg_pages(bprm);
1244 if (bprm->cred)
1245 abort_creds(bprm->cred);
1265 kfree(bprm); 1246 kfree(bprm);
1266} 1247}
1267 1248
@@ -1287,10 +1268,20 @@ int do_execve(char * filename,
1287 if (!bprm) 1268 if (!bprm)
1288 goto out_files; 1269 goto out_files;
1289 1270
1271 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
1272 if (retval < 0)
1273 goto out_free;
1274
1275 retval = -ENOMEM;
1276 bprm->cred = prepare_exec_creds();
1277 if (!bprm->cred)
1278 goto out_unlock;
1279 check_unsafe_exec(bprm);
1280
1290 file = open_exec(filename); 1281 file = open_exec(filename);
1291 retval = PTR_ERR(file); 1282 retval = PTR_ERR(file);
1292 if (IS_ERR(file)) 1283 if (IS_ERR(file))
1293 goto out_kfree; 1284 goto out_unlock;
1294 1285
1295 sched_exec(); 1286 sched_exec();
1296 1287
@@ -1304,14 +1295,10 @@ int do_execve(char * filename,
1304 1295
1305 bprm->argc = count(argv, MAX_ARG_STRINGS); 1296 bprm->argc = count(argv, MAX_ARG_STRINGS);
1306 if ((retval = bprm->argc) < 0) 1297 if ((retval = bprm->argc) < 0)
1307 goto out_mm; 1298 goto out;
1308 1299
1309 bprm->envc = count(envp, MAX_ARG_STRINGS); 1300 bprm->envc = count(envp, MAX_ARG_STRINGS);
1310 if ((retval = bprm->envc) < 0) 1301 if ((retval = bprm->envc) < 0)
1311 goto out_mm;
1312
1313 retval = security_bprm_alloc(bprm);
1314 if (retval)
1315 goto out; 1302 goto out;
1316 1303
1317 retval = prepare_binprm(bprm); 1304 retval = prepare_binprm(bprm);
@@ -1333,21 +1320,18 @@ int do_execve(char * filename,
1333 1320
1334 current->flags &= ~PF_KTHREAD; 1321 current->flags &= ~PF_KTHREAD;
1335 retval = search_binary_handler(bprm,regs); 1322 retval = search_binary_handler(bprm,regs);
1336 if (retval >= 0) { 1323 if (retval < 0)
1337 /* execve success */ 1324 goto out;
1338 security_bprm_free(bprm);
1339 acct_update_integrals(current);
1340 free_bprm(bprm);
1341 if (displaced)
1342 put_files_struct(displaced);
1343 return retval;
1344 }
1345 1325
1346out: 1326 /* execve succeeded */
1347 if (bprm->security) 1327 mutex_unlock(&current->cred_exec_mutex);
1348 security_bprm_free(bprm); 1328 acct_update_integrals(current);
1329 free_bprm(bprm);
1330 if (displaced)
1331 put_files_struct(displaced);
1332 return retval;
1349 1333
1350out_mm: 1334out:
1351 if (bprm->mm) 1335 if (bprm->mm)
1352 mmput (bprm->mm); 1336 mmput (bprm->mm);
1353 1337
@@ -1356,7 +1340,11 @@ out_file:
1356 allow_write_access(bprm->file); 1340 allow_write_access(bprm->file);
1357 fput(bprm->file); 1341 fput(bprm->file);
1358 } 1342 }
1359out_kfree: 1343
1344out_unlock:
1345 mutex_unlock(&current->cred_exec_mutex);
1346
1347out_free:
1360 free_bprm(bprm); 1348 free_bprm(bprm);
1361 1349
1362out_files: 1350out_files:
@@ -1388,6 +1376,7 @@ EXPORT_SYMBOL(set_binfmt);
1388 */ 1376 */
1389static int format_corename(char *corename, long signr) 1377static int format_corename(char *corename, long signr)
1390{ 1378{
1379 const struct cred *cred = current_cred();
1391 const char *pat_ptr = core_pattern; 1380 const char *pat_ptr = core_pattern;
1392 int ispipe = (*pat_ptr == '|'); 1381 int ispipe = (*pat_ptr == '|');
1393 char *out_ptr = corename; 1382 char *out_ptr = corename;
@@ -1424,7 +1413,7 @@ static int format_corename(char *corename, long signr)
1424 /* uid */ 1413 /* uid */
1425 case 'u': 1414 case 'u':
1426 rc = snprintf(out_ptr, out_end - out_ptr, 1415 rc = snprintf(out_ptr, out_end - out_ptr,
1427 "%d", current->uid); 1416 "%d", cred->uid);
1428 if (rc > out_end - out_ptr) 1417 if (rc > out_end - out_ptr)
1429 goto out; 1418 goto out;
1430 out_ptr += rc; 1419 out_ptr += rc;
@@ -1432,7 +1421,7 @@ static int format_corename(char *corename, long signr)
1432 /* gid */ 1421 /* gid */
1433 case 'g': 1422 case 'g':
1434 rc = snprintf(out_ptr, out_end - out_ptr, 1423 rc = snprintf(out_ptr, out_end - out_ptr,
1435 "%d", current->gid); 1424 "%d", cred->gid);
1436 if (rc > out_end - out_ptr) 1425 if (rc > out_end - out_ptr)
1437 goto out; 1426 goto out;
1438 out_ptr += rc; 1427 out_ptr += rc;
@@ -1708,8 +1697,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1708 struct linux_binfmt * binfmt; 1697 struct linux_binfmt * binfmt;
1709 struct inode * inode; 1698 struct inode * inode;
1710 struct file * file; 1699 struct file * file;
1700 const struct cred *old_cred;
1701 struct cred *cred;
1711 int retval = 0; 1702 int retval = 0;
1712 int fsuid = current->fsuid;
1713 int flag = 0; 1703 int flag = 0;
1714 int ispipe = 0; 1704 int ispipe = 0;
1715 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1705 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
@@ -1722,12 +1712,20 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1722 binfmt = current->binfmt; 1712 binfmt = current->binfmt;
1723 if (!binfmt || !binfmt->core_dump) 1713 if (!binfmt || !binfmt->core_dump)
1724 goto fail; 1714 goto fail;
1715
1716 cred = prepare_creds();
1717 if (!cred) {
1718 retval = -ENOMEM;
1719 goto fail;
1720 }
1721
1725 down_write(&mm->mmap_sem); 1722 down_write(&mm->mmap_sem);
1726 /* 1723 /*
1727 * If another thread got here first, or we are not dumpable, bail out. 1724 * If another thread got here first, or we are not dumpable, bail out.
1728 */ 1725 */
1729 if (mm->core_state || !get_dumpable(mm)) { 1726 if (mm->core_state || !get_dumpable(mm)) {
1730 up_write(&mm->mmap_sem); 1727 up_write(&mm->mmap_sem);
1728 put_cred(cred);
1731 goto fail; 1729 goto fail;
1732 } 1730 }
1733 1731
@@ -1738,12 +1736,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1738 */ 1736 */
1739 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */ 1737 if (get_dumpable(mm) == 2) { /* Setuid core dump mode */
1740 flag = O_EXCL; /* Stop rewrite attacks */ 1738 flag = O_EXCL; /* Stop rewrite attacks */
1741 current->fsuid = 0; /* Dump root private */ 1739 cred->fsuid = 0; /* Dump root private */
1742 } 1740 }
1743 1741
1744 retval = coredump_wait(exit_code, &core_state); 1742 retval = coredump_wait(exit_code, &core_state);
1745 if (retval < 0) 1743 if (retval < 0) {
1744 put_cred(cred);
1746 goto fail; 1745 goto fail;
1746 }
1747
1748 old_cred = override_creds(cred);
1747 1749
1748 /* 1750 /*
1749 * Clear any false indication of pending signals that might 1751 * Clear any false indication of pending signals that might
@@ -1815,7 +1817,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1815 * Dont allow local users get cute and trick others to coredump 1817 * Dont allow local users get cute and trick others to coredump
1816 * into their pre-created files: 1818 * into their pre-created files:
1817 */ 1819 */
1818 if (inode->i_uid != current->fsuid) 1820 if (inode->i_uid != current_fsuid())
1819 goto close_fail; 1821 goto close_fail;
1820 if (!file->f_op) 1822 if (!file->f_op)
1821 goto close_fail; 1823 goto close_fail;
@@ -1834,7 +1836,8 @@ fail_unlock:
1834 if (helper_argv) 1836 if (helper_argv)
1835 argv_free(helper_argv); 1837 argv_free(helper_argv);
1836 1838
1837 current->fsuid = fsuid; 1839 revert_creds(old_cred);
1840 put_cred(cred);
1838 coredump_finish(mm); 1841 coredump_finish(mm);
1839fail: 1842fail:
1840 return retval; 1843 return retval;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 890e0182881..197c7db583c 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mount.h> 15#include <linux/mount.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/sched.h>
17 18
18#define dprintk(fmt, args...) do{}while(0) 19#define dprintk(fmt, args...) do{}while(0)
19 20
@@ -249,6 +250,7 @@ static int filldir_one(void * __buf, const char * name, int len,
249static int get_name(struct vfsmount *mnt, struct dentry *dentry, 250static int get_name(struct vfsmount *mnt, struct dentry *dentry,
250 char *name, struct dentry *child) 251 char *name, struct dentry *child)
251{ 252{
253 const struct cred *cred = current_cred();
252 struct inode *dir = dentry->d_inode; 254 struct inode *dir = dentry->d_inode;
253 int error; 255 int error;
254 struct file *file; 256 struct file *file;
@@ -263,7 +265,7 @@ static int get_name(struct vfsmount *mnt, struct dentry *dentry,
263 /* 265 /*
264 * Open the directory ... 266 * Open the directory ...
265 */ 267 */
266 file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY); 268 file = dentry_open(dget(dentry), mntget(mnt), O_RDONLY, cred);
267 error = PTR_ERR(file); 269 error = PTR_ERR(file);
268 if (IS_ERR(file)) 270 if (IS_ERR(file))
269 goto out; 271 goto out;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 6dac7ba2d22..4a29d637608 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -1193,7 +1193,7 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi)
1193 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1193 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1194 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); 1194 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1195 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1195 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1196 sbi->s_resuid != current->fsuid && 1196 sbi->s_resuid != current_fsuid() &&
1197 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1197 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1198 return 0; 1198 return 0;
1199 } 1199 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f5974134676..c454d5db28a 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -550,7 +550,7 @@ got:
550 550
551 sb->s_dirt = 1; 551 sb->s_dirt = 1;
552 mark_buffer_dirty(bh2); 552 mark_buffer_dirty(bh2);
553 inode->i_uid = current->fsuid; 553 inode->i_uid = current_fsuid();
554 if (test_opt (sb, GRPID)) 554 if (test_opt (sb, GRPID))
555 inode->i_gid = dir->i_gid; 555 inode->i_gid = dir->i_gid;
556 else if (dir->i_mode & S_ISGID) { 556 else if (dir->i_mode & S_ISGID) {
@@ -558,7 +558,7 @@ got:
558 if (S_ISDIR(mode)) 558 if (S_ISDIR(mode))
559 mode |= S_ISGID; 559 mode |= S_ISGID;
560 } else 560 } else
561 inode->i_gid = current->fsgid; 561 inode->i_gid = current_fsgid();
562 inode->i_mode = mode; 562 inode->i_mode = mode;
563 563
564 inode->i_ino = ino; 564 inode->i_ino = ino;
@@ -585,7 +585,10 @@ got:
585 spin_lock(&sbi->s_next_gen_lock); 585 spin_lock(&sbi->s_next_gen_lock);
586 inode->i_generation = sbi->s_next_generation++; 586 inode->i_generation = sbi->s_next_generation++;
587 spin_unlock(&sbi->s_next_gen_lock); 587 spin_unlock(&sbi->s_next_gen_lock);
588 insert_inode_hash(inode); 588 if (insert_inode_locked(inode) < 0) {
589 err = -EINVAL;
590 goto fail_drop;
591 }
589 592
590 if (DQUOT_ALLOC_INODE(inode)) { 593 if (DQUOT_ALLOC_INODE(inode)) {
591 err = -EDQUOT; 594 err = -EDQUOT;
@@ -612,6 +615,7 @@ fail_drop:
612 DQUOT_DROP(inode); 615 DQUOT_DROP(inode);
613 inode->i_flags |= S_NOQUOTA; 616 inode->i_flags |= S_NOQUOTA;
614 inode->i_nlink = 0; 617 inode->i_nlink = 0;
618 unlock_new_inode(inode);
615 iput(inode); 619 iput(inode);
616 return ERR_PTR(err); 620 return ERR_PTR(err);
617 621
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e265..02b39a5deb7 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h> 34#include <linux/fiemap.h>
35#include <linux/namei.h>
35#include "ext2.h" 36#include "ext2.h"
36#include "acl.h" 37#include "acl.h"
37#include "xip.h" 38#include "xip.h"
@@ -1286,9 +1287,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1286 else 1287 else
1287 inode->i_mapping->a_ops = &ext2_aops; 1288 inode->i_mapping->a_ops = &ext2_aops;
1288 } else if (S_ISLNK(inode->i_mode)) { 1289 } else if (S_ISLNK(inode->i_mode)) {
1289 if (ext2_inode_is_fast_symlink(inode)) 1290 if (ext2_inode_is_fast_symlink(inode)) {
1290 inode->i_op = &ext2_fast_symlink_inode_operations; 1291 inode->i_op = &ext2_fast_symlink_inode_operations;
1291 else { 1292 nd_terminate_link(ei->i_data, inode->i_size,
1293 sizeof(ei->i_data) - 1);
1294 } else {
1292 inode->i_op = &ext2_symlink_inode_operations; 1295 inode->i_op = &ext2_symlink_inode_operations;
1293 if (test_opt(inode->i_sb, NOBH)) 1296 if (test_opt(inode->i_sb, NOBH))
1294 inode->i_mapping->a_ops = &ext2_nobh_aops; 1297 inode->i_mapping->a_ops = &ext2_nobh_aops;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec1..90ea17998a7 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41 int err = ext2_add_link(dentry, inode); 41 int err = ext2_add_link(dentry, inode);
42 if (!err) { 42 if (!err) {
43 d_instantiate(dentry, inode); 43 d_instantiate(dentry, inode);
44 unlock_new_inode(inode);
44 return 0; 45 return 0;
45 } 46 }
46 inode_dec_link_count(inode); 47 inode_dec_link_count(inode);
48 unlock_new_inode(inode);
47 iput(inode); 49 iput(inode);
48 return err; 50 return err;
49} 51}
@@ -170,6 +172,7 @@ out:
170 172
171out_fail: 173out_fail:
172 inode_dec_link_count(inode); 174 inode_dec_link_count(inode);
175 unlock_new_inode(inode);
173 iput (inode); 176 iput (inode);
174 goto out; 177 goto out;
175} 178}
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
178 struct dentry *dentry) 181 struct dentry *dentry)
179{ 182{
180 struct inode *inode = old_dentry->d_inode; 183 struct inode *inode = old_dentry->d_inode;
184 int err;
181 185
182 if (inode->i_nlink >= EXT2_LINK_MAX) 186 if (inode->i_nlink >= EXT2_LINK_MAX)
183 return -EMLINK; 187 return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
186 inode_inc_link_count(inode); 190 inode_inc_link_count(inode);
187 atomic_inc(&inode->i_count); 191 atomic_inc(&inode->i_count);
188 192
189 return ext2_add_nondir(dentry, inode); 193 err = ext2_add_link(dentry, inode);
194 if (!err) {
195 d_instantiate(dentry, inode);
196 return 0;
197 }
198 inode_dec_link_count(inode);
199 iput(inode);
200 return err;
190} 201}
191 202
192static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) 203static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
222 goto out_fail; 233 goto out_fail;
223 234
224 d_instantiate(dentry, inode); 235 d_instantiate(dentry, inode);
236 unlock_new_inode(inode);
225out: 237out:
226 return err; 238 return err;
227 239
228out_fail: 240out_fail:
229 inode_dec_link_count(inode); 241 inode_dec_link_count(inode);
230 inode_dec_link_count(inode); 242 inode_dec_link_count(inode);
243 unlock_new_inode(inode);
231 iput(inode); 244 iput(inode);
232out_dir: 245out_dir:
233 inode_dec_link_count(dir); 246 inode_dec_link_count(dir);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index f5b57a2ca35..0dbf1c04847 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1422,7 +1422,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1422 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); 1422 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1423 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count); 1423 root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1424 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && 1424 if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1425 sbi->s_resuid != current->fsuid && 1425 sbi->s_resuid != current_fsuid() &&
1426 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { 1426 (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1427 return 0; 1427 return 0;
1428 } 1428 }
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 47b678d73e7..5655fbcbd11 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -539,7 +539,7 @@ got:
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 sb->s_dirt = 1; 540 sb->s_dirt = 1;
541 541
542 inode->i_uid = current->fsuid; 542 inode->i_uid = current_fsuid();
543 if (test_opt (sb, GRPID)) 543 if (test_opt (sb, GRPID))
544 inode->i_gid = dir->i_gid; 544 inode->i_gid = dir->i_gid;
545 else if (dir->i_mode & S_ISGID) { 545 else if (dir->i_mode & S_ISGID) {
@@ -547,7 +547,7 @@ got:
547 if (S_ISDIR(mode)) 547 if (S_ISDIR(mode))
548 mode |= S_ISGID; 548 mode |= S_ISGID;
549 } else 549 } else
550 inode->i_gid = current->fsgid; 550 inode->i_gid = current_fsgid();
551 inode->i_mode = mode; 551 inode->i_mode = mode;
552 552
553 inode->i_ino = ino; 553 inode->i_ino = ino;
@@ -579,7 +579,10 @@ got:
579 ext3_set_inode_flags(inode); 579 ext3_set_inode_flags(inode);
580 if (IS_DIRSYNC(inode)) 580 if (IS_DIRSYNC(inode))
581 handle->h_sync = 1; 581 handle->h_sync = 1;
582 insert_inode_hash(inode); 582 if (insert_inode_locked(inode) < 0) {
583 err = -EINVAL;
584 goto fail_drop;
585 }
583 spin_lock(&sbi->s_next_gen_lock); 586 spin_lock(&sbi->s_next_gen_lock);
584 inode->i_generation = sbi->s_next_generation++; 587 inode->i_generation = sbi->s_next_generation++;
585 spin_unlock(&sbi->s_next_gen_lock); 588 spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +630,7 @@ fail_drop:
627 DQUOT_DROP(inode); 630 DQUOT_DROP(inode);
628 inode->i_flags |= S_NOQUOTA; 631 inode->i_flags |= S_NOQUOTA;
629 inode->i_nlink = 0; 632 inode->i_nlink = 0;
633 unlock_new_inode(inode);
630 iput(inode); 634 iput(inode);
631 brelse(bitmap_bh); 635 brelse(bitmap_bh);
632 return ERR_PTR(err); 636 return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad8997..5fa453b49a6 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h>
40#include "xattr.h" 41#include "xattr.h"
41#include "acl.h" 42#include "acl.h"
42 43
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1160 to = from + len; 1161 to = from + len;
1161 1162
1162retry: 1163retry:
1163 page = __grab_cache_page(mapping, index); 1164 page = grab_cache_page_write_begin(mapping, index, flags);
1164 if (!page) 1165 if (!page)
1165 return -ENOMEM; 1166 return -ENOMEM;
1166 *pagep = page; 1167 *pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2817 inode->i_op = &ext3_dir_inode_operations; 2818 inode->i_op = &ext3_dir_inode_operations;
2818 inode->i_fop = &ext3_dir_operations; 2819 inode->i_fop = &ext3_dir_operations;
2819 } else if (S_ISLNK(inode->i_mode)) { 2820 } else if (S_ISLNK(inode->i_mode)) {
2820 if (ext3_inode_is_fast_symlink(inode)) 2821 if (ext3_inode_is_fast_symlink(inode)) {
2821 inode->i_op = &ext3_fast_symlink_inode_operations; 2822 inode->i_op = &ext3_fast_symlink_inode_operations;
2822 else { 2823 nd_terminate_link(ei->i_data, inode->i_size,
2824 sizeof(ei->i_data) - 1);
2825 } else {
2823 inode->i_op = &ext3_symlink_inode_operations; 2826 inode->i_op = &ext3_symlink_inode_operations;
2824 ext3_set_aops(inode); 2827 ext3_set_aops(inode);
2825 } 2828 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0..1dd2abe6313 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1652,9 +1652,11 @@ static int ext3_add_nondir(handle_t *handle,
1652 if (!err) { 1652 if (!err) {
1653 ext3_mark_inode_dirty(handle, inode); 1653 ext3_mark_inode_dirty(handle, inode);
1654 d_instantiate(dentry, inode); 1654 d_instantiate(dentry, inode);
1655 unlock_new_inode(inode);
1655 return 0; 1656 return 0;
1656 } 1657 }
1657 drop_nlink(inode); 1658 drop_nlink(inode);
1659 unlock_new_inode(inode);
1658 iput(inode); 1660 iput(inode);
1659 return err; 1661 return err;
1660} 1662}
@@ -1765,6 +1767,7 @@ retry:
1765 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1767 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1766 if (!dir_block) { 1768 if (!dir_block) {
1767 drop_nlink(inode); /* is this nlink == 0? */ 1769 drop_nlink(inode); /* is this nlink == 0? */
1770 unlock_new_inode(inode);
1768 ext3_mark_inode_dirty(handle, inode); 1771 ext3_mark_inode_dirty(handle, inode);
1769 iput (inode); 1772 iput (inode);
1770 goto out_stop; 1773 goto out_stop;
@@ -1792,6 +1795,7 @@ retry:
1792 err = ext3_add_entry (handle, dentry, inode); 1795 err = ext3_add_entry (handle, dentry, inode);
1793 if (err) { 1796 if (err) {
1794 inode->i_nlink = 0; 1797 inode->i_nlink = 0;
1798 unlock_new_inode(inode);
1795 ext3_mark_inode_dirty(handle, inode); 1799 ext3_mark_inode_dirty(handle, inode);
1796 iput (inode); 1800 iput (inode);
1797 goto out_stop; 1801 goto out_stop;
@@ -1800,6 +1804,7 @@ retry:
1800 ext3_update_dx_flag(dir); 1804 ext3_update_dx_flag(dir);
1801 ext3_mark_inode_dirty(handle, dir); 1805 ext3_mark_inode_dirty(handle, dir);
1802 d_instantiate(dentry, inode); 1806 d_instantiate(dentry, inode);
1807 unlock_new_inode(inode);
1803out_stop: 1808out_stop:
1804 ext3_journal_stop(handle); 1809 ext3_journal_stop(handle);
1805 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1810 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2175,10 @@ retry:
2170 * We have a transaction open. All is sweetness. It also sets 2175 * We have a transaction open. All is sweetness. It also sets
2171 * i_size in generic_commit_write(). 2176 * i_size in generic_commit_write().
2172 */ 2177 */
2173 err = __page_symlink(inode, symname, l, 2178 err = __page_symlink(inode, symname, l, 1);
2174 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2175 if (err) { 2179 if (err) {
2176 drop_nlink(inode); 2180 drop_nlink(inode);
2181 unlock_new_inode(inode);
2177 ext3_mark_inode_dirty(handle, inode); 2182 ext3_mark_inode_dirty(handle, inode);
2178 iput (inode); 2183 iput (inode);
2179 goto out_stop; 2184 goto out_stop;
@@ -2221,7 +2226,14 @@ retry:
2221 inc_nlink(inode); 2226 inc_nlink(inode);
2222 atomic_inc(&inode->i_count); 2227 atomic_inc(&inode->i_count);
2223 2228
2224 err = ext3_add_nondir(handle, dentry, inode); 2229 err = ext3_add_entry(handle, dentry, inode);
2230 if (!err) {
2231 ext3_mark_inode_dirty(handle, inode);
2232 d_instantiate(dentry, inode);
2233 } else {
2234 drop_nlink(inode);
2235 iput(inode);
2236 }
2225 ext3_journal_stop(handle); 2237 ext3_journal_stop(handle);
2226 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 2238 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2227 goto retry; 2239 goto retry;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2003cdc36a..38b3acf5683 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -609,8 +609,8 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
609 609
610 if (free_blocks - (nblocks + root_blocks + dirty_blocks) < 610 if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
611 EXT4_FREEBLOCKS_WATERMARK) { 611 EXT4_FREEBLOCKS_WATERMARK) {
612 free_blocks = percpu_counter_sum(fbc); 612 free_blocks = percpu_counter_sum_positive(fbc);
613 dirty_blocks = percpu_counter_sum(dbc); 613 dirty_blocks = percpu_counter_sum_positive(dbc);
614 if (dirty_blocks < 0) { 614 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 615 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 616 "went wrong %lld\n",
@@ -624,7 +624,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
624 return 1; 624 return 1;
625 625
626 /* Hm, nope. Are (enough) root reserved blocks available? */ 626 /* Hm, nope. Are (enough) root reserved blocks available? */
627 if (sbi->s_resuid == current->fsuid || 627 if (sbi->s_resuid == current_fsuid() ||
628 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) || 628 ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
629 capable(CAP_SYS_RESOURCE)) { 629 capable(CAP_SYS_RESOURCE)) {
630 if (free_blocks >= (nblocks + dirty_blocks)) 630 if (free_blocks >= (nblocks + dirty_blocks))
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df..b21f16713db 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -146,4 +146,10 @@ struct ext4_sb_info {
146 struct flex_groups *s_flex_groups; 146 struct flex_groups *s_flex_groups;
147}; 147};
148 148
149static inline spinlock_t *
150sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
151{
152 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
153}
154
149#endif /* _EXT4_SB */ 155#endif /* _EXT4_SB */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2a117e286e5..6e6052879aa 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ got:
787 spin_unlock(sb_bgl_lock(sbi, flex_group)); 787 spin_unlock(sb_bgl_lock(sbi, flex_group));
788 } 788 }
789 789
790 inode->i_uid = current->fsuid; 790 inode->i_uid = current_fsuid();
791 if (test_opt(sb, GRPID)) 791 if (test_opt(sb, GRPID))
792 inode->i_gid = dir->i_gid; 792 inode->i_gid = dir->i_gid;
793 else if (dir->i_mode & S_ISGID) { 793 else if (dir->i_mode & S_ISGID) {
@@ -795,7 +795,7 @@ got:
795 if (S_ISDIR(mode)) 795 if (S_ISDIR(mode))
796 mode |= S_ISGID; 796 mode |= S_ISGID;
797 } else 797 } else
798 inode->i_gid = current->fsgid; 798 inode->i_gid = current_fsgid();
799 inode->i_mode = mode; 799 inode->i_mode = mode;
800 800
801 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 801 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
@@ -826,7 +826,10 @@ got:
826 ext4_set_inode_flags(inode); 826 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 827 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 828 handle->h_sync = 1;
829 insert_inode_hash(inode); 829 if (insert_inode_locked(inode) < 0) {
830 err = -EINVAL;
831 goto fail_drop;
832 }
830 spin_lock(&sbi->s_next_gen_lock); 833 spin_lock(&sbi->s_next_gen_lock);
831 inode->i_generation = sbi->s_next_generation++; 834 inode->i_generation = sbi->s_next_generation++;
832 spin_unlock(&sbi->s_next_gen_lock); 835 spin_unlock(&sbi->s_next_gen_lock);
@@ -881,6 +884,7 @@ fail_drop:
881 DQUOT_DROP(inode); 884 DQUOT_DROP(inode);
882 inode->i_flags |= S_NOQUOTA; 885 inode->i_flags |= S_NOQUOTA;
883 inode->i_nlink = 0; 886 inode->i_nlink = 0;
887 unlock_new_inode(inode);
884 iput(inode); 888 iput(inode);
885 brelse(bitmap_bh); 889 brelse(bitmap_bh);
886 return ERR_PTR(err); 890 return ERR_PTR(err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33c..6702a49992a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h> 35#include <linux/pagevec.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/namei.h>
37#include <linux/uio.h> 38#include <linux/uio.h>
38#include <linux/bio.h> 39#include <linux/bio.h>
39#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -1345,7 +1346,7 @@ retry:
1345 goto out; 1346 goto out;
1346 } 1347 }
1347 1348
1348 page = __grab_cache_page(mapping, index); 1349 page = grab_cache_page_write_begin(mapping, index, flags);
1349 if (!page) { 1350 if (!page) {
1350 ext4_journal_stop(handle); 1351 ext4_journal_stop(handle);
1351 ret = -ENOMEM; 1352 ret = -ENOMEM;
@@ -2549,7 +2550,7 @@ retry:
2549 goto out; 2550 goto out;
2550 } 2551 }
2551 2552
2552 page = __grab_cache_page(mapping, index); 2553 page = grab_cache_page_write_begin(mapping, index, flags);
2553 if (!page) { 2554 if (!page) {
2554 ext4_journal_stop(handle); 2555 ext4_journal_stop(handle);
2555 ret = -ENOMEM; 2556 ret = -ENOMEM;
@@ -4164,9 +4165,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4164 inode->i_op = &ext4_dir_inode_operations; 4165 inode->i_op = &ext4_dir_inode_operations;
4165 inode->i_fop = &ext4_dir_operations; 4166 inode->i_fop = &ext4_dir_operations;
4166 } else if (S_ISLNK(inode->i_mode)) { 4167 } else if (S_ISLNK(inode->i_mode)) {
4167 if (ext4_inode_is_fast_symlink(inode)) 4168 if (ext4_inode_is_fast_symlink(inode)) {
4168 inode->i_op = &ext4_fast_symlink_inode_operations; 4169 inode->i_op = &ext4_fast_symlink_inode_operations;
4169 else { 4170 nd_terminate_link(ei->i_data, inode->i_size,
4171 sizeof(ei->i_data) - 1);
4172 } else {
4170 inode->i_op = &ext4_symlink_inode_operations; 4173 inode->i_op = &ext4_symlink_inode_operations;
4171 ext4_set_aops(inode); 4174 ext4_set_aops(inode);
4172 } 4175 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb79298..9fd2a5e1be4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1693,9 +1693,11 @@ static int ext4_add_nondir(handle_t *handle,
1693 if (!err) { 1693 if (!err) {
1694 ext4_mark_inode_dirty(handle, inode); 1694 ext4_mark_inode_dirty(handle, inode);
1695 d_instantiate(dentry, inode); 1695 d_instantiate(dentry, inode);
1696 unlock_new_inode(inode);
1696 return 0; 1697 return 0;
1697 } 1698 }
1698 drop_nlink(inode); 1699 drop_nlink(inode);
1700 unlock_new_inode(inode);
1699 iput(inode); 1701 iput(inode);
1700 return err; 1702 return err;
1701} 1703}
@@ -1830,6 +1832,7 @@ retry:
1830 if (err) { 1832 if (err) {
1831out_clear_inode: 1833out_clear_inode:
1832 clear_nlink(inode); 1834 clear_nlink(inode);
1835 unlock_new_inode(inode);
1833 ext4_mark_inode_dirty(handle, inode); 1836 ext4_mark_inode_dirty(handle, inode);
1834 iput(inode); 1837 iput(inode);
1835 goto out_stop; 1838 goto out_stop;
@@ -1838,6 +1841,7 @@ out_clear_inode:
1838 ext4_update_dx_flag(dir); 1841 ext4_update_dx_flag(dir);
1839 ext4_mark_inode_dirty(handle, dir); 1842 ext4_mark_inode_dirty(handle, dir);
1840 d_instantiate(dentry, inode); 1843 d_instantiate(dentry, inode);
1844 unlock_new_inode(inode);
1841out_stop: 1845out_stop:
1842 ext4_journal_stop(handle); 1846 ext4_journal_stop(handle);
1843 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1847 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -2208,10 +2212,10 @@ retry:
2208 * We have a transaction open. All is sweetness. It also sets 2212 * We have a transaction open. All is sweetness. It also sets
2209 * i_size in generic_commit_write(). 2213 * i_size in generic_commit_write().
2210 */ 2214 */
2211 err = __page_symlink(inode, symname, l, 2215 err = __page_symlink(inode, symname, l, 1);
2212 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2213 if (err) { 2216 if (err) {
2214 clear_nlink(inode); 2217 clear_nlink(inode);
2218 unlock_new_inode(inode);
2215 ext4_mark_inode_dirty(handle, inode); 2219 ext4_mark_inode_dirty(handle, inode);
2216 iput(inode); 2220 iput(inode);
2217 goto out_stop; 2221 goto out_stop;
@@ -2262,7 +2266,14 @@ retry:
2262 ext4_inc_count(handle, inode); 2266 ext4_inc_count(handle, inode);
2263 atomic_inc(&inode->i_count); 2267 atomic_inc(&inode->i_count);
2264 2268
2265 err = ext4_add_nondir(handle, dentry, inode); 2269 err = ext4_add_entry(handle, dentry, inode);
2270 if (!err) {
2271 ext4_mark_inode_dirty(handle, inode);
2272 d_instantiate(dentry, inode);
2273 } else {
2274 drop_nlink(inode);
2275 iput(inode);
2276 }
2266 ext4_journal_stop(handle); 2277 ext4_journal_stop(handle);
2267 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2278 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2268 goto retry; 2279 goto retry;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65db..04158ad74db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,7 +1721,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
1721 /* small i_blocks in vfs inode? */ 1721 /* small i_blocks in vfs inode? */
1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1723 /* 1723 /*
1724 * CONFIG_LSF is not enabled implies the inode 1724 * CONFIG_LBD is not enabled implies the inode
1725 * i_block represent total blocks in 512 bytes 1725 * i_block represent total blocks in 512 bytes
1726 * 32 == size of vfs inode i_blocks * 8 1726 * 32 == size of vfs inode i_blocks * 8
1727 */ 1727 */
@@ -1764,7 +1764,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1764 1764
1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1766 /* 1766 /*
1767 * !has_huge_files or CONFIG_LSF is not enabled 1767 * !has_huge_files or CONFIG_LBD is not enabled
1768 * implies the inode i_block represent total blocks in 1768 * implies the inode i_block represent total blocks in
1769 * 512 bytes 32 == size of vfs inode i_blocks * 8 1769 * 512 bytes 32 == size of vfs inode i_blocks * 8
1770 */ 1770 */
@@ -2021,13 +2021,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 if (has_huge_files) { 2021 if (has_huge_files) {
2022 /* 2022 /*
2023 * Large file size enabled file system can only be 2023 * Large file size enabled file system can only be
2024 * mount if kernel is build with CONFIG_LSF 2024 * mount if kernel is build with CONFIG_LBD
2025 */ 2025 */
2026 if (sizeof(root->i_blocks) < sizeof(u64) && 2026 if (sizeof(root->i_blocks) < sizeof(u64) &&
2027 !(sb->s_flags & MS_RDONLY)) { 2027 !(sb->s_flags & MS_RDONLY)) {
2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " 2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
2029 "files cannot be mounted read-write " 2029 "files cannot be mounted read-write "
2030 "without CONFIG_LSF.\n", sb->s_id); 2030 "without CONFIG_LBD.\n", sb->s_id);
2031 goto failed_mount; 2031 goto failed_mount;
2032 } 2032 }
2033 } 2033 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e05835709..3a7f603b698 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = file_fsync,
844 .llseek = generic_file_llseek,
845}; 844};
846 845
847static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f06a4e525ec..0a7f4a9918b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -304,7 +304,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
304{ 304{
305 mode_t allow_utime = sbi->options.allow_utime; 305 mode_t allow_utime = sbi->options.allow_utime;
306 306
307 if (current->fsuid != inode->i_uid) { 307 if (current_fsuid() != inode->i_uid) {
308 if (in_group_p(inode->i_gid)) 308 if (in_group_p(inode->i_gid))
309 allow_utime >>= 3; 309 allow_utime >>= 3;
310 if (allow_utime & MAY_WRITE) 310 if (allow_utime & MAY_WRITE)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bdd8fb7be2c..6b74d09adbe 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
749 brelse(bh); 749 brelse(bh);
750 750
751 parent = d_obtain_alias(inode); 751 parent = d_obtain_alias(inode);
752 if (!IS_ERR(parent))
753 parent->d_op = sb->s_root->d_op;
752out: 754out:
753 unlock_super(sb); 755 unlock_super(sb);
754 756
@@ -926,8 +928,8 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
926 928
927 opts->isvfat = is_vfat; 929 opts->isvfat = is_vfat;
928 930
929 opts->fs_uid = current->uid; 931 opts->fs_uid = current_uid();
930 opts->fs_gid = current->gid; 932 opts->fs_gid = current_gid();
931 opts->fs_fmask = opts->fs_dmask = current->fs->umask; 933 opts->fs_fmask = opts->fs_dmask = current->fs->umask;
932 opts->allow_utime = -1; 934 opts->allow_utime = -1;
933 opts->codepage = fat_default_codepage; 935 opts->codepage = fat_default_codepage;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a..8ae32e37673 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
78 * for creation. 78 * for creation.
79 */ 79 */
80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { 80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
81 if (nd->flags & LOOKUP_CREATE) 81 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
82 return 0; 82 return 0;
83 } 83 }
84 84
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 549daf8005f..cdc14194672 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -212,13 +212,14 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
212int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, 212int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
213 int force) 213 int force)
214{ 214{
215 const struct cred *cred = current_cred();
215 int err; 216 int err;
216 217
217 err = security_file_set_fowner(filp); 218 err = security_file_set_fowner(filp);
218 if (err) 219 if (err)
219 return err; 220 return err;
220 221
221 f_modown(filp, pid, type, current->uid, current->euid, force); 222 f_modown(filp, pid, type, cred->uid, cred->euid, force);
222 return 0; 223 return 0;
223} 224}
224EXPORT_SYMBOL(__f_setown); 225EXPORT_SYMBOL(__f_setown);
@@ -407,10 +408,17 @@ static const long band_table[NSIGPOLL] = {
407static inline int sigio_perm(struct task_struct *p, 408static inline int sigio_perm(struct task_struct *p,
408 struct fown_struct *fown, int sig) 409 struct fown_struct *fown, int sig)
409{ 410{
410 return (((fown->euid == 0) || 411 const struct cred *cred;
411 (fown->euid == p->suid) || (fown->euid == p->uid) || 412 int ret;
412 (fown->uid == p->suid) || (fown->uid == p->uid)) && 413
413 !security_file_send_sigiotask(p, fown, sig)); 414 rcu_read_lock();
415 cred = __task_cred(p);
416 ret = ((fown->euid == 0 ||
417 fown->euid == cred->suid || fown->euid == cred->uid ||
418 fown->uid == cred->suid || fown->uid == cred->uid) &&
419 !security_file_send_sigiotask(p, fown, sig));
420 rcu_read_unlock();
421 return ret;
414} 422}
415 423
416static void send_sigio_to_task(struct task_struct *p, 424static void send_sigio_to_task(struct task_struct *p,
diff --git a/fs/file_table.c b/fs/file_table.c
index 5ad0eca6eea..bbeeac6efa1 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,11 +32,16 @@ struct files_stat_struct files_stat = {
32/* public. Not pretty! */ 32/* public. Not pretty! */
33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
34 34
35/* SLAB cache for file structures */
36static struct kmem_cache *filp_cachep __read_mostly;
37
35static struct percpu_counter nr_files __cacheline_aligned_in_smp; 38static struct percpu_counter nr_files __cacheline_aligned_in_smp;
36 39
37static inline void file_free_rcu(struct rcu_head *head) 40static inline void file_free_rcu(struct rcu_head *head)
38{ 41{
39 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 42 struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
43
44 put_cred(f->f_cred);
40 kmem_cache_free(filp_cachep, f); 45 kmem_cache_free(filp_cachep, f);
41} 46}
42 47
@@ -94,7 +99,7 @@ int proc_nr_files(ctl_table *table, int write, struct file *filp,
94 */ 99 */
95struct file *get_empty_filp(void) 100struct file *get_empty_filp(void)
96{ 101{
97 struct task_struct *tsk; 102 const struct cred *cred = current_cred();
98 static int old_max; 103 static int old_max;
99 struct file * f; 104 struct file * f;
100 105
@@ -118,12 +123,10 @@ struct file *get_empty_filp(void)
118 if (security_file_alloc(f)) 123 if (security_file_alloc(f))
119 goto fail_sec; 124 goto fail_sec;
120 125
121 tsk = current;
122 INIT_LIST_HEAD(&f->f_u.fu_list); 126 INIT_LIST_HEAD(&f->f_u.fu_list);
123 atomic_long_set(&f->f_count, 1); 127 atomic_long_set(&f->f_count, 1);
124 rwlock_init(&f->f_owner.lock); 128 rwlock_init(&f->f_owner.lock);
125 f->f_uid = tsk->fsuid; 129 f->f_cred = get_cred(cred);
126 f->f_gid = tsk->fsgid;
127 eventpoll_init_file(f); 130 eventpoll_init_file(f);
128 /* f->f_version: 0 */ 131 /* f->f_version: 0 */
129 return f; 132 return f;
@@ -397,7 +400,12 @@ too_bad:
397void __init files_init(unsigned long mempages) 400void __init files_init(unsigned long mempages)
398{ 401{
399 int n; 402 int n;
400 /* One file with associated inode and dcache is very roughly 1K. 403
404 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
405 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
406
407 /*
408 * One file with associated inode and dcache is very roughly 1K.
401 * Per default don't use more than 10% of our memory for files. 409 * Per default don't use more than 10% of our memory for files.
402 */ 410 */
403 411
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f..03a6ea5e99f 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
325 if (!VXFS_ISIMMED(vip)) { 325 if (!VXFS_ISIMMED(vip)) {
326 ip->i_op = &page_symlink_inode_operations; 326 ip->i_op = &page_symlink_inode_operations;
327 ip->i_mapping->a_ops = &vxfs_aops; 327 ip->i_mapping->a_ops = &vxfs_aops;
328 } else 328 } else {
329 ip->i_op = &vxfs_immed_symlink_iops; 329 ip->i_op = &vxfs_immed_symlink_iops;
330 vip->vii_immed.vi_immed[ip->i_size] = '\0';
331 }
330 } else 332 } else
331 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); 333 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
332 334
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b72361479be..fba571648a8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -87,8 +87,8 @@ static void __fuse_put_request(struct fuse_req *req)
87 87
88static void fuse_req_init_context(struct fuse_req *req) 88static void fuse_req_init_context(struct fuse_req *req)
89{ 89{
90 req->in.h.uid = current->fsuid; 90 req->in.h.uid = current_fsuid();
91 req->in.h.gid = current->fsgid; 91 req->in.h.gid = current_fsgid();
92 req->in.h.pid = current->pid; 92 req->in.h.pid = current->pid;
93} 93}
94 94
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fd03330cade..95bc22bdd06 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -869,18 +869,25 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
869 */ 869 */
870int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) 870int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
871{ 871{
872 const struct cred *cred;
873 int ret;
874
872 if (fc->flags & FUSE_ALLOW_OTHER) 875 if (fc->flags & FUSE_ALLOW_OTHER)
873 return 1; 876 return 1;
874 877
875 if (task->euid == fc->user_id && 878 rcu_read_lock();
876 task->suid == fc->user_id && 879 ret = 0;
877 task->uid == fc->user_id && 880 cred = __task_cred(task);
878 task->egid == fc->group_id && 881 if (cred->euid == fc->user_id &&
879 task->sgid == fc->group_id && 882 cred->suid == fc->user_id &&
880 task->gid == fc->group_id) 883 cred->uid == fc->user_id &&
881 return 1; 884 cred->egid == fc->group_id &&
885 cred->sgid == fc->group_id &&
886 cred->gid == fc->group_id)
887 ret = 1;
888 rcu_read_unlock();
882 889
883 return 0; 890 return ret;
884} 891}
885 892
886static int fuse_access(struct inode *inode, int mask) 893static int fuse_access(struct inode *inode, int mask)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b8..4c9ee701126 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -646,7 +646,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
646{ 646{
647 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 647 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
648 648
649 *pagep = __grab_cache_page(mapping, index); 649 *pagep = grab_cache_page_write_begin(mapping, index, flags);
650 if (!*pagep) 650 if (!*pagep)
651 return -ENOMEM; 651 return -ENOMEM;
652 return 0; 652 return 0;
@@ -779,7 +779,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
779 break; 779 break;
780 780
781 err = -ENOMEM; 781 err = -ENOMEM;
782 page = __grab_cache_page(mapping, index); 782 page = grab_cache_page_write_begin(mapping, index, 0);
783 if (!page) 783 if (!page)
784 break; 784 break;
785 785
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7cee695fa44..d57616840e8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -705,18 +705,18 @@ static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
705 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { 705 (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) {
706 if (S_ISDIR(*mode)) 706 if (S_ISDIR(*mode))
707 *mode |= S_ISUID; 707 *mode |= S_ISUID;
708 else if (dip->i_inode.i_uid != current->fsuid) 708 else if (dip->i_inode.i_uid != current_fsuid())
709 *mode &= ~07111; 709 *mode &= ~07111;
710 *uid = dip->i_inode.i_uid; 710 *uid = dip->i_inode.i_uid;
711 } else 711 } else
712 *uid = current->fsuid; 712 *uid = current_fsuid();
713 713
714 if (dip->i_inode.i_mode & S_ISGID) { 714 if (dip->i_inode.i_mode & S_ISGID) {
715 if (S_ISDIR(*mode)) 715 if (S_ISDIR(*mode))
716 *mode |= S_ISGID; 716 *mode |= S_ISGID;
717 *gid = dip->i_inode.i_gid; 717 *gid = dip->i_inode.i_gid;
718 } else 718 } else
719 *gid = current->fsgid; 719 *gid = current_fsgid();
720} 720}
721 721
722static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) 722static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
@@ -1124,8 +1124,8 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
1124 return -EPERM; 1124 return -EPERM;
1125 1125
1126 if ((dip->i_inode.i_mode & S_ISVTX) && 1126 if ((dip->i_inode.i_mode & S_ISVTX) &&
1127 dip->i_inode.i_uid != current->fsuid && 1127 dip->i_inode.i_uid != current_fsuid() &&
1128 ip->i_inode.i_uid != current->fsuid && !capable(CAP_FOWNER)) 1128 ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
1129 return -EPERM; 1129 return -EPERM;
1130 1130
1131 if (IS_APPEND(&dip->i_inode)) 1131 if (IS_APPEND(&dip->i_inode))
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c..15f710f2d4d 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -675,7 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
675 goto out_trans_fail; 675 goto out_trans_fail;
676 676
677 error = -ENOMEM; 677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index); 678 page = grab_cache_page_write_begin(mapping, index, flags);
679 *pagep = page; 679 *pagep = page;
680 if (unlikely(!page)) 680 if (unlikely(!page))
681 goto out_endtrans; 681 goto out_endtrans;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index c69b7ac75bf..9435dda8f1e 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -155,8 +155,8 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
155 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); 155 hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
156 inode->i_ino = HFS_SB(sb)->next_id++; 156 inode->i_ino = HFS_SB(sb)->next_id++;
157 inode->i_mode = mode; 157 inode->i_mode = mode;
158 inode->i_uid = current->fsuid; 158 inode->i_uid = current_fsuid();
159 inode->i_gid = current->fsgid; 159 inode->i_gid = current_fsgid();
160 inode->i_nlink = 1; 160 inode->i_nlink = 1;
161 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 161 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
162 HFS_I(inode)->flags = 0; 162 HFS_I(inode)->flags = 0;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3c7c7637719..c8b5acf4b0b 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -210,8 +210,8 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
210 int tmp, token; 210 int tmp, token;
211 211
212 /* initialize the sb with defaults */ 212 /* initialize the sb with defaults */
213 hsb->s_uid = current->uid; 213 hsb->s_uid = current_uid();
214 hsb->s_gid = current->gid; 214 hsb->s_gid = current_gid();
215 hsb->s_file_umask = 0133; 215 hsb->s_file_umask = 0133;
216 hsb->s_dir_umask = 0022; 216 hsb->s_dir_umask = 0022;
217 hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */ 217 hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b207f0e6fc2..f105ee9e1cc 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -296,8 +296,8 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
296 296
297 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 297 inode->i_ino = HFSPLUS_SB(sb).next_cnid++;
298 inode->i_mode = mode; 298 inode->i_mode = mode;
299 inode->i_uid = current->fsuid; 299 inode->i_uid = current_fsuid();
300 inode->i_gid = current->fsgid; 300 inode->i_gid = current_fsgid();
301 inode->i_nlink = 1; 301 inode->i_nlink = 1;
302 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 302 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
303 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 303 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9699c56d323..bab7f8d1bdf 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -49,8 +49,8 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
49 opts->creator = HFSPLUS_DEF_CR_TYPE; 49 opts->creator = HFSPLUS_DEF_CR_TYPE;
50 opts->type = HFSPLUS_DEF_CR_TYPE; 50 opts->type = HFSPLUS_DEF_CR_TYPE;
51 opts->umask = current->fs->umask; 51 opts->umask = current->fs->umask;
52 opts->uid = current->uid; 52 opts->uid = current_uid();
53 opts->gid = current->gid; 53 opts->gid = current_gid();
54 opts->part = -1; 54 opts->part = -1;
55 opts->session = -1; 55 opts->session = -1;
56} 56}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac17..5c538e0ec14 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
501{ 501{
502 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 502 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
503 503
504 *pagep = __grab_cache_page(mapping, index); 504 *pagep = grab_cache_page_write_begin(mapping, index, flags);
505 if (!*pagep) 505 if (!*pagep)
506 return -ENOMEM; 506 return -ENOMEM;
507 return 0; 507 return 0;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 10783f3d265..b649232dde9 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -92,11 +92,11 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
92 inc_nlink(dir); 92 inc_nlink(dir);
93 insert_inode_hash(result); 93 insert_inode_hash(result);
94 94
95 if (result->i_uid != current->fsuid || 95 if (result->i_uid != current_fsuid() ||
96 result->i_gid != current->fsgid || 96 result->i_gid != current_fsgid() ||
97 result->i_mode != (mode | S_IFDIR)) { 97 result->i_mode != (mode | S_IFDIR)) {
98 result->i_uid = current->fsuid; 98 result->i_uid = current_fsuid();
99 result->i_gid = current->fsgid; 99 result->i_gid = current_fsgid();
100 result->i_mode = mode | S_IFDIR; 100 result->i_mode = mode | S_IFDIR;
101 hpfs_write_inode_nolock(result); 101 hpfs_write_inode_nolock(result);
102 } 102 }
@@ -184,11 +184,11 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
184 184
185 insert_inode_hash(result); 185 insert_inode_hash(result);
186 186
187 if (result->i_uid != current->fsuid || 187 if (result->i_uid != current_fsuid() ||
188 result->i_gid != current->fsgid || 188 result->i_gid != current_fsgid() ||
189 result->i_mode != (mode | S_IFREG)) { 189 result->i_mode != (mode | S_IFREG)) {
190 result->i_uid = current->fsuid; 190 result->i_uid = current_fsuid();
191 result->i_gid = current->fsgid; 191 result->i_gid = current_fsgid();
192 result->i_mode = mode | S_IFREG; 192 result->i_mode = mode | S_IFREG;
193 hpfs_write_inode_nolock(result); 193 hpfs_write_inode_nolock(result);
194 } 194 }
@@ -247,8 +247,8 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
247 result->i_mtime.tv_nsec = 0; 247 result->i_mtime.tv_nsec = 0;
248 result->i_atime.tv_nsec = 0; 248 result->i_atime.tv_nsec = 0;
249 hpfs_i(result)->i_ea_size = 0; 249 hpfs_i(result)->i_ea_size = 0;
250 result->i_uid = current->fsuid; 250 result->i_uid = current_fsuid();
251 result->i_gid = current->fsgid; 251 result->i_gid = current_fsgid();
252 result->i_nlink = 1; 252 result->i_nlink = 1;
253 result->i_size = 0; 253 result->i_size = 0;
254 result->i_blocks = 1; 254 result->i_blocks = 1;
@@ -325,8 +325,8 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
325 result->i_atime.tv_nsec = 0; 325 result->i_atime.tv_nsec = 0;
326 hpfs_i(result)->i_ea_size = 0; 326 hpfs_i(result)->i_ea_size = 0;
327 result->i_mode = S_IFLNK | 0777; 327 result->i_mode = S_IFLNK | 0777;
328 result->i_uid = current->fsuid; 328 result->i_uid = current_fsuid();
329 result->i_gid = current->fsgid; 329 result->i_gid = current_fsgid();
330 result->i_blocks = 1; 330 result->i_blocks = 1;
331 result->i_nlink = 1; 331 result->i_nlink = 1;
332 result->i_size = strlen(symlink); 332 result->i_size = strlen(symlink);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 29ad461d568..0d049b8919c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -475,8 +475,8 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
475 475
476 init_MUTEX(&sbi->hpfs_creation_de); 476 init_MUTEX(&sbi->hpfs_creation_de);
477 477
478 uid = current->uid; 478 uid = current_uid();
479 gid = current->gid; 479 gid = current_gid();
480 umask = current->fs->umask; 480 umask = current->fs->umask;
481 lowercase = 0; 481 lowercase = 0;
482 conv = CONV_BINARY; 482 conv = CONV_BINARY;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2b3d1828db9..b278f7f5202 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -426,6 +426,7 @@ static int file_mode(int fmode)
426 426
427static int hppfs_open(struct inode *inode, struct file *file) 427static int hppfs_open(struct inode *inode, struct file *file)
428{ 428{
429 const struct cred *cred = file->f_cred;
429 struct hppfs_private *data; 430 struct hppfs_private *data;
430 struct vfsmount *proc_mnt; 431 struct vfsmount *proc_mnt;
431 struct dentry *proc_dentry; 432 struct dentry *proc_dentry;
@@ -446,7 +447,7 @@ static int hppfs_open(struct inode *inode, struct file *file)
446 447
447 /* XXX This isn't closed anywhere */ 448 /* XXX This isn't closed anywhere */
448 data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), 449 data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
449 file_mode(file->f_mode)); 450 file_mode(file->f_mode), cred);
450 err = PTR_ERR(data->proc_file); 451 err = PTR_ERR(data->proc_file);
451 if (IS_ERR(data->proc_file)) 452 if (IS_ERR(data->proc_file))
452 goto out_free1; 453 goto out_free1;
@@ -489,6 +490,7 @@ static int hppfs_open(struct inode *inode, struct file *file)
489 490
490static int hppfs_dir_open(struct inode *inode, struct file *file) 491static int hppfs_dir_open(struct inode *inode, struct file *file)
491{ 492{
493 const struct cred *cred = file->f_cred;
492 struct hppfs_private *data; 494 struct hppfs_private *data;
493 struct vfsmount *proc_mnt; 495 struct vfsmount *proc_mnt;
494 struct dentry *proc_dentry; 496 struct dentry *proc_dentry;
@@ -502,7 +504,7 @@ static int hppfs_dir_open(struct inode *inode, struct file *file)
502 proc_dentry = HPPFS_I(inode)->proc_dentry; 504 proc_dentry = HPPFS_I(inode)->proc_dentry;
503 proc_mnt = inode->i_sb->s_fs_info; 505 proc_mnt = inode->i_sb->s_fs_info;
504 data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), 506 data->proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt),
505 file_mode(file->f_mode)); 507 file_mode(file->f_mode), cred);
506 err = PTR_ERR(data->proc_file); 508 err = PTR_ERR(data->proc_file);
507 if (IS_ERR(data->proc_file)) 509 if (IS_ERR(data->proc_file))
508 goto out_free; 510 goto out_free;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 61edc701b0e..7d479ce3ace 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -551,9 +551,9 @@ static int hugetlbfs_mknod(struct inode *dir,
551 if (S_ISDIR(mode)) 551 if (S_ISDIR(mode))
552 mode |= S_ISGID; 552 mode |= S_ISGID;
553 } else { 553 } else {
554 gid = current->fsgid; 554 gid = current_fsgid();
555 } 555 }
556 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev); 556 inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev);
557 if (inode) { 557 if (inode) {
558 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 558 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
559 d_instantiate(dentry, inode); 559 d_instantiate(dentry, inode);
@@ -586,9 +586,9 @@ static int hugetlbfs_symlink(struct inode *dir,
586 if (dir->i_mode & S_ISGID) 586 if (dir->i_mode & S_ISGID)
587 gid = dir->i_gid; 587 gid = dir->i_gid;
588 else 588 else
589 gid = current->fsgid; 589 gid = current_fsgid();
590 590
591 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, 591 inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(),
592 gid, S_IFLNK|S_IRWXUGO, 0); 592 gid, S_IFLNK|S_IRWXUGO, 0);
593 if (inode) { 593 if (inode) {
594 int l = strlen(symname)+1; 594 int l = strlen(symname)+1;
@@ -854,8 +854,8 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
854 854
855 config.nr_blocks = -1; /* No limit on size by default */ 855 config.nr_blocks = -1; /* No limit on size by default */
856 config.nr_inodes = -1; /* No limit on number of inodes by default */ 856 config.nr_inodes = -1; /* No limit on number of inodes by default */
857 config.uid = current->fsuid; 857 config.uid = current_fsuid();
858 config.gid = current->fsgid; 858 config.gid = current_fsgid();
859 config.mode = 0755; 859 config.mode = 0755;
860 config.hstate = &default_hstate; 860 config.hstate = &default_hstate;
861 ret = hugetlbfs_parse_options(data, &config); 861 ret = hugetlbfs_parse_options(data, &config);
@@ -951,6 +951,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
951 struct inode *inode; 951 struct inode *inode;
952 struct dentry *dentry, *root; 952 struct dentry *dentry, *root;
953 struct qstr quick_string; 953 struct qstr quick_string;
954 struct user_struct *user = current_user();
954 955
955 if (!hugetlbfs_vfsmount) 956 if (!hugetlbfs_vfsmount)
956 return ERR_PTR(-ENOENT); 957 return ERR_PTR(-ENOENT);
@@ -958,7 +959,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
958 if (!can_do_hugetlb_shm()) 959 if (!can_do_hugetlb_shm())
959 return ERR_PTR(-EPERM); 960 return ERR_PTR(-EPERM);
960 961
961 if (!user_shm_lock(size, current->user)) 962 if (!user_shm_lock(size, user))
962 return ERR_PTR(-ENOMEM); 963 return ERR_PTR(-ENOMEM);
963 964
964 root = hugetlbfs_vfsmount->mnt_root; 965 root = hugetlbfs_vfsmount->mnt_root;
@@ -970,8 +971,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
970 goto out_shm_unlock; 971 goto out_shm_unlock;
971 972
972 error = -ENOSPC; 973 error = -ENOSPC;
973 inode = hugetlbfs_get_inode(root->d_sb, current->fsuid, 974 inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(),
974 current->fsgid, S_IFREG | S_IRWXUGO, 0); 975 current_fsgid(), S_IFREG | S_IRWXUGO, 0);
975 if (!inode) 976 if (!inode)
976 goto out_dentry; 977 goto out_dentry;
977 978
@@ -998,7 +999,7 @@ out_inode:
998out_dentry: 999out_dentry:
999 dput(dentry); 1000 dput(dentry);
1000out_shm_unlock: 1001out_shm_unlock:
1001 user_shm_unlock(size, current->user); 1002 user_shm_unlock(size, user);
1002 return ERR_PTR(error); 1003 return ERR_PTR(error);
1003} 1004}
1004 1005
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba139..7de1cda9248 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -108,84 +108,100 @@ static void wake_up_inode(struct inode *inode)
108 wake_up_bit(&inode->i_state, __I_LOCK); 108 wake_up_bit(&inode->i_state, __I_LOCK);
109} 109}
110 110
111static struct inode *alloc_inode(struct super_block *sb) 111/**
112 * inode_init_always - perform inode structure intialisation
113 * @sb - superblock inode belongs to.
114 * @inode - inode to initialise
115 *
116 * These are initializations that need to be done on every inode
117 * allocation as the fields are not initialised by slab allocation.
118 */
119struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
112{ 120{
113 static const struct address_space_operations empty_aops; 121 static const struct address_space_operations empty_aops;
114 static struct inode_operations empty_iops; 122 static struct inode_operations empty_iops;
115 static const struct file_operations empty_fops; 123 static const struct file_operations empty_fops;
116 struct inode *inode;
117 124
118 if (sb->s_op->alloc_inode) 125 struct address_space * const mapping = &inode->i_data;
119 inode = sb->s_op->alloc_inode(sb); 126
120 else 127 inode->i_sb = sb;
121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); 128 inode->i_blkbits = sb->s_blocksize_bits;
122 129 inode->i_flags = 0;
123 if (inode) { 130 atomic_set(&inode->i_count, 1);
124 struct address_space * const mapping = &inode->i_data; 131 inode->i_op = &empty_iops;
125 132 inode->i_fop = &empty_fops;
126 inode->i_sb = sb; 133 inode->i_nlink = 1;
127 inode->i_blkbits = sb->s_blocksize_bits; 134 atomic_set(&inode->i_writecount, 0);
128 inode->i_flags = 0; 135 inode->i_size = 0;
129 atomic_set(&inode->i_count, 1); 136 inode->i_blocks = 0;
130 inode->i_op = &empty_iops; 137 inode->i_bytes = 0;
131 inode->i_fop = &empty_fops; 138 inode->i_generation = 0;
132 inode->i_nlink = 1;
133 atomic_set(&inode->i_writecount, 0);
134 inode->i_size = 0;
135 inode->i_blocks = 0;
136 inode->i_bytes = 0;
137 inode->i_generation = 0;
138#ifdef CONFIG_QUOTA 139#ifdef CONFIG_QUOTA
139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 140 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
140#endif 141#endif
141 inode->i_pipe = NULL; 142 inode->i_pipe = NULL;
142 inode->i_bdev = NULL; 143 inode->i_bdev = NULL;
143 inode->i_cdev = NULL; 144 inode->i_cdev = NULL;
144 inode->i_rdev = 0; 145 inode->i_rdev = 0;
145 inode->dirtied_when = 0; 146 inode->dirtied_when = 0;
146 if (security_inode_alloc(inode)) { 147 if (security_inode_alloc(inode)) {
147 if (inode->i_sb->s_op->destroy_inode) 148 if (inode->i_sb->s_op->destroy_inode)
148 inode->i_sb->s_op->destroy_inode(inode); 149 inode->i_sb->s_op->destroy_inode(inode);
149 else 150 else
150 kmem_cache_free(inode_cachep, (inode)); 151 kmem_cache_free(inode_cachep, (inode));
151 return NULL; 152 return NULL;
152 } 153 }
153 154
154 spin_lock_init(&inode->i_lock); 155 spin_lock_init(&inode->i_lock);
155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 156 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
156 157
157 mutex_init(&inode->i_mutex); 158 mutex_init(&inode->i_mutex);
158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 159 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
159 160
160 init_rwsem(&inode->i_alloc_sem); 161 init_rwsem(&inode->i_alloc_sem);
161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 162 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
162 163
163 mapping->a_ops = &empty_aops; 164 mapping->a_ops = &empty_aops;
164 mapping->host = inode; 165 mapping->host = inode;
165 mapping->flags = 0; 166 mapping->flags = 0;
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 167 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
167 mapping->assoc_mapping = NULL; 168 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 169 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0; 170 mapping->writeback_index = 0;
170 171
171 /* 172 /*
172 * If the block_device provides a backing_dev_info for client 173 * If the block_device provides a backing_dev_info for client
173 * inodes then use that. Otherwise the inode share the bdev's 174 * inodes then use that. Otherwise the inode share the bdev's
174 * backing_dev_info. 175 * backing_dev_info.
175 */ 176 */
176 if (sb->s_bdev) { 177 if (sb->s_bdev) {
177 struct backing_dev_info *bdi; 178 struct backing_dev_info *bdi;
178 179
179 bdi = sb->s_bdev->bd_inode_backing_dev_info; 180 bdi = sb->s_bdev->bd_inode_backing_dev_info;
180 if (!bdi) 181 if (!bdi)
181 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 182 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
182 mapping->backing_dev_info = bdi; 183 mapping->backing_dev_info = bdi;
183 }
184 inode->i_private = NULL;
185 inode->i_mapping = mapping;
186 } 184 }
185 inode->i_private = NULL;
186 inode->i_mapping = mapping;
187
187 return inode; 188 return inode;
188} 189}
190EXPORT_SYMBOL(inode_init_always);
191
192static struct inode *alloc_inode(struct super_block *sb)
193{
194 struct inode *inode;
195
196 if (sb->s_op->alloc_inode)
197 inode = sb->s_op->alloc_inode(sb);
198 else
199 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
200
201 if (inode)
202 return inode_init_always(sb, inode);
203 return NULL;
204}
189 205
190void destroy_inode(struct inode *inode) 206void destroy_inode(struct inode *inode)
191{ 207{
@@ -196,6 +212,7 @@ void destroy_inode(struct inode *inode)
196 else 212 else
197 kmem_cache_free(inode_cachep, (inode)); 213 kmem_cache_free(inode_cachep, (inode));
198} 214}
215EXPORT_SYMBOL(destroy_inode);
199 216
200 217
201/* 218/*
@@ -534,6 +551,49 @@ repeat:
534 return node ? inode : NULL; 551 return node ? inode : NULL;
535} 552}
536 553
554static unsigned long hash(struct super_block *sb, unsigned long hashval)
555{
556 unsigned long tmp;
557
558 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
559 L1_CACHE_BYTES;
560 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
561 return tmp & I_HASHMASK;
562}
563
564static inline void
565__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
566 struct inode *inode)
567{
568 inodes_stat.nr_inodes++;
569 list_add(&inode->i_list, &inode_in_use);
570 list_add(&inode->i_sb_list, &sb->s_inodes);
571 if (head)
572 hlist_add_head(&inode->i_hash, head);
573}
574
575/**
576 * inode_add_to_lists - add a new inode to relevant lists
577 * @sb - superblock inode belongs to.
578 * @inode - inode to mark in use
579 *
580 * When an inode is allocated it needs to be accounted for, added to the in use
581 * list, the owning superblock and the inode hash. This needs to be done under
582 * the inode_lock, so export a function to do this rather than the inode lock
583 * itself. We calculate the hash list to add to here so it is all internal
584 * which requires the caller to have already set up the inode number in the
585 * inode to add.
586 */
587void inode_add_to_lists(struct super_block *sb, struct inode *inode)
588{
589 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
590
591 spin_lock(&inode_lock);
592 __inode_add_to_lists(sb, head, inode);
593 spin_unlock(&inode_lock);
594}
595EXPORT_SYMBOL_GPL(inode_add_to_lists);
596
537/** 597/**
538 * new_inode - obtain an inode 598 * new_inode - obtain an inode
539 * @sb: superblock 599 * @sb: superblock
@@ -561,9 +621,7 @@ struct inode *new_inode(struct super_block *sb)
561 inode = alloc_inode(sb); 621 inode = alloc_inode(sb);
562 if (inode) { 622 if (inode) {
563 spin_lock(&inode_lock); 623 spin_lock(&inode_lock);
564 inodes_stat.nr_inodes++; 624 __inode_add_to_lists(sb, NULL, inode);
565 list_add(&inode->i_list, &inode_in_use);
566 list_add(&inode->i_sb_list, &sb->s_inodes);
567 inode->i_ino = ++last_ino; 625 inode->i_ino = ++last_ino;
568 inode->i_state = 0; 626 inode->i_state = 0;
569 spin_unlock(&inode_lock); 627 spin_unlock(&inode_lock);
@@ -622,10 +680,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
622 if (set(inode, data)) 680 if (set(inode, data))
623 goto set_failed; 681 goto set_failed;
624 682
625 inodes_stat.nr_inodes++; 683 __inode_add_to_lists(sb, head, inode);
626 list_add(&inode->i_list, &inode_in_use);
627 list_add(&inode->i_sb_list, &sb->s_inodes);
628 hlist_add_head(&inode->i_hash, head);
629 inode->i_state = I_LOCK|I_NEW; 684 inode->i_state = I_LOCK|I_NEW;
630 spin_unlock(&inode_lock); 685 spin_unlock(&inode_lock);
631 686
@@ -671,10 +726,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
671 old = find_inode_fast(sb, head, ino); 726 old = find_inode_fast(sb, head, ino);
672 if (!old) { 727 if (!old) {
673 inode->i_ino = ino; 728 inode->i_ino = ino;
674 inodes_stat.nr_inodes++; 729 __inode_add_to_lists(sb, head, inode);
675 list_add(&inode->i_list, &inode_in_use);
676 list_add(&inode->i_sb_list, &sb->s_inodes);
677 hlist_add_head(&inode->i_hash, head);
678 inode->i_state = I_LOCK|I_NEW; 730 inode->i_state = I_LOCK|I_NEW;
679 spin_unlock(&inode_lock); 731 spin_unlock(&inode_lock);
680 732
@@ -698,16 +750,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
698 return inode; 750 return inode;
699} 751}
700 752
701static unsigned long hash(struct super_block *sb, unsigned long hashval)
702{
703 unsigned long tmp;
704
705 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
706 L1_CACHE_BYTES;
707 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
708 return tmp & I_HASHMASK;
709}
710
711/** 753/**
712 * iunique - get a unique inode number 754 * iunique - get a unique inode number
713 * @sb: superblock 755 * @sb: superblock
@@ -990,6 +1032,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
990 1032
991EXPORT_SYMBOL(iget_locked); 1033EXPORT_SYMBOL(iget_locked);
992 1034
1035int insert_inode_locked(struct inode *inode)
1036{
1037 struct super_block *sb = inode->i_sb;
1038 ino_t ino = inode->i_ino;
1039 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1040 struct inode *old;
1041
1042 inode->i_state |= I_LOCK|I_NEW;
1043 while (1) {
1044 spin_lock(&inode_lock);
1045 old = find_inode_fast(sb, head, ino);
1046 if (likely(!old)) {
1047 hlist_add_head(&inode->i_hash, head);
1048 spin_unlock(&inode_lock);
1049 return 0;
1050 }
1051 __iget(old);
1052 spin_unlock(&inode_lock);
1053 wait_on_inode(old);
1054 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1055 iput(old);
1056 return -EBUSY;
1057 }
1058 iput(old);
1059 }
1060}
1061
1062EXPORT_SYMBOL(insert_inode_locked);
1063
1064int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1065 int (*test)(struct inode *, void *), void *data)
1066{
1067 struct super_block *sb = inode->i_sb;
1068 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1069 struct inode *old;
1070
1071 inode->i_state |= I_LOCK|I_NEW;
1072
1073 while (1) {
1074 spin_lock(&inode_lock);
1075 old = find_inode(sb, head, test, data);
1076 if (likely(!old)) {
1077 hlist_add_head(&inode->i_hash, head);
1078 spin_unlock(&inode_lock);
1079 return 0;
1080 }
1081 __iget(old);
1082 spin_unlock(&inode_lock);
1083 wait_on_inode(old);
1084 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1085 iput(old);
1086 return -EBUSY;
1087 }
1088 iput(old);
1089 }
1090}
1091
1092EXPORT_SYMBOL(insert_inode_locked4);
1093
993/** 1094/**
994 * __insert_inode_hash - hash an inode 1095 * __insert_inode_hash - hash an inode
995 * @inode: unhashed inode 1096 * @inode: unhashed inode
@@ -1292,6 +1393,7 @@ int inode_wait(void *word)
1292 schedule(); 1393 schedule();
1293 return 0; 1394 return 0;
1294} 1395}
1396EXPORT_SYMBOL(inode_wait);
1295 1397
1296/* 1398/*
1297 * If we try to find an inode in the inode hash while it is being 1399 * If we try to find an inode in the inode hash while it is being
diff --git a/fs/internal.h b/fs/internal.h
index 80aa9a02337..53af885f173 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12struct super_block; 12struct super_block;
13struct linux_binprm;
13 14
14/* 15/*
15 * block_dev.c 16 * block_dev.c
@@ -40,6 +41,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
40extern void __init chrdev_init(void); 41extern void __init chrdev_init(void);
41 42
42/* 43/*
44 * exec.c
45 */
46extern void check_unsafe_exec(struct linux_binprm *);
47
48/*
43 * namespace.c 49 * namespace.c
44 */ 50 */
45extern int copy_mount_options(const void __user *, unsigned long *); 51extern int copy_mount_options(const void __user *, unsigned long *);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index da3cc460d4d..3569e0ad86a 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -31,10 +31,16 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
34 const struct cred *cred = current_cred(), *tcred;
34 35
35 if (task->uid != current->euid && 36 rcu_read_lock();
36 task->uid != current->uid && !capable(CAP_SYS_NICE)) 37 tcred = __task_cred(task);
38 if (tcred->uid != cred->euid &&
39 tcred->uid != cred->uid && !capable(CAP_SYS_NICE)) {
40 rcu_read_unlock();
37 return -EPERM; 41 return -EPERM;
42 }
43 rcu_read_unlock();
38 44
39 err = security_task_setioprio(task, ioprio); 45 err = security_task_setioprio(task, ioprio);
40 if (err) 46 if (err)
@@ -123,7 +129,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
123 break; 129 break;
124 case IOPRIO_WHO_USER: 130 case IOPRIO_WHO_USER:
125 if (!who) 131 if (!who)
126 user = current->user; 132 user = current_user();
127 else 133 else
128 user = find_user(who); 134 user = find_user(who);
129 135
@@ -131,7 +137,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
131 break; 137 break;
132 138
133 do_each_thread(g, p) { 139 do_each_thread(g, p) {
134 if (p->uid != who) 140 if (__task_cred(p)->uid != who)
135 continue; 141 continue;
136 ret = set_task_ioprio(p, ioprio); 142 ret = set_task_ioprio(p, ioprio);
137 if (ret) 143 if (ret)
@@ -216,7 +222,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
216 break; 222 break;
217 case IOPRIO_WHO_USER: 223 case IOPRIO_WHO_USER:
218 if (!who) 224 if (!who)
219 user = current->user; 225 user = current_user();
220 else 226 else
221 user = find_user(who); 227 user = find_user(who);
222 228
@@ -224,7 +230,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
224 break; 230 break;
225 231
226 do_each_thread(g, p) { 232 do_each_thread(g, p) {
227 if (p->uid != user->uid) 233 if (__task_cred(p)->uid != user->uid)
228 continue; 234 continue;
229 tmpio = get_task_ioprio(p); 235 tmpio = get_task_ioprio(p);
230 if (tmpio < 0) 236 if (tmpio < 0)
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c85..5edc2bf2058 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
132 uint32_t pageofs = index << PAGE_CACHE_SHIFT; 132 uint32_t pageofs = index << PAGE_CACHE_SHIFT;
133 int ret = 0; 133 int ret = 0;
134 134
135 pg = __grab_cache_page(mapping, index); 135 pg = grab_cache_page_write_begin(mapping, index, flags);
136 if (!pg) 136 if (!pg)
137 return -ENOMEM; 137 return -ENOMEM;
138 *pagep = pg; 138 *pagep = pg;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 210339784b5..b00ee9f05a0 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
59 if (inode->i_size >= IDATASIZE) { 59 if (inode->i_size >= IDATASIZE) {
60 inode->i_op = &page_symlink_inode_operations; 60 inode->i_op = &page_symlink_inode_operations;
61 inode->i_mapping->a_ops = &jfs_aops; 61 inode->i_mapping->a_ops = &jfs_aops;
62 } else 62 } else {
63 inode->i_op = &jfs_symlink_inode_operations; 63 inode->i_op = &jfs_symlink_inode_operations;
64 /*
65 * The inline data should be null-terminated, but
66 * don't let on-disk corruption crash the kernel
67 */
68 JFS_IP(inode)->i_inline[inode->i_size] = '\0';
69 }
64 } else { 70 } else {
65 inode->i_op = &jfs_file_inode_operations; 71 inode->i_op = &jfs_file_inode_operations;
66 init_special_inode(inode, inode->i_mode, inode->i_rdev); 72 init_special_inode(inode, inode->i_mode, inode->i_rdev);
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index ed6574bee51..d4d142c2edd 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
79 inode = new_inode(sb); 79 inode = new_inode(sb);
80 if (!inode) { 80 if (!inode) {
81 jfs_warn("ialloc: new_inode returned NULL!"); 81 jfs_warn("ialloc: new_inode returned NULL!");
82 return ERR_PTR(-ENOMEM); 82 rc = -ENOMEM;
83 goto fail;
83 } 84 }
84 85
85 jfs_inode = JFS_IP(inode); 86 jfs_inode = JFS_IP(inode);
@@ -89,17 +90,21 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
89 jfs_warn("ialloc: diAlloc returned %d!", rc); 90 jfs_warn("ialloc: diAlloc returned %d!", rc);
90 if (rc == -EIO) 91 if (rc == -EIO)
91 make_bad_inode(inode); 92 make_bad_inode(inode);
92 iput(inode); 93 goto fail_put;
93 return ERR_PTR(rc);
94 } 94 }
95 95
96 inode->i_uid = current->fsuid; 96 if (insert_inode_locked(inode) < 0) {
97 rc = -EINVAL;
98 goto fail_unlock;
99 }
100
101 inode->i_uid = current_fsuid();
97 if (parent->i_mode & S_ISGID) { 102 if (parent->i_mode & S_ISGID) {
98 inode->i_gid = parent->i_gid; 103 inode->i_gid = parent->i_gid;
99 if (S_ISDIR(mode)) 104 if (S_ISDIR(mode))
100 mode |= S_ISGID; 105 mode |= S_ISGID;
101 } else 106 } else
102 inode->i_gid = current->fsgid; 107 inode->i_gid = current_fsgid();
103 108
104 /* 109 /*
105 * New inodes need to save sane values on disk when 110 * New inodes need to save sane values on disk when
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
112 * Allocate inode to quota. 117 * Allocate inode to quota.
113 */ 118 */
114 if (DQUOT_ALLOC_INODE(inode)) { 119 if (DQUOT_ALLOC_INODE(inode)) {
115 DQUOT_DROP(inode); 120 rc = -EDQUOT;
116 inode->i_flags |= S_NOQUOTA; 121 goto fail_drop;
117 inode->i_nlink = 0;
118 iput(inode);
119 return ERR_PTR(-EDQUOT);
120 } 122 }
121 123
122 inode->i_mode = mode; 124 inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
158 jfs_info("ialloc returns inode = 0x%p\n", inode); 160 jfs_info("ialloc returns inode = 0x%p\n", inode);
159 161
160 return inode; 162 return inode;
163
164fail_drop:
165 DQUOT_DROP(inode);
166 inode->i_flags |= S_NOQUOTA;
167fail_unlock:
168 inode->i_nlink = 0;
169 unlock_new_inode(inode);
170fail_put:
171 iput(inode);
172fail:
173 return ERR_PTR(rc);
161} 174}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa..b4de56b851e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
155 ip->i_fop = &jfs_file_operations; 155 ip->i_fop = &jfs_file_operations;
156 ip->i_mapping->a_ops = &jfs_aops; 156 ip->i_mapping->a_ops = &jfs_aops;
157 157
158 insert_inode_hash(ip);
159 mark_inode_dirty(ip); 158 mark_inode_dirty(ip);
160 159
161 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 160 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
171 if (rc) { 170 if (rc) {
172 free_ea_wmap(ip); 171 free_ea_wmap(ip);
173 ip->i_nlink = 0; 172 ip->i_nlink = 0;
173 unlock_new_inode(ip);
174 iput(ip); 174 iput(ip);
175 } else 175 } else {
176 d_instantiate(dentry, ip); 176 d_instantiate(dentry, ip);
177 unlock_new_inode(ip);
178 }
177 179
178 out2: 180 out2:
179 free_UCSname(&dname); 181 free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
289 ip->i_op = &jfs_dir_inode_operations; 291 ip->i_op = &jfs_dir_inode_operations;
290 ip->i_fop = &jfs_dir_operations; 292 ip->i_fop = &jfs_dir_operations;
291 293
292 insert_inode_hash(ip);
293 mark_inode_dirty(ip); 294 mark_inode_dirty(ip);
294 295
295 /* update parent directory inode */ 296 /* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
306 if (rc) { 307 if (rc) {
307 free_ea_wmap(ip); 308 free_ea_wmap(ip);
308 ip->i_nlink = 0; 309 ip->i_nlink = 0;
310 unlock_new_inode(ip);
309 iput(ip); 311 iput(ip);
310 } else 312 } else {
311 d_instantiate(dentry, ip); 313 d_instantiate(dentry, ip);
314 unlock_new_inode(ip);
315 }
312 316
313 out2: 317 out2:
314 free_UCSname(&dname); 318 free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1019 goto out3; 1023 goto out3;
1020 } 1024 }
1021 1025
1022 insert_inode_hash(ip);
1023 mark_inode_dirty(ip); 1026 mark_inode_dirty(ip);
1024 1027
1025 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 1028 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1039 if (rc) { 1042 if (rc) {
1040 free_ea_wmap(ip); 1043 free_ea_wmap(ip);
1041 ip->i_nlink = 0; 1044 ip->i_nlink = 0;
1045 unlock_new_inode(ip);
1042 iput(ip); 1046 iput(ip);
1043 } else 1047 } else {
1044 d_instantiate(dentry, ip); 1048 d_instantiate(dentry, ip);
1049 unlock_new_inode(ip);
1050 }
1045 1051
1046 out2: 1052 out2:
1047 free_UCSname(&dname); 1053 free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1399 jfs_ip->dev = new_encode_dev(rdev); 1405 jfs_ip->dev = new_encode_dev(rdev);
1400 init_special_inode(ip, ip->i_mode, rdev); 1406 init_special_inode(ip, ip->i_mode, rdev);
1401 1407
1402 insert_inode_hash(ip);
1403 mark_inode_dirty(ip); 1408 mark_inode_dirty(ip);
1404 1409
1405 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1410 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1417 if (rc) { 1422 if (rc) {
1418 free_ea_wmap(ip); 1423 free_ea_wmap(ip);
1419 ip->i_nlink = 0; 1424 ip->i_nlink = 0;
1425 unlock_new_inode(ip);
1420 iput(ip); 1426 iput(ip);
1421 } else 1427 } else {
1422 d_instantiate(dentry, ip); 1428 d_instantiate(dentry, ip);
1429 unlock_new_inode(ip);
1430 }
1423 1431
1424 out1: 1432 out1:
1425 free_UCSname(&dname); 1433 free_UCSname(&dname);
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a832190..bdaec17fa38 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -360,7 +360,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
360 index = pos >> PAGE_CACHE_SHIFT; 360 index = pos >> PAGE_CACHE_SHIFT;
361 from = pos & (PAGE_CACHE_SIZE - 1); 361 from = pos & (PAGE_CACHE_SIZE - 1);
362 362
363 page = __grab_cache_page(mapping, index); 363 page = grab_cache_page_write_begin(mapping, index, flags);
364 if (!page) 364 if (!page)
365 return -ENOMEM; 365 return -ENOMEM;
366 366
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8307dd64bf4..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 15#include <linux/lockd/lockd.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/kthread.h>
17 18
18#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
19 20
@@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
60 61
61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 62 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 63 nlm_init->protocol, nlm_version,
63 nlm_init->hostname); 64 nlm_init->hostname, nlm_init->noresvport);
64 if (host == NULL) { 65 if (host == NULL) {
65 lockd_down(); 66 lockd_down();
66 return ERR_PTR(-ENOLCK); 67 return ERR_PTR(-ENOLCK);
@@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
191void 192void
192nlmclnt_recovery(struct nlm_host *host) 193nlmclnt_recovery(struct nlm_host *host)
193{ 194{
195 struct task_struct *task;
196
194 if (!host->h_reclaiming++) { 197 if (!host->h_reclaiming++) {
195 nlm_get_host(host); 198 nlm_get_host(host);
196 __module_get(THIS_MODULE); 199 task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
197 if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0) 200 if (IS_ERR(task))
198 module_put(THIS_MODULE); 201 printk(KERN_ERR "lockd: unable to spawn reclaimer "
202 "thread. Locks for %s won't be reclaimed! "
203 "(%ld)\n", host->h_name, PTR_ERR(task));
199 } 204 }
200} 205}
201 206
@@ -207,7 +212,6 @@ reclaimer(void *ptr)
207 struct file_lock *fl, *next; 212 struct file_lock *fl, *next;
208 u32 nsmstate; 213 u32 nsmstate;
209 214
210 daemonize("%s-reclaim", host->h_name);
211 allow_signal(SIGKILL); 215 allow_signal(SIGKILL);
212 216
213 down_write(&host->h_rwsem); 217 down_write(&host->h_rwsem);
@@ -233,7 +237,12 @@ restart:
233 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 237 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
234 list_del_init(&fl->fl_u.nfs_fl.list); 238 list_del_init(&fl->fl_u.nfs_fl.list);
235 239
236 /* Why are we leaking memory here? --okir */ 240 /*
241 * sending this thread a SIGKILL will result in any unreclaimed
242 * locks being removed from the h_granted list. This means that
243 * the kernel will not attempt to reclaim them again if a new
244 * reclaimer thread is spawned for this host.
245 */
237 if (signalled()) 246 if (signalled())
238 continue; 247 continue;
239 if (nlmclnt_reclaim(host, fl) != 0) 248 if (nlmclnt_reclaim(host, fl) != 0)
@@ -261,5 +270,5 @@ restart:
261 nlm_release_host(host); 270 nlm_release_host(host);
262 lockd_down(); 271 lockd_down();
263 unlock_kernel(); 272 unlock_kernel();
264 module_put_and_exit(0); 273 return 0;
265} 274}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 70fc63a1727..abdebf76b82 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -48,6 +48,7 @@ struct nlm_lookup_host_info {
48 const size_t hostname_len; /* it's length */ 48 const size_t hostname_len; /* it's length */
49 const struct sockaddr *src_sap; /* our address (optional) */ 49 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* it's length */ 50 const size_t src_len; /* it's length */
51 const int noresvport; /* use non-priv port */
51}; 52};
52 53
53/* 54/*
@@ -115,14 +116,14 @@ static void nlm_display_address(const struct sockaddr *sap,
115 snprintf(buf, len, "unspecified"); 116 snprintf(buf, len, "unspecified");
116 break; 117 break;
117 case AF_INET: 118 case AF_INET:
118 snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); 119 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
119 break; 120 break;
120 case AF_INET6: 121 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr)) 122 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, NIPQUAD_FMT, 123 snprintf(buf, len, "%pI4",
123 NIPQUAD(sin6->sin6_addr.s6_addr32[3])); 124 &sin6->sin6_addr.s6_addr32[3]);
124 else 125 else
125 snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); 126 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
126 break; 127 break;
127 default: 128 default:
128 snprintf(buf, len, "unsupported address family"); 129 snprintf(buf, len, "unsupported address family");
@@ -222,6 +223,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
222 host->h_nsmstate = 0; /* real NSM state */ 223 host->h_nsmstate = 0; /* real NSM state */
223 host->h_nsmhandle = nsm; 224 host->h_nsmhandle = nsm;
224 host->h_server = ni->server; 225 host->h_server = ni->server;
226 host->h_noresvport = ni->noresvport;
225 hlist_add_head(&host->h_hash, chain); 227 hlist_add_head(&host->h_hash, chain);
226 INIT_LIST_HEAD(&host->h_lockowners); 228 INIT_LIST_HEAD(&host->h_lockowners);
227 spin_lock_init(&host->h_lock); 229 spin_lock_init(&host->h_lock);
@@ -272,6 +274,7 @@ nlm_destroy_host(struct nlm_host *host)
272 * @protocol: transport protocol to use 274 * @protocol: transport protocol to use
273 * @version: NLM protocol version 275 * @version: NLM protocol version
274 * @hostname: '\0'-terminated hostname of server 276 * @hostname: '\0'-terminated hostname of server
277 * @noresvport: 1 if non-privileged port should be used
275 * 278 *
276 * Returns an nlm_host structure that matches the passed-in 279 * Returns an nlm_host structure that matches the passed-in
277 * [server address, transport protocol, NLM version, server hostname]. 280 * [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +284,9 @@ nlm_destroy_host(struct nlm_host *host)
281struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, 284struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
282 const size_t salen, 285 const size_t salen,
283 const unsigned short protocol, 286 const unsigned short protocol,
284 const u32 version, const char *hostname) 287 const u32 version,
288 const char *hostname,
289 int noresvport)
285{ 290{
286 const struct sockaddr source = { 291 const struct sockaddr source = {
287 .sa_family = AF_UNSPEC, 292 .sa_family = AF_UNSPEC,
@@ -296,6 +301,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
296 .hostname_len = strlen(hostname), 301 .hostname_len = strlen(hostname),
297 .src_sap = &source, 302 .src_sap = &source,
298 .src_len = sizeof(source), 303 .src_len = sizeof(source),
304 .noresvport = noresvport,
299 }; 305 };
300 306
301 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 307 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -417,6 +423,8 @@ nlm_bind_host(struct nlm_host *host)
417 */ 423 */
418 if (!host->h_server) 424 if (!host->h_server)
419 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 425 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
426 if (host->h_noresvport)
427 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
420 428
421 clnt = rpc_create(&args); 429 clnt = rpc_create(&args);
422 if (!IS_ERR(clnt)) 430 if (!IS_ERR(clnt))
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 4e7e958e8f6..ffd3461f75e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -179,7 +179,7 @@ static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
179 179
180 if (!nsm_use_hostnames) { 180 if (!nsm_use_hostnames) {
181 snprintf(buffer, XDR_ADDRBUF_LEN, 181 snprintf(buffer, XDR_ADDRBUF_LEN,
182 NIPQUAD_FMT, NIPQUAD(argp->addr)); 182 "%pI4", &argp->addr);
183 name = buffer; 183 name = buffer;
184 } 184 }
185 185
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 56b076736b5..252d80163d0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -45,7 +45,7 @@
45static struct svc_program nlmsvc_program; 45static struct svc_program nlmsvc_program;
46 46
47struct nlmsvc_binding * nlmsvc_ops; 47struct nlmsvc_binding * nlmsvc_ops;
48EXPORT_SYMBOL(nlmsvc_ops); 48EXPORT_SYMBOL_GPL(nlmsvc_ops);
49 49
50static DEFINE_MUTEX(nlmsvc_mutex); 50static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 51static unsigned int nlmsvc_users;
@@ -300,7 +300,7 @@ out:
300 mutex_unlock(&nlmsvc_mutex); 300 mutex_unlock(&nlmsvc_mutex);
301 return error; 301 return error;
302} 302}
303EXPORT_SYMBOL(lockd_up); 303EXPORT_SYMBOL_GPL(lockd_up);
304 304
305/* 305/*
306 * Decrement the user count and bring down lockd if we're the last. 306 * Decrement the user count and bring down lockd if we're the last.
@@ -329,7 +329,7 @@ lockd_down(void)
329out: 329out:
330 mutex_unlock(&nlmsvc_mutex); 330 mutex_unlock(&nlmsvc_mutex);
331} 331}
332EXPORT_SYMBOL(lockd_down); 332EXPORT_SYMBOL_GPL(lockd_down);
333 333
334#ifdef CONFIG_SYSCTL 334#ifdef CONFIG_SYSCTL
335 335
diff --git a/fs/locks.c b/fs/locks.c
index 09062e3ff10..46a2e12f7d4 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1349,7 +1349,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
1349 struct inode *inode = dentry->d_inode; 1349 struct inode *inode = dentry->d_inode;
1350 int error, rdlease_count = 0, wrlease_count = 0; 1350 int error, rdlease_count = 0, wrlease_count = 0;
1351 1351
1352 if ((current->fsuid != inode->i_uid) && !capable(CAP_LEASE)) 1352 if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE))
1353 return -EACCES; 1353 return -EACCES;
1354 if (!S_ISREG(inode->i_mode)) 1354 if (!S_ISREG(inode->i_mode))
1355 return -EINVAL; 1355 return -EINVAL;
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 703cc35e04b..3aebe322271 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -262,8 +262,8 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
262 iput(inode); 262 iput(inode);
263 return NULL; 263 return NULL;
264 } 264 }
265 inode->i_uid = current->fsuid; 265 inode->i_uid = current_fsuid();
266 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; 266 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
267 inode->i_ino = j; 267 inode->i_ino = j;
268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
269 inode->i_blocks = 0; 269 inode->i_blocks = 0;
diff --git a/fs/namei.c b/fs/namei.c
index d34e0f9681c..df2d3df4f04 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -186,7 +186,7 @@ int generic_permission(struct inode *inode, int mask,
186 186
187 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 187 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
188 188
189 if (current->fsuid == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
190 mode >>= 6; 190 mode >>= 6;
191 else { 191 else {
192 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 192 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
226 return -EACCES; 226 return -EACCES;
227} 227}
228 228
229/**
230 * inode_permission - check for access rights to a given inode
231 * @inode: inode to check permission on
232 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
233 *
234 * Used to check for read/write/execute permissions on an inode.
235 * We use "fsuid" for this, letting us set arbitrary permissions
236 * for filesystem access without changing the "normal" uids which
237 * are used for other things.
238 */
229int inode_permission(struct inode *inode, int mask) 239int inode_permission(struct inode *inode, int mask)
230{ 240{
231 int retval; 241 int retval;
@@ -247,7 +257,6 @@ int inode_permission(struct inode *inode, int mask)
247 return -EACCES; 257 return -EACCES;
248 } 258 }
249 259
250 /* Ordinary permission routines do not understand MAY_APPEND. */
251 if (inode->i_op && inode->i_op->permission) 260 if (inode->i_op && inode->i_op->permission)
252 retval = inode->i_op->permission(inode, mask); 261 retval = inode->i_op->permission(inode, mask);
253 else 262 else
@@ -265,21 +274,6 @@ int inode_permission(struct inode *inode, int mask)
265} 274}
266 275
267/** 276/**
268 * vfs_permission - check for access rights to a given path
269 * @nd: lookup result that describes the path
270 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
271 *
272 * Used to check for read/write/execute permissions on a path.
273 * We use "fsuid" for this, letting us set arbitrary permissions
274 * for filesystem access without changing the "normal" uids which
275 * are used for other things.
276 */
277int vfs_permission(struct nameidata *nd, int mask)
278{
279 return inode_permission(nd->path.dentry->d_inode, mask);
280}
281
282/**
283 * file_permission - check for additional access rights to a given file 277 * file_permission - check for additional access rights to a given file
284 * @file: file to check access rights for 278 * @file: file to check access rights for
285 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 279 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -289,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
289 * 283 *
290 * Note: 284 * Note:
291 * Do not use this function in new code. All access checks should 285 * Do not use this function in new code. All access checks should
292 * be done using vfs_permission(). 286 * be done using inode_permission().
293 */ 287 */
294int file_permission(struct file *file, int mask) 288int file_permission(struct file *file, int mask)
295{ 289{
@@ -441,7 +435,7 @@ static int exec_permission_lite(struct inode *inode)
441 if (inode->i_op && inode->i_op->permission) 435 if (inode->i_op && inode->i_op->permission)
442 return -EAGAIN; 436 return -EAGAIN;
443 437
444 if (current->fsuid == inode->i_uid) 438 if (current_fsuid() == inode->i_uid)
445 mode >>= 6; 439 mode >>= 6;
446 else if (in_group_p(inode->i_gid)) 440 else if (in_group_p(inode->i_gid))
447 mode >>= 3; 441 mode >>= 3;
@@ -527,18 +521,6 @@ out_unlock:
527 return result; 521 return result;
528} 522}
529 523
530/* SMP-safe */
531static __always_inline void
532walk_init_root(const char *name, struct nameidata *nd)
533{
534 struct fs_struct *fs = current->fs;
535
536 read_lock(&fs->lock);
537 nd->path = fs->root;
538 path_get(&fs->root);
539 read_unlock(&fs->lock);
540}
541
542/* 524/*
543 * Wrapper to retry pathname resolution whenever the underlying 525 * Wrapper to retry pathname resolution whenever the underlying
544 * file system returns an ESTALE. 526 * file system returns an ESTALE.
@@ -576,9 +558,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
576 goto fail; 558 goto fail;
577 559
578 if (*link == '/') { 560 if (*link == '/') {
561 struct fs_struct *fs = current->fs;
562
579 path_put(&nd->path); 563 path_put(&nd->path);
580 walk_init_root(link, nd); 564
565 read_lock(&fs->lock);
566 nd->path = fs->root;
567 path_get(&fs->root);
568 read_unlock(&fs->lock);
581 } 569 }
570
582 res = link_path_walk(link, nd); 571 res = link_path_walk(link, nd);
583 if (nd->depth || res || nd->last_type!=LAST_NORM) 572 if (nd->depth || res || nd->last_type!=LAST_NORM)
584 return res; 573 return res;
@@ -859,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
859 nd->flags |= LOOKUP_CONTINUE; 848 nd->flags |= LOOKUP_CONTINUE;
860 err = exec_permission_lite(inode); 849 err = exec_permission_lite(inode);
861 if (err == -EAGAIN) 850 if (err == -EAGAIN)
862 err = vfs_permission(nd, MAY_EXEC); 851 err = inode_permission(nd->path.dentry->d_inode,
852 MAY_EXEC);
863 if (err) 853 if (err)
864 break; 854 break;
865 855
@@ -1334,11 +1324,13 @@ static int user_path_parent(int dfd, const char __user *path,
1334 */ 1324 */
1335static inline int check_sticky(struct inode *dir, struct inode *inode) 1325static inline int check_sticky(struct inode *dir, struct inode *inode)
1336{ 1326{
1327 uid_t fsuid = current_fsuid();
1328
1337 if (!(dir->i_mode & S_ISVTX)) 1329 if (!(dir->i_mode & S_ISVTX))
1338 return 0; 1330 return 0;
1339 if (inode->i_uid == current->fsuid) 1331 if (inode->i_uid == fsuid)
1340 return 0; 1332 return 0;
1341 if (dir->i_uid == current->fsuid) 1333 if (dir->i_uid == fsuid)
1342 return 0; 1334 return 0;
1343 return !capable(CAP_FOWNER); 1335 return !capable(CAP_FOWNER);
1344} 1336}
@@ -1491,9 +1483,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1491 return error; 1483 return error;
1492} 1484}
1493 1485
1494int may_open(struct nameidata *nd, int acc_mode, int flag) 1486int may_open(struct path *path, int acc_mode, int flag)
1495{ 1487{
1496 struct dentry *dentry = nd->path.dentry; 1488 struct dentry *dentry = path->dentry;
1497 struct inode *inode = dentry->d_inode; 1489 struct inode *inode = dentry->d_inode;
1498 int error; 1490 int error;
1499 1491
@@ -1514,13 +1506,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1514 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 1506 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1515 flag &= ~O_TRUNC; 1507 flag &= ~O_TRUNC;
1516 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { 1508 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1517 if (nd->path.mnt->mnt_flags & MNT_NODEV) 1509 if (path->mnt->mnt_flags & MNT_NODEV)
1518 return -EACCES; 1510 return -EACCES;
1519 1511
1520 flag &= ~O_TRUNC; 1512 flag &= ~O_TRUNC;
1521 } 1513 }
1522 1514
1523 error = vfs_permission(nd, acc_mode); 1515 error = inode_permission(inode, acc_mode);
1524 if (error) 1516 if (error)
1525 return error; 1517 return error;
1526 /* 1518 /*
@@ -1554,6 +1546,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1554 * Refuse to truncate files with mandatory locks held on them. 1546 * Refuse to truncate files with mandatory locks held on them.
1555 */ 1547 */
1556 error = locks_verify_locked(inode); 1548 error = locks_verify_locked(inode);
1549 if (!error)
1550 error = security_path_truncate(path, 0,
1551 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1557 if (!error) { 1552 if (!error) {
1558 DQUOT_INIT(inode); 1553 DQUOT_INIT(inode);
1559 1554
@@ -1584,14 +1579,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1584 1579
1585 if (!IS_POSIXACL(dir->d_inode)) 1580 if (!IS_POSIXACL(dir->d_inode))
1586 mode &= ~current->fs->umask; 1581 mode &= ~current->fs->umask;
1582 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1583 if (error)
1584 goto out_unlock;
1587 error = vfs_create(dir->d_inode, path->dentry, mode, nd); 1585 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1586out_unlock:
1588 mutex_unlock(&dir->d_inode->i_mutex); 1587 mutex_unlock(&dir->d_inode->i_mutex);
1589 dput(nd->path.dentry); 1588 dput(nd->path.dentry);
1590 nd->path.dentry = path->dentry; 1589 nd->path.dentry = path->dentry;
1591 if (error) 1590 if (error)
1592 return error; 1591 return error;
1593 /* Don't check for write permission, don't truncate */ 1592 /* Don't check for write permission, don't truncate */
1594 return may_open(nd, 0, flag & ~O_TRUNC); 1593 return may_open(&nd->path, 0, flag & ~O_TRUNC);
1595} 1594}
1596 1595
1597/* 1596/*
@@ -1777,7 +1776,7 @@ ok:
1777 if (error) 1776 if (error)
1778 goto exit; 1777 goto exit;
1779 } 1778 }
1780 error = may_open(&nd, acc_mode, flag); 1779 error = may_open(&nd.path, acc_mode, flag);
1781 if (error) { 1780 if (error) {
1782 if (will_write) 1781 if (will_write)
1783 mnt_drop_write(nd.path.mnt); 1782 mnt_drop_write(nd.path.mnt);
@@ -1997,6 +1996,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1997 error = mnt_want_write(nd.path.mnt); 1996 error = mnt_want_write(nd.path.mnt);
1998 if (error) 1997 if (error)
1999 goto out_dput; 1998 goto out_dput;
1999 error = security_path_mknod(&nd.path, dentry, mode, dev);
2000 if (error)
2001 goto out_drop_write;
2000 switch (mode & S_IFMT) { 2002 switch (mode & S_IFMT) {
2001 case 0: case S_IFREG: 2003 case 0: case S_IFREG:
2002 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 2004 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2009,6 +2011,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
2009 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2011 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2010 break; 2012 break;
2011 } 2013 }
2014out_drop_write:
2012 mnt_drop_write(nd.path.mnt); 2015 mnt_drop_write(nd.path.mnt);
2013out_dput: 2016out_dput:
2014 dput(dentry); 2017 dput(dentry);
@@ -2068,7 +2071,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2068 error = mnt_want_write(nd.path.mnt); 2071 error = mnt_want_write(nd.path.mnt);
2069 if (error) 2072 if (error)
2070 goto out_dput; 2073 goto out_dput;
2074 error = security_path_mkdir(&nd.path, dentry, mode);
2075 if (error)
2076 goto out_drop_write;
2071 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2077 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2078out_drop_write:
2072 mnt_drop_write(nd.path.mnt); 2079 mnt_drop_write(nd.path.mnt);
2073out_dput: 2080out_dput:
2074 dput(dentry); 2081 dput(dentry);
@@ -2178,7 +2185,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
2178 error = mnt_want_write(nd.path.mnt); 2185 error = mnt_want_write(nd.path.mnt);
2179 if (error) 2186 if (error)
2180 goto exit3; 2187 goto exit3;
2188 error = security_path_rmdir(&nd.path, dentry);
2189 if (error)
2190 goto exit4;
2181 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2191 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2192exit4:
2182 mnt_drop_write(nd.path.mnt); 2193 mnt_drop_write(nd.path.mnt);
2183exit3: 2194exit3:
2184 dput(dentry); 2195 dput(dentry);
@@ -2263,7 +2274,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2263 error = mnt_want_write(nd.path.mnt); 2274 error = mnt_want_write(nd.path.mnt);
2264 if (error) 2275 if (error)
2265 goto exit2; 2276 goto exit2;
2277 error = security_path_unlink(&nd.path, dentry);
2278 if (error)
2279 goto exit3;
2266 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2280 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2281exit3:
2267 mnt_drop_write(nd.path.mnt); 2282 mnt_drop_write(nd.path.mnt);
2268 exit2: 2283 exit2:
2269 dput(dentry); 2284 dput(dentry);
@@ -2344,7 +2359,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
2344 error = mnt_want_write(nd.path.mnt); 2359 error = mnt_want_write(nd.path.mnt);
2345 if (error) 2360 if (error)
2346 goto out_dput; 2361 goto out_dput;
2362 error = security_path_symlink(&nd.path, dentry, from);
2363 if (error)
2364 goto out_drop_write;
2347 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); 2365 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2366out_drop_write:
2348 mnt_drop_write(nd.path.mnt); 2367 mnt_drop_write(nd.path.mnt);
2349out_dput: 2368out_dput:
2350 dput(dentry); 2369 dput(dentry);
@@ -2441,7 +2460,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2441 error = mnt_want_write(nd.path.mnt); 2460 error = mnt_want_write(nd.path.mnt);
2442 if (error) 2461 if (error)
2443 goto out_dput; 2462 goto out_dput;
2463 error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2464 if (error)
2465 goto out_drop_write;
2444 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); 2466 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2467out_drop_write:
2445 mnt_drop_write(nd.path.mnt); 2468 mnt_drop_write(nd.path.mnt);
2446out_dput: 2469out_dput:
2447 dput(new_dentry); 2470 dput(new_dentry);
@@ -2677,8 +2700,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2677 error = mnt_want_write(oldnd.path.mnt); 2700 error = mnt_want_write(oldnd.path.mnt);
2678 if (error) 2701 if (error)
2679 goto exit5; 2702 goto exit5;
2703 error = security_path_rename(&oldnd.path, old_dentry,
2704 &newnd.path, new_dentry);
2705 if (error)
2706 goto exit6;
2680 error = vfs_rename(old_dir->d_inode, old_dentry, 2707 error = vfs_rename(old_dir->d_inode, old_dentry,
2681 new_dir->d_inode, new_dentry); 2708 new_dir->d_inode, new_dentry);
2709exit6:
2682 mnt_drop_write(oldnd.path.mnt); 2710 mnt_drop_write(oldnd.path.mnt);
2683exit5: 2711exit5:
2684 dput(new_dentry); 2712 dput(new_dentry);
@@ -2748,13 +2776,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
2748/* get the link contents into pagecache */ 2776/* get the link contents into pagecache */
2749static char *page_getlink(struct dentry * dentry, struct page **ppage) 2777static char *page_getlink(struct dentry * dentry, struct page **ppage)
2750{ 2778{
2751 struct page * page; 2779 char *kaddr;
2780 struct page *page;
2752 struct address_space *mapping = dentry->d_inode->i_mapping; 2781 struct address_space *mapping = dentry->d_inode->i_mapping;
2753 page = read_mapping_page(mapping, 0, NULL); 2782 page = read_mapping_page(mapping, 0, NULL);
2754 if (IS_ERR(page)) 2783 if (IS_ERR(page))
2755 return (char*)page; 2784 return (char*)page;
2756 *ppage = page; 2785 *ppage = page;
2757 return kmap(page); 2786 kaddr = kmap(page);
2787 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2788 return kaddr;
2758} 2789}
2759 2790
2760int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2791int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2786,18 +2817,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2786 } 2817 }
2787} 2818}
2788 2819
2789int __page_symlink(struct inode *inode, const char *symname, int len, 2820/*
2790 gfp_t gfp_mask) 2821 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2822 */
2823int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2791{ 2824{
2792 struct address_space *mapping = inode->i_mapping; 2825 struct address_space *mapping = inode->i_mapping;
2793 struct page *page; 2826 struct page *page;
2794 void *fsdata; 2827 void *fsdata;
2795 int err; 2828 int err;
2796 char *kaddr; 2829 char *kaddr;
2830 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2831 if (nofs)
2832 flags |= AOP_FLAG_NOFS;
2797 2833
2798retry: 2834retry:
2799 err = pagecache_write_begin(NULL, mapping, 0, len-1, 2835 err = pagecache_write_begin(NULL, mapping, 0, len-1,
2800 AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 2836 flags, &page, &fsdata);
2801 if (err) 2837 if (err)
2802 goto fail; 2838 goto fail;
2803 2839
@@ -2821,7 +2857,7 @@ fail:
2821int page_symlink(struct inode *inode, const char *symname, int len) 2857int page_symlink(struct inode *inode, const char *symname, int len)
2822{ 2858{
2823 return __page_symlink(inode, symname, len, 2859 return __page_symlink(inode, symname, len,
2824 mapping_gfp_mask(inode->i_mapping)); 2860 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2825} 2861}
2826 2862
2827const struct inode_operations page_symlink_inode_operations = { 2863const struct inode_operations page_symlink_inode_operations = {
@@ -2847,7 +2883,6 @@ EXPORT_SYMBOL(path_lookup);
2847EXPORT_SYMBOL(kern_path); 2883EXPORT_SYMBOL(kern_path);
2848EXPORT_SYMBOL(vfs_path_lookup); 2884EXPORT_SYMBOL(vfs_path_lookup);
2849EXPORT_SYMBOL(inode_permission); 2885EXPORT_SYMBOL(inode_permission);
2850EXPORT_SYMBOL(vfs_permission);
2851EXPORT_SYMBOL(file_permission); 2886EXPORT_SYMBOL(file_permission);
2852EXPORT_SYMBOL(unlock_rename); 2887EXPORT_SYMBOL(unlock_rename);
2853EXPORT_SYMBOL(vfs_create); 2888EXPORT_SYMBOL(vfs_create);
@@ -2863,3 +2898,10 @@ EXPORT_SYMBOL(vfs_symlink);
2863EXPORT_SYMBOL(vfs_unlink); 2898EXPORT_SYMBOL(vfs_unlink);
2864EXPORT_SYMBOL(dentry_unhash); 2899EXPORT_SYMBOL(dentry_unhash);
2865EXPORT_SYMBOL(generic_readlink); 2900EXPORT_SYMBOL(generic_readlink);
2901
2902/* to be mentioned only in INIT_TASK */
2903struct fs_struct init_fs = {
2904 .count = ATOMIC_INIT(1),
2905 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2906 .umask = 0022,
2907};
diff --git a/fs/namespace.c b/fs/namespace.c
index 65b3dc844c8..a40685d800a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1176,7 +1176,7 @@ static int mount_is_safe(struct path *path)
1176 if (S_ISLNK(path->dentry->d_inode->i_mode)) 1176 if (S_ISLNK(path->dentry->d_inode->i_mode))
1177 return -EPERM; 1177 return -EPERM;
1178 if (path->dentry->d_inode->i_mode & S_ISVTX) { 1178 if (path->dentry->d_inode->i_mode & S_ISVTX) {
1179 if (current->uid != path->dentry->d_inode->i_uid) 1179 if (current_uid() != path->dentry->d_inode->i_uid)
1180 return -EPERM; 1180 return -EPERM;
1181 } 1181 }
1182 if (inode_permission(path->dentry->d_inode, MAY_WRITE)) 1182 if (inode_permission(path->dentry->d_inode, MAY_WRITE))
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1990 if (!new_ns->root) { 1990 if (!new_ns->root) {
1991 up_write(&namespace_sem); 1991 up_write(&namespace_sem);
1992 kfree(new_ns); 1992 kfree(new_ns);
1993 return ERR_PTR(-ENOMEM);; 1993 return ERR_PTR(-ENOMEM);
1994 } 1994 }
1995 spin_lock(&vfsmount_lock); 1995 spin_lock(&vfsmount_lock);
1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 3a97c95e1ca..6d04e050c74 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -40,10 +40,10 @@ ncp_get_fs_info(struct ncp_server * server, struct file *file,
40 struct inode *inode = file->f_path.dentry->d_inode; 40 struct inode *inode = file->f_path.dentry->d_inode;
41 struct ncp_fs_info info; 41 struct ncp_fs_info info;
42 42
43 if ((file_permission(file, MAY_WRITE) != 0) 43 if (file_permission(file, MAY_WRITE) != 0
44 && (current->uid != server->m.mounted_uid)) { 44 && current_uid() != server->m.mounted_uid)
45 return -EACCES; 45 return -EACCES;
46 } 46
47 if (copy_from_user(&info, arg, sizeof(info))) 47 if (copy_from_user(&info, arg, sizeof(info)))
48 return -EFAULT; 48 return -EFAULT;
49 49
@@ -70,10 +70,10 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
70 struct inode *inode = file->f_path.dentry->d_inode; 70 struct inode *inode = file->f_path.dentry->d_inode;
71 struct ncp_fs_info_v2 info2; 71 struct ncp_fs_info_v2 info2;
72 72
73 if ((file_permission(file, MAY_WRITE) != 0) 73 if (file_permission(file, MAY_WRITE) != 0
74 && (current->uid != server->m.mounted_uid)) { 74 && current_uid() != server->m.mounted_uid)
75 return -EACCES; 75 return -EACCES;
76 } 76
77 if (copy_from_user(&info2, arg, sizeof(info2))) 77 if (copy_from_user(&info2, arg, sizeof(info2)))
78 return -EFAULT; 78 return -EFAULT;
79 79
@@ -141,10 +141,10 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
141 struct inode *inode = file->f_path.dentry->d_inode; 141 struct inode *inode = file->f_path.dentry->d_inode;
142 struct compat_ncp_fs_info_v2 info2; 142 struct compat_ncp_fs_info_v2 info2;
143 143
144 if ((file_permission(file, MAY_WRITE) != 0) 144 if (file_permission(file, MAY_WRITE) != 0
145 && (current->uid != server->m.mounted_uid)) { 145 && current_uid() != server->m.mounted_uid)
146 return -EACCES; 146 return -EACCES;
147 } 147
148 if (copy_from_user(&info2, arg, sizeof(info2))) 148 if (copy_from_user(&info2, arg, sizeof(info2)))
149 return -EFAULT; 149 return -EFAULT;
150 150
@@ -270,16 +270,17 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
270 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
271 char* bouncebuffer; 271 char* bouncebuffer;
272 void __user *argp = (void __user *)arg; 272 void __user *argp = (void __user *)arg;
273 uid_t uid = current_uid();
273 274
274 switch (cmd) { 275 switch (cmd) {
275#ifdef CONFIG_COMPAT 276#ifdef CONFIG_COMPAT
276 case NCP_IOC_NCPREQUEST_32: 277 case NCP_IOC_NCPREQUEST_32:
277#endif 278#endif
278 case NCP_IOC_NCPREQUEST: 279 case NCP_IOC_NCPREQUEST:
279 if ((file_permission(filp, MAY_WRITE) != 0) 280 if (file_permission(filp, MAY_WRITE) != 0
280 && (current->uid != server->m.mounted_uid)) { 281 && uid != server->m.mounted_uid)
281 return -EACCES; 282 return -EACCES;
282 } 283
283#ifdef CONFIG_COMPAT 284#ifdef CONFIG_COMPAT
284 if (cmd == NCP_IOC_NCPREQUEST_32) { 285 if (cmd == NCP_IOC_NCPREQUEST_32) {
285 struct compat_ncp_ioctl_request request32; 286 struct compat_ncp_ioctl_request request32;
@@ -356,10 +357,10 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
356 case NCP_IOC_GETMOUNTUID16: 357 case NCP_IOC_GETMOUNTUID16:
357 case NCP_IOC_GETMOUNTUID32: 358 case NCP_IOC_GETMOUNTUID32:
358 case NCP_IOC_GETMOUNTUID64: 359 case NCP_IOC_GETMOUNTUID64:
359 if ((file_permission(filp, MAY_READ) != 0) 360 if (file_permission(filp, MAY_READ) != 0
360 && (current->uid != server->m.mounted_uid)) { 361 && uid != server->m.mounted_uid)
361 return -EACCES; 362 return -EACCES;
362 } 363
363 if (cmd == NCP_IOC_GETMOUNTUID16) { 364 if (cmd == NCP_IOC_GETMOUNTUID16) {
364 u16 uid; 365 u16 uid;
365 SET_UID(uid, server->m.mounted_uid); 366 SET_UID(uid, server->m.mounted_uid);
@@ -380,11 +381,10 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
380 { 381 {
381 struct ncp_setroot_ioctl sr; 382 struct ncp_setroot_ioctl sr;
382 383
383 if ((file_permission(filp, MAY_READ) != 0) 384 if (file_permission(filp, MAY_READ) != 0
384 && (current->uid != server->m.mounted_uid)) 385 && uid != server->m.mounted_uid)
385 {
386 return -EACCES; 386 return -EACCES;
387 } 387
388 if (server->m.mounted_vol[0]) { 388 if (server->m.mounted_vol[0]) {
389 struct dentry* dentry = inode->i_sb->s_root; 389 struct dentry* dentry = inode->i_sb->s_root;
390 390
@@ -408,6 +408,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
408 return -EFAULT; 408 return -EFAULT;
409 return 0; 409 return 0;
410 } 410 }
411
411 case NCP_IOC_SETROOT: 412 case NCP_IOC_SETROOT:
412 { 413 {
413 struct ncp_setroot_ioctl sr; 414 struct ncp_setroot_ioctl sr;
@@ -455,11 +456,10 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
455 456
456#ifdef CONFIG_NCPFS_PACKET_SIGNING 457#ifdef CONFIG_NCPFS_PACKET_SIGNING
457 case NCP_IOC_SIGN_INIT: 458 case NCP_IOC_SIGN_INIT:
458 if ((file_permission(filp, MAY_WRITE) != 0) 459 if (file_permission(filp, MAY_WRITE) != 0
459 && (current->uid != server->m.mounted_uid)) 460 && uid != server->m.mounted_uid)
460 {
461 return -EACCES; 461 return -EACCES;
462 } 462
463 if (argp) { 463 if (argp) {
464 if (server->sign_wanted) 464 if (server->sign_wanted)
465 { 465 {
@@ -478,24 +478,22 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
478 return 0; 478 return 0;
479 479
480 case NCP_IOC_SIGN_WANTED: 480 case NCP_IOC_SIGN_WANTED:
481 if ((file_permission(filp, MAY_READ) != 0) 481 if (file_permission(filp, MAY_READ) != 0
482 && (current->uid != server->m.mounted_uid)) 482 && uid != server->m.mounted_uid)
483 {
484 return -EACCES; 483 return -EACCES;
485 }
486 484
487 if (put_user(server->sign_wanted, (int __user *)argp)) 485 if (put_user(server->sign_wanted, (int __user *)argp))
488 return -EFAULT; 486 return -EFAULT;
489 return 0; 487 return 0;
488
490 case NCP_IOC_SET_SIGN_WANTED: 489 case NCP_IOC_SET_SIGN_WANTED:
491 { 490 {
492 int newstate; 491 int newstate;
493 492
494 if ((file_permission(filp, MAY_WRITE) != 0) 493 if (file_permission(filp, MAY_WRITE) != 0
495 && (current->uid != server->m.mounted_uid)) 494 && uid != server->m.mounted_uid)
496 {
497 return -EACCES; 495 return -EACCES;
498 } 496
499 /* get only low 8 bits... */ 497 /* get only low 8 bits... */
500 if (get_user(newstate, (unsigned char __user *)argp)) 498 if (get_user(newstate, (unsigned char __user *)argp))
501 return -EFAULT; 499 return -EFAULT;
@@ -512,11 +510,10 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
512 510
513#ifdef CONFIG_NCPFS_IOCTL_LOCKING 511#ifdef CONFIG_NCPFS_IOCTL_LOCKING
514 case NCP_IOC_LOCKUNLOCK: 512 case NCP_IOC_LOCKUNLOCK:
515 if ((file_permission(filp, MAY_WRITE) != 0) 513 if (file_permission(filp, MAY_WRITE) != 0
516 && (current->uid != server->m.mounted_uid)) 514 && uid != server->m.mounted_uid)
517 {
518 return -EACCES; 515 return -EACCES;
519 } 516
520 { 517 {
521 struct ncp_lock_ioctl rqdata; 518 struct ncp_lock_ioctl rqdata;
522 519
@@ -585,9 +582,8 @@ outrel:
585 582
586#ifdef CONFIG_COMPAT 583#ifdef CONFIG_COMPAT
587 case NCP_IOC_GETOBJECTNAME_32: 584 case NCP_IOC_GETOBJECTNAME_32:
588 if (current->uid != server->m.mounted_uid) { 585 if (uid != server->m.mounted_uid)
589 return -EACCES; 586 return -EACCES;
590 }
591 { 587 {
592 struct compat_ncp_objectname_ioctl user; 588 struct compat_ncp_objectname_ioctl user;
593 size_t outl; 589 size_t outl;
@@ -609,10 +605,10 @@ outrel:
609 return 0; 605 return 0;
610 } 606 }
611#endif 607#endif
608
612 case NCP_IOC_GETOBJECTNAME: 609 case NCP_IOC_GETOBJECTNAME:
613 if (current->uid != server->m.mounted_uid) { 610 if (uid != server->m.mounted_uid)
614 return -EACCES; 611 return -EACCES;
615 }
616 { 612 {
617 struct ncp_objectname_ioctl user; 613 struct ncp_objectname_ioctl user;
618 size_t outl; 614 size_t outl;
@@ -633,13 +629,13 @@ outrel:
633 return -EFAULT; 629 return -EFAULT;
634 return 0; 630 return 0;
635 } 631 }
632
636#ifdef CONFIG_COMPAT 633#ifdef CONFIG_COMPAT
637 case NCP_IOC_SETOBJECTNAME_32: 634 case NCP_IOC_SETOBJECTNAME_32:
638#endif 635#endif
639 case NCP_IOC_SETOBJECTNAME: 636 case NCP_IOC_SETOBJECTNAME:
640 if (current->uid != server->m.mounted_uid) { 637 if (uid != server->m.mounted_uid)
641 return -EACCES; 638 return -EACCES;
642 }
643 { 639 {
644 struct ncp_objectname_ioctl user; 640 struct ncp_objectname_ioctl user;
645 void* newname; 641 void* newname;
@@ -691,13 +687,13 @@ outrel:
691 kfree(oldname); 687 kfree(oldname);
692 return 0; 688 return 0;
693 } 689 }
690
694#ifdef CONFIG_COMPAT 691#ifdef CONFIG_COMPAT
695 case NCP_IOC_GETPRIVATEDATA_32: 692 case NCP_IOC_GETPRIVATEDATA_32:
696#endif 693#endif
697 case NCP_IOC_GETPRIVATEDATA: 694 case NCP_IOC_GETPRIVATEDATA:
698 if (current->uid != server->m.mounted_uid) { 695 if (uid != server->m.mounted_uid)
699 return -EACCES; 696 return -EACCES;
700 }
701 { 697 {
702 struct ncp_privatedata_ioctl user; 698 struct ncp_privatedata_ioctl user;
703 size_t outl; 699 size_t outl;
@@ -736,13 +732,13 @@ outrel:
736 732
737 return 0; 733 return 0;
738 } 734 }
735
739#ifdef CONFIG_COMPAT 736#ifdef CONFIG_COMPAT
740 case NCP_IOC_SETPRIVATEDATA_32: 737 case NCP_IOC_SETPRIVATEDATA_32:
741#endif 738#endif
742 case NCP_IOC_SETPRIVATEDATA: 739 case NCP_IOC_SETPRIVATEDATA:
743 if (current->uid != server->m.mounted_uid) { 740 if (uid != server->m.mounted_uid)
744 return -EACCES; 741 return -EACCES;
745 }
746 { 742 {
747 struct ncp_privatedata_ioctl user; 743 struct ncp_privatedata_ioctl user;
748 void* new; 744 void* new;
@@ -794,9 +790,10 @@ outrel:
794#endif /* CONFIG_NCPFS_NLS */ 790#endif /* CONFIG_NCPFS_NLS */
795 791
796 case NCP_IOC_SETDENTRYTTL: 792 case NCP_IOC_SETDENTRYTTL:
797 if ((file_permission(filp, MAY_WRITE) != 0) && 793 if (file_permission(filp, MAY_WRITE) != 0 &&
798 (current->uid != server->m.mounted_uid)) 794 uid != server->m.mounted_uid)
799 return -EACCES; 795 return -EACCES;
796
800 { 797 {
801 u_int32_t user; 798 u_int32_t user;
802 799
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c2e9cfd9e5a..3e634f2a108 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h>
19 20
20#include <net/inet_sock.h> 21#include <net/inet_sock.h>
21 22
@@ -182,10 +183,34 @@ void nfs_callback_down(void)
182 mutex_unlock(&nfs_callback_mutex); 183 mutex_unlock(&nfs_callback_mutex);
183} 184}
184 185
186static int check_gss_callback_principal(struct nfs_client *clp,
187 struct svc_rqst *rqstp)
188{
189 struct rpc_clnt *r = clp->cl_rpcclient;
190 char *p = svc_gss_principal(rqstp);
191
192 /*
193 * It might just be a normal user principal, in which case
194 * userspace won't bother to tell us the name at all.
195 */
196 if (p == NULL)
197 return SVC_DENIED;
198
199 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
200
201 if (memcmp(p, "nfs@", 4) != 0)
202 return SVC_DENIED;
203 p += 4;
204 if (strcmp(p, r->cl_server) != 0)
205 return SVC_DENIED;
206 return SVC_OK;
207}
208
185static int nfs_callback_authenticate(struct svc_rqst *rqstp) 209static int nfs_callback_authenticate(struct svc_rqst *rqstp)
186{ 210{
187 struct nfs_client *clp; 211 struct nfs_client *clp;
188 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 212 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
213 int ret = SVC_OK;
189 214
190 /* Don't talk to strangers */ 215 /* Don't talk to strangers */
191 clp = nfs_find_client(svc_addr(rqstp), 4); 216 clp = nfs_find_client(svc_addr(rqstp), 4);
@@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
194 219
195 dprintk("%s: %s NFSv4 callback!\n", __func__, 220 dprintk("%s: %s NFSv4 callback!\n", __func__,
196 svc_print_addr(rqstp, buf, sizeof(buf))); 221 svc_print_addr(rqstp, buf, sizeof(buf)));
197 nfs_put_client(clp);
198 222
199 switch (rqstp->rq_authop->flavour) { 223 switch (rqstp->rq_authop->flavour) {
200 case RPC_AUTH_NULL: 224 case RPC_AUTH_NULL:
201 if (rqstp->rq_proc != CB_NULL) 225 if (rqstp->rq_proc != CB_NULL)
202 return SVC_DENIED; 226 ret = SVC_DENIED;
203 break; 227 break;
204 case RPC_AUTH_UNIX: 228 case RPC_AUTH_UNIX:
205 break; 229 break;
206 case RPC_AUTH_GSS: 230 case RPC_AUTH_GSS:
207 /* FIXME: RPCSEC_GSS handling? */ 231 ret = check_gss_callback_principal(clp, rqstp);
232 break;
208 default: 233 default:
209 return SVC_DENIED; 234 ret = SVC_DENIED;
210 } 235 }
211 return SVC_OK; 236 nfs_put_client(clp);
237 return ret;
212} 238}
213 239
214/* 240/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7547600b617..9b728f3565a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 143 clp->cl_proto = cl_init->proto;
144 144
145#ifdef CONFIG_NFS_V4 145#ifdef CONFIG_NFS_V4
146 init_rwsem(&clp->cl_sem);
147 INIT_LIST_HEAD(&clp->cl_delegations); 146 INIT_LIST_HEAD(&clp->cl_delegations);
148 spin_lock_init(&clp->cl_lock); 147 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
@@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp)
224 } 223 }
225} 224}
226 225
227static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
228 const struct sockaddr_in *sa2) 227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
229{ 228{
230 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; 229 switch (sa->sa_family) {
230 default:
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
231} 240}
232 241
233static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, 242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
234 const struct sockaddr_in6 *sa2) 243 const struct sockaddr *sa2)
244{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0;
257}
258#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2)
235{ 261{
236 return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); 262 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
237} 263}
238 264
239static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 265static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
240 const struct sockaddr *sa2) 266 const struct sockaddr *sa2)
241{ 267{
242 switch (sa1->sa_family) { 268 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
243 case AF_INET: 269 return 0;
244 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
245 (const struct sockaddr_in *)sa2); 271 (const struct sockaddr_in *)sa2);
246 case AF_INET6:
247 return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
248 (const struct sockaddr_in6 *)sa2);
249 }
250 BUG();
251} 272}
273#endif
252 274
253/* 275/*
254 * Find a client by IP address and protocol version 276 * Find a client by IP address and protocol version
@@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
270 if (clp->rpc_ops->version != nfsversion) 292 if (clp->rpc_ops->version != nfsversion)
271 continue; 293 continue;
272 294
273 if (addr->sa_family != clap->sa_family)
274 continue;
275 /* Match only the IP address, not the port number */ 295 /* Match only the IP address, not the port number */
276 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 296 if (!nfs_sockaddr_match_ipaddr(addr, clap))
277 continue; 297 continue;
@@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
305 if (clp->rpc_ops->version != nfsvers) 325 if (clp->rpc_ops->version != nfsvers)
306 continue; 326 continue;
307 327
308 if (sap->sa_family != clap->sa_family)
309 continue;
310 /* Match only the IP address, not the port number */ 328 /* Match only the IP address, not the port number */
311 if (!nfs_sockaddr_match_ipaddr(sap, clap)) 329 if (!nfs_sockaddr_match_ipaddr(sap, clap))
312 continue; 330 continue;
@@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
470static int nfs_create_rpc_client(struct nfs_client *clp, 488static int nfs_create_rpc_client(struct nfs_client *clp,
471 const struct rpc_timeout *timeparms, 489 const struct rpc_timeout *timeparms,
472 rpc_authflavor_t flavor, 490 rpc_authflavor_t flavor,
473 int flags) 491 int discrtry, int noresvport)
474{ 492{
475 struct rpc_clnt *clnt = NULL; 493 struct rpc_clnt *clnt = NULL;
476 struct rpc_create_args args = { 494 struct rpc_create_args args = {
@@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
482 .program = &nfs_program, 500 .program = &nfs_program,
483 .version = clp->rpc_ops->version, 501 .version = clp->rpc_ops->version,
484 .authflavor = flavor, 502 .authflavor = flavor,
485 .flags = flags,
486 }; 503 };
487 504
505 if (discrtry)
506 args.flags |= RPC_CLNT_CREATE_DISCRTRY;
507 if (noresvport)
508 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
509
488 if (!IS_ERR(clp->cl_rpcclient)) 510 if (!IS_ERR(clp->cl_rpcclient))
489 return 0; 511 return 0;
490 512
@@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server)
522 .protocol = server->flags & NFS_MOUNT_TCP ? 544 .protocol = server->flags & NFS_MOUNT_TCP ?
523 IPPROTO_TCP : IPPROTO_UDP, 545 IPPROTO_TCP : IPPROTO_UDP,
524 .nfs_version = clp->rpc_ops->version, 546 .nfs_version = clp->rpc_ops->version,
547 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
548 1 : 0,
525 }; 549 };
526 550
527 if (nlm_init.nfs_version > 3) 551 if (nlm_init.nfs_version > 3)
@@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp,
623 * Create a client RPC handle for doing FSSTAT with UNIX auth only 647 * Create a client RPC handle for doing FSSTAT with UNIX auth only
624 * - RFC 2623, sec 2.3.2 648 * - RFC 2623, sec 2.3.2
625 */ 649 */
626 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); 650 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
651 0, data->flags & NFS_MOUNT_NORESVPORT);
627 if (error < 0) 652 if (error < 0)
628 goto error; 653 goto error;
629 nfs_mark_client_ready(clp, NFS_CS_READY); 654 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -965,7 +990,8 @@ error:
965static int nfs4_init_client(struct nfs_client *clp, 990static int nfs4_init_client(struct nfs_client *clp,
966 const struct rpc_timeout *timeparms, 991 const struct rpc_timeout *timeparms,
967 const char *ip_addr, 992 const char *ip_addr,
968 rpc_authflavor_t authflavour) 993 rpc_authflavor_t authflavour,
994 int flags)
969{ 995{
970 int error; 996 int error;
971 997
@@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp,
979 clp->rpc_ops = &nfs_v4_clientops; 1005 clp->rpc_ops = &nfs_v4_clientops;
980 1006
981 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1007 error = nfs_create_rpc_client(clp, timeparms, authflavour,
982 RPC_CLNT_CREATE_DISCRTRY); 1008 1, flags & NFS_MOUNT_NORESVPORT);
983 if (error < 0) 1009 if (error < 0)
984 goto error; 1010 goto error;
985 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1011 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server,
1030 error = PTR_ERR(clp); 1056 error = PTR_ERR(clp);
1031 goto error; 1057 goto error;
1032 } 1058 }
1033 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); 1059 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
1060 server->flags);
1034 if (error < 0) 1061 if (error < 0)
1035 goto error_put; 1062 goto error_put;
1036 1063
@@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server,
1059 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, 1086 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
1060 data->timeo, data->retrans); 1087 data->timeo, data->retrans);
1061 1088
1089 /* Initialise the client representation from the mount data */
1090 server->flags = data->flags;
1091 server->caps |= NFS_CAP_ATOMIC_OPEN;
1092
1062 /* Get a client record */ 1093 /* Get a client record */
1063 error = nfs4_set_client(server, 1094 error = nfs4_set_client(server,
1064 data->nfs_server.hostname, 1095 data->nfs_server.hostname,
@@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server,
1071 if (error < 0) 1102 if (error < 0)
1072 goto error; 1103 goto error;
1073 1104
1074 /* Initialise the client representation from the mount data */
1075 server->flags = data->flags;
1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1077
1078 if (data->rsize) 1105 if (data->rsize)
1079 server->rsize = nfs_block_size(data->rsize, NULL); 1106 server->rsize = nfs_block_size(data->rsize, NULL);
1080 if (data->wsize) 1107 if (data->wsize)
@@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1177 parent_server = NFS_SB(data->sb); 1204 parent_server = NFS_SB(data->sb);
1178 parent_client = parent_server->nfs_client; 1205 parent_client = parent_server->nfs_client;
1179 1206
1207 /* Initialise the client representation from the parent server */
1208 nfs_server_copy_userdata(server, parent_server);
1209 server->caps |= NFS_CAP_ATOMIC_OPEN;
1210
1180 /* Get a client representation. 1211 /* Get a client representation.
1181 * Note: NFSv4 always uses TCP, */ 1212 * Note: NFSv4 always uses TCP, */
1182 error = nfs4_set_client(server, data->hostname, 1213 error = nfs4_set_client(server, data->hostname,
@@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1189 if (error < 0) 1220 if (error < 0)
1190 goto error; 1221 goto error;
1191 1222
1192 /* Initialise the client representation from the parent server */
1193 nfs_server_copy_userdata(server, parent_server);
1194 server->caps |= NFS_CAP_ATOMIC_OPEN;
1195
1196 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); 1223 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
1197 if (error < 0) 1224 if (error < 0)
1198 goto error; 1225 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cc563cfa694..968225a8801 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
43 put_rpccred(cred); 43 put_rpccred(cred);
44} 44}
45 45
46void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
47{
48 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
49}
50
51int nfs_have_delegation(struct inode *inode, fmode_t flags)
52{
53 struct nfs_delegation *delegation;
54 int ret = 0;
55
56 flags &= FMODE_READ|FMODE_WRITE;
57 rcu_read_lock();
58 delegation = rcu_dereference(NFS_I(inode)->delegation);
59 if (delegation != NULL && (delegation->type & flags) == flags) {
60 nfs_mark_delegation_referenced(delegation);
61 ret = 1;
62 }
63 rcu_read_unlock();
64 return ret;
65}
66
46static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) 67static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
47{ 68{
48 struct inode *inode = state->inode; 69 struct inode *inode = state->inode;
@@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
119 delegation->maxsize = res->maxsize; 140 delegation->maxsize = res->maxsize;
120 oldcred = delegation->cred; 141 oldcred = delegation->cred;
121 delegation->cred = get_rpccred(cred); 142 delegation->cred = get_rpccred(cred);
122 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 143 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
123 NFS_I(inode)->delegation_state = delegation->type; 144 NFS_I(inode)->delegation_state = delegation->type;
124 smp_wmb(); 145 smp_wmb();
125 put_rpccred(oldcred); 146 put_rpccred(oldcred);
@@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
134 return res; 155 return res;
135} 156}
136 157
158static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
159{
160 struct inode *inode = NULL;
161
162 spin_lock(&delegation->lock);
163 if (delegation->inode != NULL)
164 inode = igrab(delegation->inode);
165 spin_unlock(&delegation->lock);
166 return inode;
167}
168
137static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 169static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
138{ 170{
139 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 171 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
140 172
141 if (delegation == NULL) 173 if (delegation == NULL)
142 goto nomatch; 174 goto nomatch;
175 spin_lock(&delegation->lock);
143 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, 176 if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
144 sizeof(delegation->stateid.data)) != 0) 177 sizeof(delegation->stateid.data)) != 0)
145 goto nomatch; 178 goto nomatch_unlock;
146 list_del_rcu(&delegation->super_list); 179 list_del_rcu(&delegation->super_list);
180 delegation->inode = NULL;
147 nfsi->delegation_state = 0; 181 nfsi->delegation_state = 0;
148 rcu_assign_pointer(nfsi->delegation, NULL); 182 rcu_assign_pointer(nfsi->delegation, NULL);
183 spin_unlock(&delegation->lock);
149 return delegation; 184 return delegation;
185nomatch_unlock:
186 spin_unlock(&delegation->lock);
150nomatch: 187nomatch:
151 return NULL; 188 return NULL;
152} 189}
@@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
172 delegation->change_attr = nfsi->change_attr; 209 delegation->change_attr = nfsi->change_attr;
173 delegation->cred = get_rpccred(cred); 210 delegation->cred = get_rpccred(cred);
174 delegation->inode = inode; 211 delegation->inode = inode;
212 delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
213 spin_lock_init(&delegation->lock);
175 214
176 spin_lock(&clp->cl_lock); 215 spin_lock(&clp->cl_lock);
177 if (rcu_dereference(nfsi->delegation) != NULL) { 216 if (rcu_dereference(nfsi->delegation) != NULL) {
@@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode)
226 */ 265 */
227static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation) 266static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
228{ 267{
229 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
230 struct nfs_inode *nfsi = NFS_I(inode); 268 struct nfs_inode *nfsi = NFS_I(inode);
231 269
232 nfs_msync_inode(inode); 270 nfs_msync_inode(inode);
233 down_read(&clp->cl_sem);
234 /* Guard against new delegated open calls */ 271 /* Guard against new delegated open calls */
235 down_write(&nfsi->rwsem); 272 down_write(&nfsi->rwsem);
236 nfs_delegation_claim_opens(inode, &delegation->stateid); 273 nfs_delegation_claim_opens(inode, &delegation->stateid);
237 up_write(&nfsi->rwsem); 274 up_write(&nfsi->rwsem);
238 up_read(&clp->cl_sem);
239 nfs_msync_inode(inode); 275 nfs_msync_inode(inode);
240 276
241 return nfs_do_return_delegation(inode, delegation, 1); 277 return nfs_do_return_delegation(inode, delegation, 1);
242} 278}
243 279
244/* 280/*
281 * Return all delegations that have been marked for return
282 */
283void nfs_client_return_marked_delegations(struct nfs_client *clp)
284{
285 struct nfs_delegation *delegation;
286 struct inode *inode;
287
288restart:
289 rcu_read_lock();
290 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
291 if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
292 continue;
293 inode = nfs_delegation_grab_inode(delegation);
294 if (inode == NULL)
295 continue;
296 spin_lock(&clp->cl_lock);
297 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
298 spin_unlock(&clp->cl_lock);
299 rcu_read_unlock();
300 if (delegation != NULL)
301 __nfs_inode_return_delegation(inode, delegation);
302 iput(inode);
303 goto restart;
304 }
305 rcu_read_unlock();
306}
307
308/*
245 * This function returns the delegation without reclaiming opens 309 * This function returns the delegation without reclaiming opens
246 * or protecting against delegation reclaims. 310 * or protecting against delegation reclaims.
247 * It is therefore really only safe to be called from 311 * It is therefore really only safe to be called from
@@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode)
279 return err; 343 return err;
280} 344}
281 345
346static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
347{
348 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
349 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
350}
351
282/* 352/*
283 * Return all delegations associated to a super block 353 * Return all delegations associated to a super block
284 */ 354 */
285void nfs_return_all_delegations(struct super_block *sb) 355void nfs_super_return_all_delegations(struct super_block *sb)
286{ 356{
287 struct nfs_client *clp = NFS_SB(sb)->nfs_client; 357 struct nfs_client *clp = NFS_SB(sb)->nfs_client;
288 struct nfs_delegation *delegation; 358 struct nfs_delegation *delegation;
289 struct inode *inode;
290 359
291 if (clp == NULL) 360 if (clp == NULL)
292 return; 361 return;
293restart:
294 rcu_read_lock(); 362 rcu_read_lock();
295 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 363 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
296 if (delegation->inode->i_sb != sb) 364 spin_lock(&delegation->lock);
297 continue; 365 if (delegation->inode != NULL && delegation->inode->i_sb == sb)
298 inode = igrab(delegation->inode); 366 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
299 if (inode == NULL) 367 spin_unlock(&delegation->lock);
300 continue;
301 spin_lock(&clp->cl_lock);
302 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
303 spin_unlock(&clp->cl_lock);
304 rcu_read_unlock();
305 if (delegation != NULL)
306 __nfs_inode_return_delegation(inode, delegation);
307 iput(inode);
308 goto restart;
309 } 368 }
310 rcu_read_unlock(); 369 rcu_read_unlock();
370 nfs_client_return_marked_delegations(clp);
311} 371}
312 372
313static int nfs_do_expire_all_delegations(void *ptr) 373static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
314{ 374{
315 struct nfs_client *clp = ptr;
316 struct nfs_delegation *delegation; 375 struct nfs_delegation *delegation;
317 struct inode *inode;
318 376
319 allow_signal(SIGKILL);
320restart:
321 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
322 goto out;
323 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
324 goto out;
325 rcu_read_lock(); 377 rcu_read_lock();
326 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 378 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
327 inode = igrab(delegation->inode); 379 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
328 if (inode == NULL) 380 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
329 continue;
330 spin_lock(&clp->cl_lock);
331 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
332 spin_unlock(&clp->cl_lock);
333 rcu_read_unlock();
334 if (delegation)
335 __nfs_inode_return_delegation(inode, delegation);
336 iput(inode);
337 goto restart;
338 } 381 }
339 rcu_read_unlock(); 382 rcu_read_unlock();
340out: 383}
341 nfs_put_client(clp); 384
342 module_put_and_exit(0); 385static void nfs_delegation_run_state_manager(struct nfs_client *clp)
386{
387 if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
388 nfs4_schedule_state_manager(clp);
343} 389}
344 390
345void nfs_expire_all_delegations(struct nfs_client *clp) 391void nfs_expire_all_delegations(struct nfs_client *clp)
346{ 392{
347 struct task_struct *task; 393 nfs_client_mark_return_all_delegations(clp);
348 394 nfs_delegation_run_state_manager(clp);
349 __module_get(THIS_MODULE);
350 atomic_inc(&clp->cl_count);
351 task = kthread_run(nfs_do_expire_all_delegations, clp,
352 "%s-delegreturn",
353 rpc_peeraddr2str(clp->cl_rpcclient,
354 RPC_DISPLAY_ADDR));
355 if (!IS_ERR(task))
356 return;
357 nfs_put_client(clp);
358 module_put(THIS_MODULE);
359} 395}
360 396
361/* 397/*
@@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
363 */ 399 */
364void nfs_handle_cb_pathdown(struct nfs_client *clp) 400void nfs_handle_cb_pathdown(struct nfs_client *clp)
365{ 401{
366 struct nfs_delegation *delegation;
367 struct inode *inode;
368
369 if (clp == NULL) 402 if (clp == NULL)
370 return; 403 return;
371restart: 404 nfs_client_mark_return_all_delegations(clp);
405}
406
407static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
408{
409 struct nfs_delegation *delegation;
410
372 rcu_read_lock(); 411 rcu_read_lock();
373 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 412 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
374 inode = igrab(delegation->inode); 413 if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
375 if (inode == NULL)
376 continue; 414 continue;
377 spin_lock(&clp->cl_lock); 415 set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
378 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 416 set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
379 spin_unlock(&clp->cl_lock);
380 rcu_read_unlock();
381 if (delegation != NULL)
382 __nfs_inode_return_delegation(inode, delegation);
383 iput(inode);
384 goto restart;
385 } 417 }
386 rcu_read_unlock(); 418 rcu_read_unlock();
387} 419}
388 420
389struct recall_threadargs { 421void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
390 struct inode *inode;
391 struct nfs_client *clp;
392 const nfs4_stateid *stateid;
393
394 struct completion started;
395 int result;
396};
397
398static int recall_thread(void *data)
399{ 422{
400 struct recall_threadargs *args = (struct recall_threadargs *)data; 423 nfs_client_mark_return_unreferenced_delegations(clp);
401 struct inode *inode = igrab(args->inode); 424 nfs_delegation_run_state_manager(clp);
402 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
403 struct nfs_inode *nfsi = NFS_I(inode);
404 struct nfs_delegation *delegation;
405
406 daemonize("nfsv4-delegreturn");
407
408 nfs_msync_inode(inode);
409 down_read(&clp->cl_sem);
410 down_write(&nfsi->rwsem);
411 spin_lock(&clp->cl_lock);
412 delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
413 if (delegation != NULL)
414 args->result = 0;
415 else
416 args->result = -ENOENT;
417 spin_unlock(&clp->cl_lock);
418 complete(&args->started);
419 nfs_delegation_claim_opens(inode, args->stateid);
420 up_write(&nfsi->rwsem);
421 up_read(&clp->cl_sem);
422 nfs_msync_inode(inode);
423
424 if (delegation != NULL)
425 nfs_do_return_delegation(inode, delegation, 1);
426 iput(inode);
427 module_put_and_exit(0);
428} 425}
429 426
430/* 427/*
@@ -432,22 +429,20 @@ static int recall_thread(void *data)
432 */ 429 */
433int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) 430int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
434{ 431{
435 struct recall_threadargs data = { 432 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
436 .inode = inode, 433 struct nfs_delegation *delegation;
437 .stateid = stateid,
438 };
439 int status;
440 434
441 init_completion(&data.started); 435 rcu_read_lock();
442 __module_get(THIS_MODULE); 436 delegation = rcu_dereference(NFS_I(inode)->delegation);
443 status = kernel_thread(recall_thread, &data, CLONE_KERNEL); 437 if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
444 if (status < 0) 438 sizeof(delegation->stateid.data)) != 0) {
445 goto out_module_put; 439 rcu_read_unlock();
446 wait_for_completion(&data.started); 440 return -ENOENT;
447 return data.result; 441 }
448out_module_put: 442 nfs_mark_return_delegation(clp, delegation);
449 module_put(THIS_MODULE); 443 rcu_read_unlock();
450 return status; 444 nfs_delegation_run_state_manager(clp);
445 return 0;
451} 446}
452 447
453/* 448/*
@@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
459 struct inode *res = NULL; 454 struct inode *res = NULL;
460 rcu_read_lock(); 455 rcu_read_lock();
461 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 456 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
462 if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { 457 spin_lock(&delegation->lock);
458 if (delegation->inode != NULL &&
459 nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
463 res = igrab(delegation->inode); 460 res = igrab(delegation->inode);
464 break;
465 } 461 }
462 spin_unlock(&delegation->lock);
463 if (res != NULL)
464 break;
466 } 465 }
467 rcu_read_unlock(); 466 rcu_read_unlock();
468 return res; 467 return res;
@@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
476 struct nfs_delegation *delegation; 475 struct nfs_delegation *delegation;
477 rcu_read_lock(); 476 rcu_read_lock();
478 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) 477 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
479 delegation->flags |= NFS_DELEGATION_NEED_RECLAIM; 478 set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
480 rcu_read_unlock(); 479 rcu_read_unlock();
481} 480}
482 481
@@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
486void nfs_delegation_reap_unclaimed(struct nfs_client *clp) 485void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
487{ 486{
488 struct nfs_delegation *delegation; 487 struct nfs_delegation *delegation;
488 struct inode *inode;
489restart: 489restart:
490 rcu_read_lock(); 490 rcu_read_lock();
491 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) { 491 list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
492 if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) 492 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
493 continue;
494 inode = nfs_delegation_grab_inode(delegation);
495 if (inode == NULL)
493 continue; 496 continue;
494 spin_lock(&clp->cl_lock); 497 spin_lock(&clp->cl_lock);
495 delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL); 498 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
496 spin_unlock(&clp->cl_lock); 499 spin_unlock(&clp->cl_lock);
497 rcu_read_unlock(); 500 rcu_read_unlock();
498 if (delegation != NULL) 501 if (delegation != NULL)
499 nfs_free_delegation(delegation); 502 nfs_free_delegation(delegation);
503 iput(inode);
500 goto restart; 504 goto restart;
501 } 505 }
502 rcu_read_unlock(); 506 rcu_read_unlock();
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index f1c5e2a5d88..09f38379517 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,14 +17,20 @@ struct nfs_delegation {
17 struct rpc_cred *cred; 17 struct rpc_cred *cred;
18 struct inode *inode; 18 struct inode *inode;
19 nfs4_stateid stateid; 19 nfs4_stateid stateid;
20 int type; 20 fmode_t type;
21#define NFS_DELEGATION_NEED_RECLAIM 1
22 long flags;
23 loff_t maxsize; 21 loff_t maxsize;
24 __u64 change_attr; 22 __u64 change_attr;
23 unsigned long flags;
24 spinlock_t lock;
25 struct rcu_head rcu; 25 struct rcu_head rcu;
26}; 26};
27 27
28enum {
29 NFS_DELEGATION_NEED_RECLAIM = 0,
30 NFS_DELEGATION_RETURN,
31 NFS_DELEGATION_REFERENCED,
32};
33
28int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 34int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
29void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); 35void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
30int nfs_inode_return_delegation(struct inode *inode); 36int nfs_inode_return_delegation(struct inode *inode);
@@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
32void nfs_inode_return_delegation_noreclaim(struct inode *inode); 38void nfs_inode_return_delegation_noreclaim(struct inode *inode);
33 39
34struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); 40struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
35void nfs_return_all_delegations(struct super_block *sb); 41void nfs_super_return_all_delegations(struct super_block *sb);
36void nfs_expire_all_delegations(struct nfs_client *clp); 42void nfs_expire_all_delegations(struct nfs_client *clp);
43void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
37void nfs_handle_cb_pathdown(struct nfs_client *clp); 44void nfs_handle_cb_pathdown(struct nfs_client *clp);
45void nfs_client_return_marked_delegations(struct nfs_client *clp);
38 46
39void nfs_delegation_mark_reclaim(struct nfs_client *clp); 47void nfs_delegation_mark_reclaim(struct nfs_client *clp);
40void nfs_delegation_reap_unclaimed(struct nfs_client *clp); 48void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
45int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl); 53int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
46int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode); 54int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
47 55
48static inline int nfs_have_delegation(struct inode *inode, int flags) 56void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
49{ 57int nfs_have_delegation(struct inode *inode, fmode_t flags);
50 struct nfs_delegation *delegation;
51 int ret = 0;
52
53 flags &= FMODE_READ|FMODE_WRITE;
54 rcu_read_lock();
55 delegation = rcu_dereference(NFS_I(inode)->delegation);
56 if (delegation != NULL && (delegation->type & flags) == flags)
57 ret = 1;
58 rcu_read_unlock();
59 return ret;
60}
61 58
62#else 59#else
63static inline int nfs_have_delegation(struct inode *inode, int flags) 60static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
64{ 61{
65 return 0; 62 return 0;
66} 63}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3e64b98f3a9..e35c8199f82 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
799 goto out_bad; 799 goto out_bad;
800 } 800 }
801 801
802 if (nfs_have_delegation(inode, FMODE_READ))
803 goto out_set_verifier;
804
802 /* Force a full look up iff the parent directory has changed */ 805 /* Force a full look up iff the parent directory has changed */
803 if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { 806 if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
804 if (nfs_lookup_verify_inode(inode, nd)) 807 if (nfs_lookup_verify_inode(inode, nd))
@@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
817 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 820 if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
818 goto out_bad; 821 goto out_bad;
819 822
823out_set_verifier:
820 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 824 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
821 out_valid: 825 out_valid:
822 dput(parent); 826 dput(parent);
@@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = {
973 * Use intent information to determine whether we need to substitute 977 * Use intent information to determine whether we need to substitute
974 * the NFSv4-style stateful OPEN for the LOOKUP call 978 * the NFSv4-style stateful OPEN for the LOOKUP call
975 */ 979 */
976static int is_atomic_open(struct inode *dir, struct nameidata *nd) 980static int is_atomic_open(struct nameidata *nd)
977{ 981{
978 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0) 982 if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
979 return 0; 983 return 0;
@@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
996 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1000 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
997 1001
998 /* Check that we are indeed trying to open this file */ 1002 /* Check that we are indeed trying to open this file */
999 if (!is_atomic_open(dir, nd)) 1003 if (!is_atomic_open(nd))
1000 goto no_open; 1004 goto no_open;
1001 1005
1002 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) { 1006 if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
@@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1047 struct inode *dir; 1051 struct inode *dir;
1048 int openflags, ret = 0; 1052 int openflags, ret = 0;
1049 1053
1054 if (!is_atomic_open(nd))
1055 goto no_open;
1050 parent = dget_parent(dentry); 1056 parent = dget_parent(dentry);
1051 dir = parent->d_inode; 1057 dir = parent->d_inode;
1052 if (!is_atomic_open(dir, nd))
1053 goto no_open;
1054 /* We can't create new files in nfs_open_revalidate(), so we 1058 /* We can't create new files in nfs_open_revalidate(), so we
1055 * optimize away revalidation of negative dentries. 1059 * optimize away revalidation of negative dentries.
1056 */ 1060 */
@@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1062 1066
1063 /* NFS only supports OPEN on regular files */ 1067 /* NFS only supports OPEN on regular files */
1064 if (!S_ISREG(inode->i_mode)) 1068 if (!S_ISREG(inode->i_mode))
1065 goto no_open; 1069 goto no_open_dput;
1066 openflags = nd->intent.open.flags; 1070 openflags = nd->intent.open.flags;
1067 /* We cannot do exclusive creation on a positive dentry */ 1071 /* We cannot do exclusive creation on a positive dentry */
1068 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) 1072 if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1069 goto no_open; 1073 goto no_open_dput;
1070 /* We can't create new files, or truncate existing ones here */ 1074 /* We can't create new files, or truncate existing ones here */
1071 openflags &= ~(O_CREAT|O_TRUNC); 1075 openflags &= ~(O_CREAT|O_TRUNC);
1072 1076
@@ -1081,10 +1085,9 @@ out:
1081 if (!ret) 1085 if (!ret)
1082 d_drop(dentry); 1086 d_drop(dentry);
1083 return ret; 1087 return ret;
1084no_open: 1088no_open_dput:
1085 dput(parent); 1089 dput(parent);
1086 if (inode != NULL && nfs_have_delegation(inode, FMODE_READ)) 1090no_open:
1087 return 1;
1088 return nfs_lookup_revalidate(dentry, nd); 1091 return nfs_lookup_revalidate(dentry, nd);
1089} 1092}
1090#endif /* CONFIG_NFSV4 */ 1093#endif /* CONFIG_NFSV4 */
@@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1794 cache = nfs_access_search_rbtree(inode, cred); 1797 cache = nfs_access_search_rbtree(inode, cred);
1795 if (cache == NULL) 1798 if (cache == NULL)
1796 goto out; 1799 goto out;
1797 if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1800 if (!nfs_have_delegation(inode, FMODE_READ) &&
1801 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1798 goto out_stale; 1802 goto out_stale;
1799 res->jiffies = cache->jiffies; 1803 res->jiffies = cache->jiffies;
1800 res->cred = cache->cred; 1804 res->cred = cache->cred;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f0..90f292b520d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
354 file->f_path.dentry->d_name.name, 354 file->f_path.dentry->d_name.name,
355 mapping->host->i_ino, len, (long long) pos); 355 mapping->host->i_ino, len, (long long) pos);
356 356
357 page = __grab_cache_page(mapping, index); 357 page = grab_cache_page_write_begin(mapping, index, flags);
358 if (!page) 358 if (!page)
359 return -ENOMEM; 359 return -ENOMEM;
360 *pagep = page; 360 *pagep = page;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1c..0c381686171 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
592/* 592/*
593 * Given an inode, search for an open context with the desired characteristics 593 * Given an inode, search for an open context with the desired characteristics
594 */ 594 */
595struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode) 595struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
596{ 596{
597 struct nfs_inode *nfsi = NFS_I(inode); 597 struct nfs_inode *nfsi = NFS_I(inode);
598 struct nfs_open_context *pos, *ctx = NULL; 598 struct nfs_open_context *pos, *ctx = NULL;
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
712 712
713 if (nfs_have_delegation(inode, FMODE_READ)) 713 if (nfs_have_delegation(inode, FMODE_READ))
714 return 0; 714 return 0;
715 /* 715 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
716 * Special case: if the attribute timeout is set to 0, then always
717 * treat the cache as having expired (unless holding
718 * a delegation).
719 */
720 if (nfsi->attrtimeo == 0)
721 return 1;
722 return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
723} 716}
724 717
725/** 718/**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1182 nfsi->attrtimeo_timestamp = now; 1175 nfsi->attrtimeo_timestamp = now;
1183 nfsi->attr_gencount = nfs_inc_attr_generation_counter(); 1176 nfsi->attr_gencount = nfs_inc_attr_generation_counter();
1184 } else { 1177 } else {
1185 if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { 1178 if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
1186 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1179 if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
1187 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1180 nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
1188 nfsi->attrtimeo_timestamp = now; 1181 nfsi->attrtimeo_timestamp = now;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf..340ede8f608 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,20 @@ struct nfs_parsed_mount_data {
63 struct security_mnt_opts lsm_opts; 63 struct security_mnt_opts lsm_opts;
64}; 64};
65 65
66/* mount_clnt.c */
67struct nfs_mount_request {
68 struct sockaddr *sap;
69 size_t salen;
70 char *hostname;
71 char *dirpath;
72 u32 version;
73 unsigned short protocol;
74 struct nfs_fh *fh;
75 int noresvport;
76};
77
78extern int nfs_mount(struct nfs_mount_request *info);
79
66/* client.c */ 80/* client.c */
67extern struct rpc_program nfs_program; 81extern struct rpc_program nfs_program;
68 82
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 086a6830d78..ca905a5bb1b 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,47 +29,43 @@ struct mnt_fhstatus {
29 29
30/** 30/**
31 * nfs_mount - Obtain an NFS file handle for the given host and path 31 * nfs_mount - Obtain an NFS file handle for the given host and path
32 * @addr: pointer to server's address 32 * @info: pointer to mount request arguments
33 * @len: size of server's address
34 * @hostname: name of server host, or NULL
35 * @path: pointer to string containing export path to mount
36 * @version: mount version to use for this request
37 * @protocol: transport protocol to use for thie request
38 * @fh: pointer to location to place returned file handle
39 * 33 *
40 * Uses default timeout parameters specified by underlying transport. 34 * Uses default timeout parameters specified by underlying transport.
41 */ 35 */
42int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path, 36int nfs_mount(struct nfs_mount_request *info)
43 int version, int protocol, struct nfs_fh *fh)
44{ 37{
45 struct mnt_fhstatus result = { 38 struct mnt_fhstatus result = {
46 .fh = fh 39 .fh = info->fh
47 }; 40 };
48 struct rpc_message msg = { 41 struct rpc_message msg = {
49 .rpc_argp = path, 42 .rpc_argp = info->dirpath,
50 .rpc_resp = &result, 43 .rpc_resp = &result,
51 }; 44 };
52 struct rpc_create_args args = { 45 struct rpc_create_args args = {
53 .protocol = protocol, 46 .protocol = info->protocol,
54 .address = addr, 47 .address = info->sap,
55 .addrsize = len, 48 .addrsize = info->salen,
56 .servername = hostname, 49 .servername = info->hostname,
57 .program = &mnt_program, 50 .program = &mnt_program,
58 .version = version, 51 .version = info->version,
59 .authflavor = RPC_AUTH_UNIX, 52 .authflavor = RPC_AUTH_UNIX,
60 .flags = 0,
61 }; 53 };
62 struct rpc_clnt *mnt_clnt; 54 struct rpc_clnt *mnt_clnt;
63 int status; 55 int status;
64 56
65 dprintk("NFS: sending MNT request for %s:%s\n", 57 dprintk("NFS: sending MNT request for %s:%s\n",
66 (hostname ? hostname : "server"), path); 58 (info->hostname ? info->hostname : "server"),
59 info->dirpath);
60
61 if (info->noresvport)
62 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
67 63
68 mnt_clnt = rpc_create(&args); 64 mnt_clnt = rpc_create(&args);
69 if (IS_ERR(mnt_clnt)) 65 if (IS_ERR(mnt_clnt))
70 goto out_clnt_err; 66 goto out_clnt_err;
71 67
72 if (version == NFS_MNT3_VERSION) 68 if (info->version == NFS_MNT3_VERSION)
73 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT]; 69 msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
74 else 70 else
75 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT]; 71 msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea790645fda..4e4d3320437 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,8 +38,12 @@ struct idmap;
38 ((err) != NFSERR_NOFILEHANDLE)) 38 ((err) != NFSERR_NOFILEHANDLE))
39 39
40enum nfs4_client_state { 40enum nfs4_client_state {
41 NFS4CLNT_STATE_RECOVER = 0, 41 NFS4CLNT_MANAGER_RUNNING = 0,
42 NFS4CLNT_CHECK_LEASE,
42 NFS4CLNT_LEASE_EXPIRED, 43 NFS4CLNT_LEASE_EXPIRED,
44 NFS4CLNT_RECLAIM_REBOOT,
45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN,
43}; 47};
44 48
45/* 49/*
@@ -90,12 +94,18 @@ struct nfs4_state_owner {
90 94
91 spinlock_t so_lock; 95 spinlock_t so_lock;
92 atomic_t so_count; 96 atomic_t so_count;
97 unsigned long so_flags;
93 struct list_head so_states; 98 struct list_head so_states;
94 struct list_head so_delegations; 99 struct list_head so_delegations;
95 struct nfs_seqid_counter so_seqid; 100 struct nfs_seqid_counter so_seqid;
96 struct rpc_sequence so_sequence; 101 struct rpc_sequence so_sequence;
97}; 102};
98 103
104enum {
105 NFS_OWNER_RECLAIM_REBOOT,
106 NFS_OWNER_RECLAIM_NOGRACE
107};
108
99/* 109/*
100 * struct nfs4_state maintains the client-side state for a given 110 * struct nfs4_state maintains the client-side state for a given
101 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). 111 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -128,6 +138,8 @@ enum {
128 NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */ 138 NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
129 NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */ 139 NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
130 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ 140 NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
141 NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
142 NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
131}; 143};
132 144
133struct nfs4_state { 145struct nfs4_state {
@@ -149,7 +161,7 @@ struct nfs4_state {
149 unsigned int n_rdonly; /* Number of read-only references */ 161 unsigned int n_rdonly; /* Number of read-only references */
150 unsigned int n_wronly; /* Number of write-only references */ 162 unsigned int n_wronly; /* Number of write-only references */
151 unsigned int n_rdwr; /* Number of read/write references */ 163 unsigned int n_rdwr; /* Number of read/write references */
152 int state; /* State on the server (R,W, or RW) */ 164 fmode_t state; /* State on the server (R,W, or RW) */
153 atomic_t count; 165 atomic_t count;
154}; 166};
155 167
@@ -157,9 +169,12 @@ struct nfs4_state {
157struct nfs4_exception { 169struct nfs4_exception {
158 long timeout; 170 long timeout;
159 int retry; 171 int retry;
172 struct nfs4_state *state;
160}; 173};
161 174
162struct nfs4_state_recovery_ops { 175struct nfs4_state_recovery_ops {
176 int owner_flag_bit;
177 int state_flag_bit;
163 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); 178 int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
164 int (*recover_lock)(struct nfs4_state *, struct file_lock *); 179 int (*recover_lock)(struct nfs4_state *, struct file_lock *);
165}; 180};
@@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
174 189
175 190
176/* nfs4proc.c */ 191/* nfs4proc.c */
177extern int nfs4_map_errors(int err);
178extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); 192extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
179extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); 193extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
180extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); 194extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
@@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
187 struct nfs4_fs_locations *fs_locations, struct page *page); 201 struct nfs4_fs_locations *fs_locations, struct page *page);
188 202
189extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; 203extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
190extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; 204extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
191 205
192extern const u32 nfs4_fattr_bitmap[2]; 206extern const u32 nfs4_fattr_bitmap[2];
193extern const u32 nfs4_statfs_bitmap[2]; 207extern const u32 nfs4_statfs_bitmap[2];
@@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *);
202extern void nfs4_renew_state(struct work_struct *); 216extern void nfs4_renew_state(struct work_struct *);
203 217
204/* nfs4state.c */ 218/* nfs4state.c */
205struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); 219struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
206 220
207extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 221extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
208extern void nfs4_put_state_owner(struct nfs4_state_owner *); 222extern void nfs4_put_state_owner(struct nfs4_state_owner *);
209extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 223extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
210extern void nfs4_put_open_state(struct nfs4_state *); 224extern void nfs4_put_open_state(struct nfs4_state *);
211extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t); 225extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
212extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t); 226extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
213extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); 227extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
214extern void nfs4_schedule_state_recovery(struct nfs_client *); 228extern void nfs4_schedule_state_recovery(struct nfs_client *);
229extern void nfs4_schedule_state_manager(struct nfs_client *);
230extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
215extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 231extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
216extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 232extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
217extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 233extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83e700a2b0c..8dde84b988d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,14 +62,12 @@
62struct nfs4_opendata; 62struct nfs4_opendata;
63static int _nfs4_proc_open(struct nfs4_opendata *data); 63static int _nfs4_proc_open(struct nfs4_opendata *data);
64static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 64static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
65static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 65static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
66static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
67static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
68static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 66static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
69static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 67static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
70 68
71/* Prevent leaks of NFSv4 errors into userland */ 69/* Prevent leaks of NFSv4 errors into userland */
72int nfs4_map_errors(int err) 70static int nfs4_map_errors(int err)
73{ 71{
74 if (err < -1000) { 72 if (err < -1000) {
75 dprintk("%s could not handle NFSv4 error %d\n", 73 dprintk("%s could not handle NFSv4 error %d\n",
@@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
195 kunmap_atomic(start, KM_USER0); 193 kunmap_atomic(start, KM_USER0);
196} 194}
197 195
196static int nfs4_wait_bit_killable(void *word)
197{
198 if (fatal_signal_pending(current))
199 return -ERESTARTSYS;
200 schedule();
201 return 0;
202}
203
204static int nfs4_wait_clnt_recover(struct nfs_client *clp)
205{
206 int res;
207
208 might_sleep();
209
210 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
211 nfs4_wait_bit_killable, TASK_KILLABLE);
212 return res;
213}
214
215static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
216{
217 int res = 0;
218
219 might_sleep();
220
221 if (*timeout <= 0)
222 *timeout = NFS4_POLL_RETRY_MIN;
223 if (*timeout > NFS4_POLL_RETRY_MAX)
224 *timeout = NFS4_POLL_RETRY_MAX;
225 schedule_timeout_killable(*timeout);
226 if (fatal_signal_pending(current))
227 res = -ERESTARTSYS;
228 *timeout <<= 1;
229 return res;
230}
231
232/* This is the error handling routine for processes that are allowed
233 * to sleep.
234 */
235static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
236{
237 struct nfs_client *clp = server->nfs_client;
238 struct nfs4_state *state = exception->state;
239 int ret = errorcode;
240
241 exception->retry = 0;
242 switch(errorcode) {
243 case 0:
244 return 0;
245 case -NFS4ERR_ADMIN_REVOKED:
246 case -NFS4ERR_BAD_STATEID:
247 case -NFS4ERR_OPENMODE:
248 if (state == NULL)
249 break;
250 nfs4_state_mark_reclaim_nograce(clp, state);
251 case -NFS4ERR_STALE_CLIENTID:
252 case -NFS4ERR_STALE_STATEID:
253 case -NFS4ERR_EXPIRED:
254 nfs4_schedule_state_recovery(clp);
255 ret = nfs4_wait_clnt_recover(clp);
256 if (ret == 0)
257 exception->retry = 1;
258 break;
259 case -NFS4ERR_FILE_OPEN:
260 case -NFS4ERR_GRACE:
261 case -NFS4ERR_DELAY:
262 ret = nfs4_delay(server->client, &exception->timeout);
263 if (ret != 0)
264 break;
265 case -NFS4ERR_OLD_STATEID:
266 exception->retry = 1;
267 }
268 /* We failed to handle the error */
269 return nfs4_map_errors(ret);
270}
271
272
198static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 273static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
199{ 274{
200 struct nfs_client *clp = server->nfs_client; 275 struct nfs_client *clp = server->nfs_client;
@@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
248} 323}
249 324
250static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path, 325static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
251 struct nfs4_state_owner *sp, int flags, 326 struct nfs4_state_owner *sp, fmode_t fmode, int flags,
252 const struct iattr *attrs) 327 const struct iattr *attrs)
253{ 328{
254 struct dentry *parent = dget_parent(path->dentry); 329 struct dentry *parent = dget_parent(path->dentry);
@@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
268 p->owner = sp; 343 p->owner = sp;
269 atomic_inc(&sp->so_count); 344 atomic_inc(&sp->so_count);
270 p->o_arg.fh = NFS_FH(dir); 345 p->o_arg.fh = NFS_FH(dir);
271 p->o_arg.open_flags = flags, 346 p->o_arg.open_flags = flags;
347 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
272 p->o_arg.clientid = server->nfs_client->cl_clientid; 348 p->o_arg.clientid = server->nfs_client->cl_clientid;
273 p->o_arg.id = sp->so_owner_id.id; 349 p->o_arg.id = sp->so_owner_id.id;
274 p->o_arg.name = &p->path.dentry->d_name; 350 p->o_arg.name = &p->path.dentry->d_name;
@@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
324 return ret; 400 return ret;
325} 401}
326 402
327static int can_open_cached(struct nfs4_state *state, int mode) 403static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
328{ 404{
329 int ret = 0; 405 int ret = 0;
330 switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) { 406
407 if (open_mode & O_EXCL)
408 goto out;
409 switch (mode & (FMODE_READ|FMODE_WRITE)) {
331 case FMODE_READ: 410 case FMODE_READ:
332 ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0; 411 ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
333 break; 412 break;
@@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode)
337 case FMODE_READ|FMODE_WRITE: 416 case FMODE_READ|FMODE_WRITE:
338 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0; 417 ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
339 } 418 }
419out:
340 return ret; 420 return ret;
341} 421}
342 422
343static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags) 423static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
344{ 424{
345 if ((delegation->type & open_flags) != open_flags) 425 if ((delegation->type & fmode) != fmode)
346 return 0; 426 return 0;
347 if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) 427 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
348 return 0; 428 return 0;
429 nfs_mark_delegation_referenced(delegation);
349 return 1; 430 return 1;
350} 431}
351 432
352static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) 433static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
353{ 434{
354 switch (open_flags) { 435 switch (fmode) {
355 case FMODE_WRITE: 436 case FMODE_WRITE:
356 state->n_wronly++; 437 state->n_wronly++;
357 break; 438 break;
@@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
361 case FMODE_READ|FMODE_WRITE: 442 case FMODE_READ|FMODE_WRITE:
362 state->n_rdwr++; 443 state->n_rdwr++;
363 } 444 }
364 nfs4_state_set_mode_locked(state, state->state | open_flags); 445 nfs4_state_set_mode_locked(state, state->state | fmode);
365} 446}
366 447
367static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 448static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
368{ 449{
369 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 450 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
370 memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); 451 memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
371 memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); 452 memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
372 switch (open_flags) { 453 switch (fmode) {
373 case FMODE_READ: 454 case FMODE_READ:
374 set_bit(NFS_O_RDONLY_STATE, &state->flags); 455 set_bit(NFS_O_RDONLY_STATE, &state->flags);
375 break; 456 break;
@@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
381 } 462 }
382} 463}
383 464
384static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) 465static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
385{ 466{
386 write_seqlock(&state->seqlock); 467 write_seqlock(&state->seqlock);
387 nfs_set_open_stateid_locked(state, stateid, open_flags); 468 nfs_set_open_stateid_locked(state, stateid, fmode);
388 write_sequnlock(&state->seqlock); 469 write_sequnlock(&state->seqlock);
389} 470}
390 471
391static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags) 472static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
392{ 473{
393 open_flags &= (FMODE_READ|FMODE_WRITE);
394 /* 474 /*
395 * Protect the call to nfs4_state_set_mode_locked and 475 * Protect the call to nfs4_state_set_mode_locked and
396 * serialise the stateid update 476 * serialise the stateid update
@@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta
401 set_bit(NFS_DELEGATED_STATE, &state->flags); 481 set_bit(NFS_DELEGATED_STATE, &state->flags);
402 } 482 }
403 if (open_stateid != NULL) 483 if (open_stateid != NULL)
404 nfs_set_open_stateid_locked(state, open_stateid, open_flags); 484 nfs_set_open_stateid_locked(state, open_stateid, fmode);
405 write_sequnlock(&state->seqlock); 485 write_sequnlock(&state->seqlock);
406 spin_lock(&state->owner->so_lock); 486 spin_lock(&state->owner->so_lock);
407 update_open_stateflags(state, open_flags); 487 update_open_stateflags(state, fmode);
408 spin_unlock(&state->owner->so_lock); 488 spin_unlock(&state->owner->so_lock);
409} 489}
410 490
411static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags) 491static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
492{
493 struct nfs_inode *nfsi = NFS_I(state->inode);
494 struct nfs_delegation *deleg_cur;
495 int ret = 0;
496
497 fmode &= (FMODE_READ|FMODE_WRITE);
498
499 rcu_read_lock();
500 deleg_cur = rcu_dereference(nfsi->delegation);
501 if (deleg_cur == NULL)
502 goto no_delegation;
503
504 spin_lock(&deleg_cur->lock);
505 if (nfsi->delegation != deleg_cur ||
506 (deleg_cur->type & fmode) != fmode)
507 goto no_delegation_unlock;
508
509 if (delegation == NULL)
510 delegation = &deleg_cur->stateid;
511 else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
512 goto no_delegation_unlock;
513
514 nfs_mark_delegation_referenced(deleg_cur);
515 __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
516 ret = 1;
517no_delegation_unlock:
518 spin_unlock(&deleg_cur->lock);
519no_delegation:
520 rcu_read_unlock();
521
522 if (!ret && open_stateid != NULL) {
523 __update_open_stateid(state, open_stateid, NULL, fmode);
524 ret = 1;
525 }
526
527 return ret;
528}
529
530
531static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
412{ 532{
413 struct nfs_delegation *delegation; 533 struct nfs_delegation *delegation;
414 534
415 rcu_read_lock(); 535 rcu_read_lock();
416 delegation = rcu_dereference(NFS_I(inode)->delegation); 536 delegation = rcu_dereference(NFS_I(inode)->delegation);
417 if (delegation == NULL || (delegation->type & open_flags) == open_flags) { 537 if (delegation == NULL || (delegation->type & fmode) == fmode) {
418 rcu_read_unlock(); 538 rcu_read_unlock();
419 return; 539 return;
420 } 540 }
@@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
427 struct nfs4_state *state = opendata->state; 547 struct nfs4_state *state = opendata->state;
428 struct nfs_inode *nfsi = NFS_I(state->inode); 548 struct nfs_inode *nfsi = NFS_I(state->inode);
429 struct nfs_delegation *delegation; 549 struct nfs_delegation *delegation;
430 int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL); 550 int open_mode = opendata->o_arg.open_flags & O_EXCL;
551 fmode_t fmode = opendata->o_arg.fmode;
431 nfs4_stateid stateid; 552 nfs4_stateid stateid;
432 int ret = -EAGAIN; 553 int ret = -EAGAIN;
433 554
434 rcu_read_lock();
435 delegation = rcu_dereference(nfsi->delegation);
436 for (;;) { 555 for (;;) {
437 if (can_open_cached(state, open_mode)) { 556 if (can_open_cached(state, fmode, open_mode)) {
438 spin_lock(&state->owner->so_lock); 557 spin_lock(&state->owner->so_lock);
439 if (can_open_cached(state, open_mode)) { 558 if (can_open_cached(state, fmode, open_mode)) {
440 update_open_stateflags(state, open_mode); 559 update_open_stateflags(state, fmode);
441 spin_unlock(&state->owner->so_lock); 560 spin_unlock(&state->owner->so_lock);
442 rcu_read_unlock();
443 goto out_return_state; 561 goto out_return_state;
444 } 562 }
445 spin_unlock(&state->owner->so_lock); 563 spin_unlock(&state->owner->so_lock);
446 } 564 }
447 if (delegation == NULL) 565 rcu_read_lock();
448 break; 566 delegation = rcu_dereference(nfsi->delegation);
449 if (!can_open_delegated(delegation, open_mode)) 567 if (delegation == NULL ||
568 !can_open_delegated(delegation, fmode)) {
569 rcu_read_unlock();
450 break; 570 break;
571 }
451 /* Save the delegation */ 572 /* Save the delegation */
452 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 573 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
453 rcu_read_unlock(); 574 rcu_read_unlock();
@@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
455 if (ret != 0) 576 if (ret != 0)
456 goto out; 577 goto out;
457 ret = -EAGAIN; 578 ret = -EAGAIN;
458 rcu_read_lock(); 579
459 delegation = rcu_dereference(nfsi->delegation); 580 /* Try to update the stateid using the delegation */
460 /* If no delegation, try a cached open */ 581 if (update_open_stateid(state, NULL, &stateid, fmode))
461 if (delegation == NULL) 582 goto out_return_state;
462 continue;
463 /* Is the delegation still valid? */
464 if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
465 continue;
466 rcu_read_unlock();
467 update_open_stateid(state, NULL, &stateid, open_mode);
468 goto out_return_state;
469 } 583 }
470 rcu_read_unlock();
471out: 584out:
472 return ERR_PTR(ret); 585 return ERR_PTR(ret);
473out_return_state: 586out_return_state:
@@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
480 struct inode *inode; 593 struct inode *inode;
481 struct nfs4_state *state = NULL; 594 struct nfs4_state *state = NULL;
482 struct nfs_delegation *delegation; 595 struct nfs_delegation *delegation;
483 nfs4_stateid *deleg_stateid = NULL;
484 int ret; 596 int ret;
485 597
486 if (!data->rpc_done) { 598 if (!data->rpc_done) {
@@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
507 if (delegation) 619 if (delegation)
508 delegation_flags = delegation->flags; 620 delegation_flags = delegation->flags;
509 rcu_read_unlock(); 621 rcu_read_unlock();
510 if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) 622 if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
511 nfs_inode_set_delegation(state->inode, 623 nfs_inode_set_delegation(state->inode,
512 data->owner->so_cred, 624 data->owner->so_cred,
513 &data->o_res); 625 &data->o_res);
@@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
516 data->owner->so_cred, 628 data->owner->so_cred,
517 &data->o_res); 629 &data->o_res);
518 } 630 }
519 rcu_read_lock(); 631
520 delegation = rcu_dereference(NFS_I(inode)->delegation); 632 update_open_stateid(state, &data->o_res.stateid, NULL,
521 if (delegation != NULL) 633 data->o_arg.fmode);
522 deleg_stateid = &delegation->stateid;
523 update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
524 rcu_read_unlock();
525 iput(inode); 634 iput(inode);
526out: 635out:
527 return state; 636 return state;
@@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
552{ 661{
553 struct nfs4_opendata *opendata; 662 struct nfs4_opendata *opendata;
554 663
555 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL); 664 opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
556 if (opendata == NULL) 665 if (opendata == NULL)
557 return ERR_PTR(-ENOMEM); 666 return ERR_PTR(-ENOMEM);
558 opendata->state = state; 667 opendata->state = state;
@@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
560 return opendata; 669 return opendata;
561} 670}
562 671
563static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res) 672static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
564{ 673{
565 struct nfs4_state *newstate; 674 struct nfs4_state *newstate;
566 int ret; 675 int ret;
567 676
568 opendata->o_arg.open_flags = openflags; 677 opendata->o_arg.open_flags = 0;
678 opendata->o_arg.fmode = fmode;
569 memset(&opendata->o_res, 0, sizeof(opendata->o_res)); 679 memset(&opendata->o_res, 0, sizeof(opendata->o_res));
570 memset(&opendata->c_res, 0, sizeof(opendata->c_res)); 680 memset(&opendata->c_res, 0, sizeof(opendata->c_res));
571 nfs4_init_opendata_res(opendata); 681 nfs4_init_opendata_res(opendata);
@@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
575 newstate = nfs4_opendata_to_nfs4_state(opendata); 685 newstate = nfs4_opendata_to_nfs4_state(opendata);
576 if (IS_ERR(newstate)) 686 if (IS_ERR(newstate))
577 return PTR_ERR(newstate); 687 return PTR_ERR(newstate);
578 nfs4_close_state(&opendata->path, newstate, openflags); 688 nfs4_close_state(&opendata->path, newstate, fmode);
579 *res = newstate; 689 *res = newstate;
580 return 0; 690 return 0;
581} 691}
@@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
631{ 741{
632 struct nfs_delegation *delegation; 742 struct nfs_delegation *delegation;
633 struct nfs4_opendata *opendata; 743 struct nfs4_opendata *opendata;
634 int delegation_type = 0; 744 fmode_t delegation_type = 0;
635 int status; 745 int status;
636 746
637 opendata = nfs4_open_recoverdata_alloc(ctx, state); 747 opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
641 opendata->o_arg.fh = NFS_FH(state->inode); 751 opendata->o_arg.fh = NFS_FH(state->inode);
642 rcu_read_lock(); 752 rcu_read_lock();
643 delegation = rcu_dereference(NFS_I(state->inode)->delegation); 753 delegation = rcu_dereference(NFS_I(state->inode)->delegation);
644 if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0) 754 if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
645 delegation_type = delegation->type; 755 delegation_type = delegation->type;
646 rcu_read_unlock(); 756 rcu_read_unlock();
647 opendata->o_arg.u.delegation_type = delegation_type; 757 opendata->o_arg.u.delegation_type = delegation_type;
@@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
744 goto out_free; 854 goto out_free;
745 state = nfs4_opendata_to_nfs4_state(data); 855 state = nfs4_opendata_to_nfs4_state(data);
746 if (!IS_ERR(state)) 856 if (!IS_ERR(state))
747 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 857 nfs4_close_state(&data->path, state, data->o_arg.fmode);
748out_free: 858out_free:
749 nfs4_opendata_put(data); 859 nfs4_opendata_put(data);
750} 860}
@@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
808 if (data->state != NULL) { 918 if (data->state != NULL) {
809 struct nfs_delegation *delegation; 919 struct nfs_delegation *delegation;
810 920
811 if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) 921 if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
812 goto out_no_action; 922 goto out_no_action;
813 rcu_read_lock(); 923 rcu_read_lock();
814 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); 924 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
815 if (delegation != NULL && 925 if (delegation != NULL &&
816 (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { 926 test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
817 rcu_read_unlock(); 927 rcu_read_unlock();
818 goto out_no_action; 928 goto out_no_action;
819 } 929 }
@@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata)
877 goto out_free; 987 goto out_free;
878 state = nfs4_opendata_to_nfs4_state(data); 988 state = nfs4_opendata_to_nfs4_state(data);
879 if (!IS_ERR(state)) 989 if (!IS_ERR(state))
880 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 990 nfs4_close_state(&data->path, state, data->o_arg.fmode);
881out_free: 991out_free:
882 nfs4_opendata_put(data); 992 nfs4_opendata_put(data);
883} 993}
@@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
955 int ret; 1065 int ret;
956 1066
957 for (;;) { 1067 for (;;) {
958 ret = nfs4_wait_clnt_recover(server->client, clp); 1068 ret = nfs4_wait_clnt_recover(clp);
959 if (ret != 0) 1069 if (ret != 0)
960 return ret; 1070 return ret;
961 if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1071 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1072 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
962 break; 1073 break;
963 nfs4_schedule_state_recovery(clp); 1074 nfs4_schedule_state_recovery(clp);
964 } 1075 }
@@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
993 1104
994 do { 1105 do {
995 err = _nfs4_open_expired(ctx, state); 1106 err = _nfs4_open_expired(ctx, state);
996 if (err == -NFS4ERR_DELAY) 1107 if (err != -NFS4ERR_DELAY)
997 nfs4_handle_exception(server, err, &exception); 1108 break;
1109 nfs4_handle_exception(server, err, &exception);
998 } while (exception.retry); 1110 } while (exception.retry);
999 return err; 1111 return err;
1000} 1112}
@@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
1031/* 1143/*
1032 * Returns a referenced nfs4_state 1144 * Returns a referenced nfs4_state
1033 */ 1145 */
1034static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) 1146static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
1035{ 1147{
1036 struct nfs4_state_owner *sp; 1148 struct nfs4_state_owner *sp;
1037 struct nfs4_state *state = NULL; 1149 struct nfs4_state *state = NULL;
1038 struct nfs_server *server = NFS_SERVER(dir); 1150 struct nfs_server *server = NFS_SERVER(dir);
1039 struct nfs_client *clp = server->nfs_client;
1040 struct nfs4_opendata *opendata; 1151 struct nfs4_opendata *opendata;
1041 int status; 1152 int status;
1042 1153
@@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
1050 if (status != 0) 1161 if (status != 0)
1051 goto err_put_state_owner; 1162 goto err_put_state_owner;
1052 if (path->dentry->d_inode != NULL) 1163 if (path->dentry->d_inode != NULL)
1053 nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE)); 1164 nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
1054 down_read(&clp->cl_sem);
1055 status = -ENOMEM; 1165 status = -ENOMEM;
1056 opendata = nfs4_opendata_alloc(path, sp, flags, sattr); 1166 opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
1057 if (opendata == NULL) 1167 if (opendata == NULL)
1058 goto err_release_rwsem; 1168 goto err_put_state_owner;
1059 1169
1060 if (path->dentry->d_inode != NULL) 1170 if (path->dentry->d_inode != NULL)
1061 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp); 1171 opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
1073 goto err_opendata_put; 1183 goto err_opendata_put;
1074 nfs4_opendata_put(opendata); 1184 nfs4_opendata_put(opendata);
1075 nfs4_put_state_owner(sp); 1185 nfs4_put_state_owner(sp);
1076 up_read(&clp->cl_sem);
1077 *res = state; 1186 *res = state;
1078 return 0; 1187 return 0;
1079err_opendata_put: 1188err_opendata_put:
1080 nfs4_opendata_put(opendata); 1189 nfs4_opendata_put(opendata);
1081err_release_rwsem:
1082 up_read(&clp->cl_sem);
1083err_put_state_owner: 1190err_put_state_owner:
1084 nfs4_put_state_owner(sp); 1191 nfs4_put_state_owner(sp);
1085out_err: 1192out_err:
@@ -1088,14 +1195,14 @@ out_err:
1088} 1195}
1089 1196
1090 1197
1091static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred) 1198static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
1092{ 1199{
1093 struct nfs4_exception exception = { }; 1200 struct nfs4_exception exception = { };
1094 struct nfs4_state *res; 1201 struct nfs4_state *res;
1095 int status; 1202 int status;
1096 1203
1097 do { 1204 do {
1098 status = _nfs4_do_open(dir, path, flags, sattr, cred, &res); 1205 status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
1099 if (status == 0) 1206 if (status == 0)
1100 break; 1207 break;
1101 /* NOTE: BAD_SEQID means the server and client disagree about the 1208 /* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1230 renew_lease(server, calldata->timestamp); 1337 renew_lease(server, calldata->timestamp);
1231 break; 1338 break;
1232 case -NFS4ERR_STALE_STATEID: 1339 case -NFS4ERR_STALE_STATEID:
1340 case -NFS4ERR_OLD_STATEID:
1341 case -NFS4ERR_BAD_STATEID:
1233 case -NFS4ERR_EXPIRED: 1342 case -NFS4ERR_EXPIRED:
1234 break; 1343 if (calldata->arg.fmode == 0)
1344 break;
1235 default: 1345 default:
1236 if (nfs4_async_handle_error(task, server) == -EAGAIN) { 1346 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
1237 rpc_restart_call(task); 1347 rpc_restart_call(task);
1238 return; 1348 return;
1239 } 1349 }
@@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1272 nfs_fattr_init(calldata->res.fattr); 1382 nfs_fattr_init(calldata->res.fattr);
1273 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) { 1383 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
1274 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1384 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1275 calldata->arg.open_flags = FMODE_READ; 1385 calldata->arg.fmode = FMODE_READ;
1276 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) { 1386 } else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
1277 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 1387 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
1278 calldata->arg.open_flags = FMODE_WRITE; 1388 calldata->arg.fmode = FMODE_WRITE;
1279 } 1389 }
1280 calldata->timestamp = jiffies; 1390 calldata->timestamp = jiffies;
1281 rpc_call_start(task); 1391 rpc_call_start(task);
@@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
1328 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid); 1438 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
1329 if (calldata->arg.seqid == NULL) 1439 if (calldata->arg.seqid == NULL)
1330 goto out_free_calldata; 1440 goto out_free_calldata;
1441 calldata->arg.fmode = 0;
1331 calldata->arg.bitmask = server->attr_bitmask; 1442 calldata->arg.bitmask = server->attr_bitmask;
1332 calldata->res.fattr = &calldata->fattr; 1443 calldata->res.fattr = &calldata->fattr;
1333 calldata->res.seqid = calldata->arg.seqid; 1444 calldata->res.seqid = calldata->arg.seqid;
@@ -1354,13 +1465,13 @@ out:
1354 return status; 1465 return status;
1355} 1466}
1356 1467
1357static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state) 1468static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
1358{ 1469{
1359 struct file *filp; 1470 struct file *filp;
1360 int ret; 1471 int ret;
1361 1472
1362 /* If the open_intent is for execute, we have an extra check to make */ 1473 /* If the open_intent is for execute, we have an extra check to make */
1363 if (nd->intent.open.flags & FMODE_EXEC) { 1474 if (fmode & FMODE_EXEC) {
1364 ret = nfs_may_open(state->inode, 1475 ret = nfs_may_open(state->inode,
1365 state->owner->so_cred, 1476 state->owner->so_cred,
1366 nd->intent.open.flags); 1477 nd->intent.open.flags);
@@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
1376 } 1487 }
1377 ret = PTR_ERR(filp); 1488 ret = PTR_ERR(filp);
1378out_close: 1489out_close:
1379 nfs4_close_sync(path, state, nd->intent.open.flags); 1490 nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
1380 return ret; 1491 return ret;
1381} 1492}
1382 1493
@@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1392 struct rpc_cred *cred; 1503 struct rpc_cred *cred;
1393 struct nfs4_state *state; 1504 struct nfs4_state *state;
1394 struct dentry *res; 1505 struct dentry *res;
1506 fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
1395 1507
1396 if (nd->flags & LOOKUP_CREATE) { 1508 if (nd->flags & LOOKUP_CREATE) {
1397 attr.ia_mode = nd->intent.open.create_mode; 1509 attr.ia_mode = nd->intent.open.create_mode;
@@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1409 parent = dentry->d_parent; 1521 parent = dentry->d_parent;
1410 /* Protect against concurrent sillydeletes */ 1522 /* Protect against concurrent sillydeletes */
1411 nfs_block_sillyrename(parent); 1523 nfs_block_sillyrename(parent);
1412 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); 1524 state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
1413 put_rpccred(cred); 1525 put_rpccred(cred);
1414 if (IS_ERR(state)) { 1526 if (IS_ERR(state)) {
1415 if (PTR_ERR(state) == -ENOENT) { 1527 if (PTR_ERR(state) == -ENOENT) {
@@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
1424 path.dentry = res; 1536 path.dentry = res;
1425 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); 1537 nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
1426 nfs_unblock_sillyrename(parent); 1538 nfs_unblock_sillyrename(parent);
1427 nfs4_intent_set_file(nd, &path, state); 1539 nfs4_intent_set_file(nd, &path, state, fmode);
1428 return res; 1540 return res;
1429} 1541}
1430 1542
@@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1437 }; 1549 };
1438 struct rpc_cred *cred; 1550 struct rpc_cred *cred;
1439 struct nfs4_state *state; 1551 struct nfs4_state *state;
1552 fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
1440 1553
1441 cred = rpc_lookup_cred(); 1554 cred = rpc_lookup_cred();
1442 if (IS_ERR(cred)) 1555 if (IS_ERR(cred))
1443 return PTR_ERR(cred); 1556 return PTR_ERR(cred);
1444 state = nfs4_do_open(dir, &path, openflags, NULL, cred); 1557 state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
1445 put_rpccred(cred); 1558 put_rpccred(cred);
1446 if (IS_ERR(state)) { 1559 if (IS_ERR(state)) {
1447 switch (PTR_ERR(state)) { 1560 switch (PTR_ERR(state)) {
@@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
1458 } 1571 }
1459 if (state->inode == dentry->d_inode) { 1572 if (state->inode == dentry->d_inode) {
1460 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1573 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
1461 nfs4_intent_set_file(nd, &path, state); 1574 nfs4_intent_set_file(nd, &path, state, fmode);
1462 return 1; 1575 return 1;
1463 } 1576 }
1464 nfs4_close_sync(&path, state, openflags); 1577 nfs4_close_sync(&path, state, fmode);
1465out_drop: 1578out_drop:
1466 d_drop(dentry); 1579 d_drop(dentry);
1467 return 0; 1580 return 0;
@@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1887 }; 2000 };
1888 struct nfs4_state *state; 2001 struct nfs4_state *state;
1889 struct rpc_cred *cred; 2002 struct rpc_cred *cred;
2003 fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
1890 int status = 0; 2004 int status = 0;
1891 2005
1892 cred = rpc_lookup_cred(); 2006 cred = rpc_lookup_cred();
@@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1894 status = PTR_ERR(cred); 2008 status = PTR_ERR(cred);
1895 goto out; 2009 goto out;
1896 } 2010 }
1897 state = nfs4_do_open(dir, &path, flags, sattr, cred); 2011 state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
1898 d_drop(dentry); 2012 d_drop(dentry);
1899 if (IS_ERR(state)) { 2013 if (IS_ERR(state)) {
1900 status = PTR_ERR(state); 2014 status = PTR_ERR(state);
@@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
1910 nfs_post_op_update_inode(state->inode, &fattr); 2024 nfs_post_op_update_inode(state->inode, &fattr);
1911 } 2025 }
1912 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0) 2026 if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
1913 status = nfs4_intent_set_file(nd, &path, state); 2027 status = nfs4_intent_set_file(nd, &path, state, fmode);
1914 else 2028 else
1915 nfs4_close_sync(&path, state, flags); 2029 nfs4_close_sync(&path, state, fmode);
1916out_putcred: 2030out_putcred:
1917 put_rpccred(cred); 2031 put_rpccred(cred);
1918out: 2032out:
@@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
1974{ 2088{
1975 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2089 struct nfs_removeres *res = task->tk_msg.rpc_resp;
1976 2090
1977 if (nfs4_async_handle_error(task, res->server) == -EAGAIN) 2091 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
1978 return 0; 2092 return 0;
1979 update_changeattr(dir, &res->cinfo); 2093 update_changeattr(dir, &res->cinfo);
1980 nfs_post_op_update_inode(dir, &res->dir_attr); 2094 nfs_post_op_update_inode(dir, &res->dir_attr);
@@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
2402{ 2516{
2403 struct nfs_server *server = NFS_SERVER(data->inode); 2517 struct nfs_server *server = NFS_SERVER(data->inode);
2404 2518
2405 if (nfs4_async_handle_error(task, server) == -EAGAIN) { 2519 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
2406 rpc_restart_call(task); 2520 rpc_restart_call(task);
2407 return -EAGAIN; 2521 return -EAGAIN;
2408 } 2522 }
@@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
2423{ 2537{
2424 struct inode *inode = data->inode; 2538 struct inode *inode = data->inode;
2425 2539
2426 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2540 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
2427 rpc_restart_call(task); 2541 rpc_restart_call(task);
2428 return -EAGAIN; 2542 return -EAGAIN;
2429 } 2543 }
@@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
2449{ 2563{
2450 struct inode *inode = data->inode; 2564 struct inode *inode = data->inode;
2451 2565
2452 if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { 2566 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
2453 rpc_restart_call(task); 2567 rpc_restart_call(task);
2454 return -EAGAIN; 2568 return -EAGAIN;
2455 } 2569 }
@@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
2742} 2856}
2743 2857
2744static int 2858static int
2745nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) 2859nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
2746{ 2860{
2747 struct nfs_client *clp = server->nfs_client; 2861 struct nfs_client *clp = server->nfs_client;
2748 2862
2749 if (!clp || task->tk_status >= 0) 2863 if (!clp || task->tk_status >= 0)
2750 return 0; 2864 return 0;
2751 switch(task->tk_status) { 2865 switch(task->tk_status) {
2866 case -NFS4ERR_ADMIN_REVOKED:
2867 case -NFS4ERR_BAD_STATEID:
2868 case -NFS4ERR_OPENMODE:
2869 if (state == NULL)
2870 break;
2871 nfs4_state_mark_reclaim_nograce(clp, state);
2752 case -NFS4ERR_STALE_CLIENTID: 2872 case -NFS4ERR_STALE_CLIENTID:
2753 case -NFS4ERR_STALE_STATEID: 2873 case -NFS4ERR_STALE_STATEID:
2754 case -NFS4ERR_EXPIRED: 2874 case -NFS4ERR_EXPIRED:
2755 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 2875 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
2756 nfs4_schedule_state_recovery(clp); 2876 nfs4_schedule_state_recovery(clp);
2757 if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) 2877 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
2758 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 2878 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
2759 task->tk_status = 0; 2879 task->tk_status = 0;
2760 return -EAGAIN; 2880 return -EAGAIN;
@@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
2772 return 0; 2892 return 0;
2773} 2893}
2774 2894
2775static int nfs4_wait_bit_killable(void *word)
2776{
2777 if (fatal_signal_pending(current))
2778 return -ERESTARTSYS;
2779 schedule();
2780 return 0;
2781}
2782
2783static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
2784{
2785 int res;
2786
2787 might_sleep();
2788
2789 rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
2790
2791 res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
2792 nfs4_wait_bit_killable, TASK_KILLABLE);
2793
2794 rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
2795 return res;
2796}
2797
2798static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
2799{
2800 int res = 0;
2801
2802 might_sleep();
2803
2804 if (*timeout <= 0)
2805 *timeout = NFS4_POLL_RETRY_MIN;
2806 if (*timeout > NFS4_POLL_RETRY_MAX)
2807 *timeout = NFS4_POLL_RETRY_MAX;
2808 schedule_timeout_killable(*timeout);
2809 if (fatal_signal_pending(current))
2810 res = -ERESTARTSYS;
2811 *timeout <<= 1;
2812 return res;
2813}
2814
2815/* This is the error handling routine for processes that are allowed
2816 * to sleep.
2817 */
2818static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
2819{
2820 struct nfs_client *clp = server->nfs_client;
2821 int ret = errorcode;
2822
2823 exception->retry = 0;
2824 switch(errorcode) {
2825 case 0:
2826 return 0;
2827 case -NFS4ERR_STALE_CLIENTID:
2828 case -NFS4ERR_STALE_STATEID:
2829 case -NFS4ERR_EXPIRED:
2830 nfs4_schedule_state_recovery(clp);
2831 ret = nfs4_wait_clnt_recover(server->client, clp);
2832 if (ret == 0)
2833 exception->retry = 1;
2834 break;
2835 case -NFS4ERR_FILE_OPEN:
2836 case -NFS4ERR_GRACE:
2837 case -NFS4ERR_DELAY:
2838 ret = nfs4_delay(server->client, &exception->timeout);
2839 if (ret != 0)
2840 break;
2841 case -NFS4ERR_OLD_STATEID:
2842 exception->retry = 1;
2843 }
2844 /* We failed to handle the error */
2845 return nfs4_map_errors(ret);
2846}
2847
2848int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) 2895int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
2849{ 2896{
2850 nfs4_verifier sc_verifier; 2897 nfs4_verifier sc_verifier;
@@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
2916 spin_lock(&clp->cl_lock); 2963 spin_lock(&clp->cl_lock);
2917 clp->cl_lease_time = fsinfo.lease_time * HZ; 2964 clp->cl_lease_time = fsinfo.lease_time * HZ;
2918 clp->cl_last_renewal = now; 2965 clp->cl_last_renewal = now;
2919 clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
2920 spin_unlock(&clp->cl_lock); 2966 spin_unlock(&clp->cl_lock);
2921 } 2967 }
2922 return status; 2968 return status;
@@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3074 struct nfs4_lock_state *lsp; 3120 struct nfs4_lock_state *lsp;
3075 int status; 3121 int status;
3076 3122
3077 down_read(&clp->cl_sem);
3078 arg.lock_owner.clientid = clp->cl_clientid; 3123 arg.lock_owner.clientid = clp->cl_clientid;
3079 status = nfs4_set_lock_state(state, request); 3124 status = nfs4_set_lock_state(state, request);
3080 if (status != 0) 3125 if (status != 0)
@@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3091 } 3136 }
3092 request->fl_ops->fl_release_private(request); 3137 request->fl_ops->fl_release_private(request);
3093out: 3138out:
3094 up_read(&clp->cl_sem);
3095 return status; 3139 return status;
3096} 3140}
3097 3141
@@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3181 sizeof(calldata->lsp->ls_stateid.data)); 3225 sizeof(calldata->lsp->ls_stateid.data));
3182 renew_lease(calldata->server, calldata->timestamp); 3226 renew_lease(calldata->server, calldata->timestamp);
3183 break; 3227 break;
3228 case -NFS4ERR_BAD_STATEID:
3229 case -NFS4ERR_OLD_STATEID:
3184 case -NFS4ERR_STALE_STATEID: 3230 case -NFS4ERR_STALE_STATEID:
3185 case -NFS4ERR_EXPIRED: 3231 case -NFS4ERR_EXPIRED:
3186 break; 3232 break;
3187 default: 3233 default:
3188 if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) 3234 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
3189 rpc_restart_call(task); 3235 rpc_restart_call(task);
3190 } 3236 }
3191} 3237}
@@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
3248 3294
3249static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) 3295static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
3250{ 3296{
3297 struct nfs_inode *nfsi = NFS_I(state->inode);
3251 struct nfs_seqid *seqid; 3298 struct nfs_seqid *seqid;
3252 struct nfs4_lock_state *lsp; 3299 struct nfs4_lock_state *lsp;
3253 struct rpc_task *task; 3300 struct rpc_task *task;
@@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
3257 status = nfs4_set_lock_state(state, request); 3304 status = nfs4_set_lock_state(state, request);
3258 /* Unlock _before_ we do the RPC call */ 3305 /* Unlock _before_ we do the RPC call */
3259 request->fl_flags |= FL_EXISTS; 3306 request->fl_flags |= FL_EXISTS;
3260 if (do_vfs_lock(request->fl_file, request) == -ENOENT) 3307 down_read(&nfsi->rwsem);
3308 if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
3309 up_read(&nfsi->rwsem);
3261 goto out; 3310 goto out;
3311 }
3312 up_read(&nfsi->rwsem);
3262 if (status != 0) 3313 if (status != 0)
3263 goto out; 3314 goto out;
3264 /* Is this a delegated lock? */ 3315 /* Is this a delegated lock? */
@@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
3484 3535
3485static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) 3536static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
3486{ 3537{
3487 struct nfs_client *clp = state->owner->so_client; 3538 struct nfs_inode *nfsi = NFS_I(state->inode);
3488 unsigned char fl_flags = request->fl_flags; 3539 unsigned char fl_flags = request->fl_flags;
3489 int status; 3540 int status;
3490 3541
@@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
3496 status = do_vfs_lock(request->fl_file, request); 3547 status = do_vfs_lock(request->fl_file, request);
3497 if (status < 0) 3548 if (status < 0)
3498 goto out; 3549 goto out;
3499 down_read(&clp->cl_sem); 3550 down_read(&nfsi->rwsem);
3500 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 3551 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
3501 struct nfs_inode *nfsi = NFS_I(state->inode);
3502 /* Yes: cache locks! */ 3552 /* Yes: cache locks! */
3503 down_read(&nfsi->rwsem);
3504 /* ...but avoid races with delegation recall... */ 3553 /* ...but avoid races with delegation recall... */
3505 if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { 3554 request->fl_flags = fl_flags & ~FL_SLEEP;
3506 request->fl_flags = fl_flags & ~FL_SLEEP; 3555 status = do_vfs_lock(request->fl_file, request);
3507 status = do_vfs_lock(request->fl_file, request); 3556 goto out_unlock;
3508 up_read(&nfsi->rwsem);
3509 goto out_unlock;
3510 }
3511 up_read(&nfsi->rwsem);
3512 } 3557 }
3513 status = _nfs4_do_setlk(state, cmd, request, 0); 3558 status = _nfs4_do_setlk(state, cmd, request, 0);
3514 if (status != 0) 3559 if (status != 0)
@@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
3518 if (do_vfs_lock(request->fl_file, request) < 0) 3563 if (do_vfs_lock(request->fl_file, request) < 0)
3519 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); 3564 printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
3520out_unlock: 3565out_unlock:
3521 up_read(&clp->cl_sem); 3566 up_read(&nfsi->rwsem);
3522out: 3567out:
3523 request->fl_flags = fl_flags; 3568 request->fl_flags = fl_flags;
3524 return status; 3569 return status;
@@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
3664} 3709}
3665 3710
3666struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { 3711struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
3712 .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
3713 .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
3667 .recover_open = nfs4_open_reclaim, 3714 .recover_open = nfs4_open_reclaim,
3668 .recover_lock = nfs4_lock_reclaim, 3715 .recover_lock = nfs4_lock_reclaim,
3669}; 3716};
3670 3717
3671struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { 3718struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
3719 .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
3720 .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
3672 .recover_open = nfs4_open_expired, 3721 .recover_open = nfs4_open_expired,
3673 .recover_lock = nfs4_lock_expired, 3722 .recover_lock = nfs4_lock_expired,
3674}; 3723};
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3305acbbe2a..f524e932ff7 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
65 long lease, timeout; 65 long lease, timeout;
66 unsigned long last, now; 66 unsigned long last, now;
67 67
68 down_read(&clp->cl_sem);
69 dprintk("%s: start\n", __func__); 68 dprintk("%s: start\n", __func__);
70 /* Are there any active superblocks? */ 69 /* Are there any active superblocks? */
71 if (list_empty(&clp->cl_superblocks)) 70 if (list_empty(&clp->cl_superblocks))
@@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work)
77 timeout = (2 * lease) / 3 + (long)last - (long)now; 76 timeout = (2 * lease) / 3 + (long)last - (long)now;
78 /* Are we close to a lease timeout? */ 77 /* Are we close to a lease timeout? */
79 if (time_after(now, last + lease/3)) { 78 if (time_after(now, last + lease/3)) {
80 cred = nfs4_get_renew_cred(clp); 79 cred = nfs4_get_renew_cred_locked(clp);
80 spin_unlock(&clp->cl_lock);
81 if (cred == NULL) { 81 if (cred == NULL) {
82 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 82 if (list_empty(&clp->cl_delegations)) {
83 spin_unlock(&clp->cl_lock); 83 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
84 goto out;
85 }
84 nfs_expire_all_delegations(clp); 86 nfs_expire_all_delegations(clp);
85 goto out; 87 } else {
88 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred);
90 put_rpccred(cred);
86 } 91 }
87 spin_unlock(&clp->cl_lock);
88 /* Queue an asynchronous RENEW. */
89 nfs4_proc_async_renew(clp, cred);
90 put_rpccred(cred);
91 timeout = (2 * lease) / 3; 92 timeout = (2 * lease) / 3;
92 spin_lock(&clp->cl_lock); 93 spin_lock(&clp->cl_lock);
93 } else 94 } else
@@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work)
100 cancel_delayed_work(&clp->cl_renewd); 101 cancel_delayed_work(&clp->cl_renewd);
101 schedule_delayed_work(&clp->cl_renewd, timeout); 102 schedule_delayed_work(&clp->cl_renewd, timeout);
102 spin_unlock(&clp->cl_lock); 103 spin_unlock(&clp->cl_lock);
104 nfs_expire_unreferenced_delegations(clp);
103out: 105out:
104 up_read(&clp->cl_sem);
105 dprintk("%s: done\n", __func__); 106 dprintk("%s: done\n", __func__);
106} 107}
107 108
108/* Must be called with clp->cl_sem locked for writes */
109void 109void
110nfs4_schedule_state_renewal(struct nfs_client *clp) 110nfs4_schedule_state_renewal(struct nfs_client *clp)
111{ 111{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 401ef8b28f9..2022fe47966 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
71 return status; 71 return status;
72} 72}
73 73
74static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp) 74static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
75{ 75{
76 struct rpc_cred *cred = NULL; 76 struct rpc_cred *cred = NULL;
77 77
78 spin_lock(&clp->cl_lock);
79 if (clp->cl_machine_cred != NULL) 78 if (clp->cl_machine_cred != NULL)
80 cred = get_rpccred(clp->cl_machine_cred); 79 cred = get_rpccred(clp->cl_machine_cred);
81 spin_unlock(&clp->cl_lock);
82 return cred; 80 return cred;
83} 81}
84 82
@@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
94 put_rpccred(cred); 92 put_rpccred(cred);
95} 93}
96 94
97struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) 95struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
98{ 96{
99 struct nfs4_state_owner *sp; 97 struct nfs4_state_owner *sp;
100 struct rb_node *pos; 98 struct rb_node *pos;
@@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
110 return cred; 108 return cred;
111} 109}
112 110
111static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
112{
113 struct rpc_cred *cred;
114
115 spin_lock(&clp->cl_lock);
116 cred = nfs4_get_renew_cred_locked(clp);
117 spin_unlock(&clp->cl_lock);
118 return cred;
119}
120
113static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) 121static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
114{ 122{
115 struct nfs4_state_owner *sp; 123 struct nfs4_state_owner *sp;
116 struct rb_node *pos; 124 struct rb_node *pos;
117 struct rpc_cred *cred; 125 struct rpc_cred *cred;
118 126
119 cred = nfs4_get_machine_cred(clp); 127 spin_lock(&clp->cl_lock);
128 cred = nfs4_get_machine_cred_locked(clp);
120 if (cred != NULL) 129 if (cred != NULL)
121 goto out; 130 goto out;
122 pos = rb_first(&clp->cl_state_owners); 131 pos = rb_first(&clp->cl_state_owners);
@@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
125 cred = get_rpccred(sp->so_cred); 134 cred = get_rpccred(sp->so_cred);
126 } 135 }
127out: 136out:
137 spin_unlock(&clp->cl_lock);
128 return cred; 138 return cred;
129} 139}
130 140
@@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
295 } 305 }
296} 306}
297 307
298/*
299 * Note: must be called with clp->cl_sem held in order to prevent races
300 * with reboot recovery!
301 */
302struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) 308struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
303{ 309{
304 struct nfs_client *clp = server->nfs_client; 310 struct nfs_client *clp = server->nfs_client;
@@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
327 return sp; 333 return sp;
328} 334}
329 335
330/*
331 * Must be called with clp->cl_sem held in order to avoid races
332 * with state recovery...
333 */
334void nfs4_put_state_owner(struct nfs4_state_owner *sp) 336void nfs4_put_state_owner(struct nfs4_state_owner *sp)
335{ 337{
336 struct nfs_client *clp = sp->so_client; 338 struct nfs_client *clp = sp->so_client;
@@ -361,18 +363,18 @@ nfs4_alloc_open_state(void)
361} 363}
362 364
363void 365void
364nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode) 366nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
365{ 367{
366 if (state->state == mode) 368 if (state->state == fmode)
367 return; 369 return;
368 /* NB! List reordering - see the reclaim code for why. */ 370 /* NB! List reordering - see the reclaim code for why. */
369 if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) { 371 if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
370 if (mode & FMODE_WRITE) 372 if (fmode & FMODE_WRITE)
371 list_move(&state->open_states, &state->owner->so_states); 373 list_move(&state->open_states, &state->owner->so_states);
372 else 374 else
373 list_move_tail(&state->open_states, &state->owner->so_states); 375 list_move_tail(&state->open_states, &state->owner->so_states);
374 } 376 }
375 state->state = mode; 377 state->state = fmode;
376} 378}
377 379
378static struct nfs4_state * 380static struct nfs4_state *
@@ -432,10 +434,6 @@ out:
432 return state; 434 return state;
433} 435}
434 436
435/*
436 * Beware! Caller must be holding exactly one
437 * reference to clp->cl_sem!
438 */
439void nfs4_put_open_state(struct nfs4_state *state) 437void nfs4_put_open_state(struct nfs4_state *state)
440{ 438{
441 struct inode *inode = state->inode; 439 struct inode *inode = state->inode;
@@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
456/* 454/*
457 * Close the current file. 455 * Close the current file.
458 */ 456 */
459static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait) 457static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
460{ 458{
461 struct nfs4_state_owner *owner = state->owner; 459 struct nfs4_state_owner *owner = state->owner;
462 int call_close = 0; 460 int call_close = 0;
463 int newstate; 461 fmode_t newstate;
464 462
465 atomic_inc(&owner->so_count); 463 atomic_inc(&owner->so_count);
466 /* Protect against nfs4_find_state() */ 464 /* Protect against nfs4_find_state() */
467 spin_lock(&owner->so_lock); 465 spin_lock(&owner->so_lock);
468 switch (mode & (FMODE_READ | FMODE_WRITE)) { 466 switch (fmode & (FMODE_READ | FMODE_WRITE)) {
469 case FMODE_READ: 467 case FMODE_READ:
470 state->n_rdonly--; 468 state->n_rdonly--;
471 break; 469 break;
@@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
500 nfs4_do_close(path, state, wait); 498 nfs4_do_close(path, state, wait);
501} 499}
502 500
503void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode) 501void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
504{ 502{
505 __nfs4_close(path, state, mode, 0); 503 __nfs4_close(path, state, fmode, 0);
506} 504}
507 505
508void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode) 506void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
509{ 507{
510 __nfs4_close(path, state, mode, 1); 508 __nfs4_close(path, state, fmode, 1);
511} 509}
512 510
513/* 511/*
@@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
568 * Return a compatible lock_state. If no initialized lock_state structure 566 * Return a compatible lock_state. If no initialized lock_state structure
569 * exists, return an uninitialized one. 567 * exists, return an uninitialized one.
570 * 568 *
571 * The caller must be holding clp->cl_sem
572 */ 569 */
573static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) 570static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
574{ 571{
@@ -770,32 +767,34 @@ unlock:
770 return status; 767 return status;
771} 768}
772 769
773static int reclaimer(void *); 770static int nfs4_run_state_manager(void *);
774 771
775static inline void nfs4_clear_recover_bit(struct nfs_client *clp) 772static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
776{ 773{
777 smp_mb__before_clear_bit(); 774 smp_mb__before_clear_bit();
778 clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); 775 clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
779 smp_mb__after_clear_bit(); 776 smp_mb__after_clear_bit();
780 wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); 777 wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
781 rpc_wake_up(&clp->cl_rpcwaitq); 778 rpc_wake_up(&clp->cl_rpcwaitq);
782} 779}
783 780
784/* 781/*
785 * State recovery routine 782 * Schedule the nfs_client asynchronous state management routine
786 */ 783 */
787static void nfs4_recover_state(struct nfs_client *clp) 784void nfs4_schedule_state_manager(struct nfs_client *clp)
788{ 785{
789 struct task_struct *task; 786 struct task_struct *task;
790 787
788 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
789 return;
791 __module_get(THIS_MODULE); 790 __module_get(THIS_MODULE);
792 atomic_inc(&clp->cl_count); 791 atomic_inc(&clp->cl_count);
793 task = kthread_run(reclaimer, clp, "%s-reclaim", 792 task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
794 rpc_peeraddr2str(clp->cl_rpcclient, 793 rpc_peeraddr2str(clp->cl_rpcclient,
795 RPC_DISPLAY_ADDR)); 794 RPC_DISPLAY_ADDR));
796 if (!IS_ERR(task)) 795 if (!IS_ERR(task))
797 return; 796 return;
798 nfs4_clear_recover_bit(clp); 797 nfs4_clear_state_manager_bit(clp);
799 nfs_put_client(clp); 798 nfs_put_client(clp);
800 module_put(THIS_MODULE); 799 module_put(THIS_MODULE);
801} 800}
@@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
807{ 806{
808 if (!clp) 807 if (!clp)
809 return; 808 return;
810 if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) 809 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
811 nfs4_recover_state(clp); 810 set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
811 nfs4_schedule_state_manager(clp);
812} 812}
813 813
814static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) 814static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
815{
816
817 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
818 /* Don't recover state that expired before the reboot */
819 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
820 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
821 return 0;
822 }
823 set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
824 set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
825 return 1;
826}
827
828int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
829{
830 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
831 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
832 set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
833 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
834 return 1;
835}
836
837static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
815{ 838{
816 struct inode *inode = state->inode; 839 struct inode *inode = state->inode;
840 struct nfs_inode *nfsi = NFS_I(inode);
817 struct file_lock *fl; 841 struct file_lock *fl;
818 int status = 0; 842 int status = 0;
819 843
844 down_write(&nfsi->rwsem);
820 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 845 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
821 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 846 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
822 continue; 847 continue;
@@ -839,12 +864,14 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
839 goto out_err; 864 goto out_err;
840 } 865 }
841 } 866 }
867 up_write(&nfsi->rwsem);
842 return 0; 868 return 0;
843out_err: 869out_err:
870 up_write(&nfsi->rwsem);
844 return status; 871 return status;
845} 872}
846 873
847static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct nfs4_state_owner *sp) 874static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
848{ 875{
849 struct nfs4_state *state; 876 struct nfs4_state *state;
850 struct nfs4_lock_state *lock; 877 struct nfs4_lock_state *lock;
@@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
858 * recovering after a network partition or a reboot from a 885 * recovering after a network partition or a reboot from a
859 * server that doesn't support a grace period. 886 * server that doesn't support a grace period.
860 */ 887 */
888restart:
889 spin_lock(&sp->so_lock);
861 list_for_each_entry(state, &sp->so_states, open_states) { 890 list_for_each_entry(state, &sp->so_states, open_states) {
891 if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
892 continue;
862 if (state->state == 0) 893 if (state->state == 0)
863 continue; 894 continue;
895 atomic_inc(&state->count);
896 spin_unlock(&sp->so_lock);
864 status = ops->recover_open(sp, state); 897 status = ops->recover_open(sp, state);
865 if (status >= 0) { 898 if (status >= 0) {
866 status = nfs4_reclaim_locks(ops, state); 899 status = nfs4_reclaim_locks(state, ops);
867 if (status < 0) 900 if (status >= 0) {
868 goto out_err; 901 list_for_each_entry(lock, &state->lock_states, ls_locks) {
869 list_for_each_entry(lock, &state->lock_states, ls_locks) { 902 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
870 if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) 903 printk("%s: Lock reclaim failed!\n",
871 printk("%s: Lock reclaim failed!\n",
872 __func__); 904 __func__);
905 }
906 nfs4_put_open_state(state);
907 goto restart;
873 } 908 }
874 continue;
875 } 909 }
876 switch (status) { 910 switch (status) {
877 default: 911 default:
878 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", 912 printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
879 __func__, status); 913 __func__, status);
880 case -ENOENT: 914 case -ENOENT:
881 case -NFS4ERR_RECLAIM_BAD: 915 case -ESTALE:
882 case -NFS4ERR_RECLAIM_CONFLICT:
883 /* 916 /*
884 * Open state on this file cannot be recovered 917 * Open state on this file cannot be recovered
885 * All we can do is revert to using the zero stateid. 918 * All we can do is revert to using the zero stateid.
@@ -889,84 +922,176 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
889 /* Mark the file as being 'closed' */ 922 /* Mark the file as being 'closed' */
890 state->state = 0; 923 state->state = 0;
891 break; 924 break;
925 case -NFS4ERR_RECLAIM_BAD:
926 case -NFS4ERR_RECLAIM_CONFLICT:
927 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
928 break;
892 case -NFS4ERR_EXPIRED: 929 case -NFS4ERR_EXPIRED:
893 case -NFS4ERR_NO_GRACE: 930 case -NFS4ERR_NO_GRACE:
931 nfs4_state_mark_reclaim_nograce(sp->so_client, state);
894 case -NFS4ERR_STALE_CLIENTID: 932 case -NFS4ERR_STALE_CLIENTID:
895 goto out_err; 933 goto out_err;
896 } 934 }
935 nfs4_put_open_state(state);
936 goto restart;
897 } 937 }
938 spin_unlock(&sp->so_lock);
898 return 0; 939 return 0;
899out_err: 940out_err:
941 nfs4_put_open_state(state);
900 return status; 942 return status;
901} 943}
902 944
903static void nfs4_state_mark_reclaim(struct nfs_client *clp) 945static void nfs4_clear_open_state(struct nfs4_state *state)
946{
947 struct nfs4_lock_state *lock;
948
949 clear_bit(NFS_DELEGATED_STATE, &state->flags);
950 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
951 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
952 clear_bit(NFS_O_RDWR_STATE, &state->flags);
953 list_for_each_entry(lock, &state->lock_states, ls_locks) {
954 lock->ls_seqid.flags = 0;
955 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
956 }
957}
958
959static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
904{ 960{
905 struct nfs4_state_owner *sp; 961 struct nfs4_state_owner *sp;
906 struct rb_node *pos; 962 struct rb_node *pos;
907 struct nfs4_state *state; 963 struct nfs4_state *state;
908 struct nfs4_lock_state *lock;
909 964
910 /* Reset all sequence ids to zero */ 965 /* Reset all sequence ids to zero */
911 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 966 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
912 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 967 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
913 sp->so_seqid.counter = 0;
914 sp->so_seqid.flags = 0; 968 sp->so_seqid.flags = 0;
915 spin_lock(&sp->so_lock); 969 spin_lock(&sp->so_lock);
916 list_for_each_entry(state, &sp->so_states, open_states) { 970 list_for_each_entry(state, &sp->so_states, open_states) {
917 clear_bit(NFS_DELEGATED_STATE, &state->flags); 971 if (mark_reclaim(clp, state))
918 clear_bit(NFS_O_RDONLY_STATE, &state->flags); 972 nfs4_clear_open_state(state);
919 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
920 clear_bit(NFS_O_RDWR_STATE, &state->flags);
921 list_for_each_entry(lock, &state->lock_states, ls_locks) {
922 lock->ls_seqid.counter = 0;
923 lock->ls_seqid.flags = 0;
924 lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
925 }
926 } 973 }
927 spin_unlock(&sp->so_lock); 974 spin_unlock(&sp->so_lock);
928 } 975 }
929} 976}
930 977
931static int reclaimer(void *ptr) 978static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
979{
980 /* Mark all delegations for reclaim */
981 nfs_delegation_mark_reclaim(clp);
982 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
983}
984
985static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
932{ 986{
933 struct nfs_client *clp = ptr;
934 struct nfs4_state_owner *sp; 987 struct nfs4_state_owner *sp;
935 struct rb_node *pos; 988 struct rb_node *pos;
936 struct nfs4_state_recovery_ops *ops; 989 struct nfs4_state *state;
937 struct rpc_cred *cred; 990
991 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
992 return;
993
994 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
995 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
996 spin_lock(&sp->so_lock);
997 list_for_each_entry(state, &sp->so_states, open_states) {
998 if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
999 continue;
1000 nfs4_state_mark_reclaim_nograce(clp, state);
1001 }
1002 spin_unlock(&sp->so_lock);
1003 }
1004
1005 nfs_delegation_reap_unclaimed(clp);
1006}
1007
1008static void nfs_delegation_clear_all(struct nfs_client *clp)
1009{
1010 nfs_delegation_mark_reclaim(clp);
1011 nfs_delegation_reap_unclaimed(clp);
1012}
1013
1014static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
1015{
1016 nfs_delegation_clear_all(clp);
1017 nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
1018}
1019
1020static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
1021{
1022 clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1023}
1024
1025static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
1026{
1027 switch (error) {
1028 case -NFS4ERR_CB_PATH_DOWN:
1029 nfs_handle_cb_pathdown(clp);
1030 break;
1031 case -NFS4ERR_STALE_CLIENTID:
1032 case -NFS4ERR_LEASE_MOVED:
1033 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1034 nfs4_state_start_reclaim_reboot(clp);
1035 break;
1036 case -NFS4ERR_EXPIRED:
1037 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1038 nfs4_state_start_reclaim_nograce(clp);
1039 }
1040}
1041
1042static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
1043{
1044 struct rb_node *pos;
938 int status = 0; 1045 int status = 0;
939 1046
940 allow_signal(SIGKILL); 1047restart:
1048 spin_lock(&clp->cl_lock);
1049 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1050 struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
1051 if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
1052 continue;
1053 atomic_inc(&sp->so_count);
1054 spin_unlock(&clp->cl_lock);
1055 status = nfs4_reclaim_open_state(sp, ops);
1056 if (status < 0) {
1057 set_bit(ops->owner_flag_bit, &sp->so_flags);
1058 nfs4_put_state_owner(sp);
1059 nfs4_recovery_handle_error(clp, status);
1060 return status;
1061 }
1062 nfs4_put_state_owner(sp);
1063 goto restart;
1064 }
1065 spin_unlock(&clp->cl_lock);
1066 return status;
1067}
941 1068
942 /* Ensure exclusive access to NFSv4 state */ 1069static int nfs4_check_lease(struct nfs_client *clp)
943 down_write(&clp->cl_sem); 1070{
944 /* Are there any NFS mounts out there? */ 1071 struct rpc_cred *cred;
945 if (list_empty(&clp->cl_superblocks)) 1072 int status = -NFS4ERR_EXPIRED;
946 goto out; 1073
947restart_loop: 1074 /* Is the client already known to have an expired lease? */
948 ops = &nfs4_network_partition_recovery_ops; 1075 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
949 /* Are there any open files on this volume? */ 1076 return 0;
950 cred = nfs4_get_renew_cred(clp); 1077 cred = nfs4_get_renew_cred(clp);
951 if (cred != NULL) { 1078 if (cred == NULL) {
952 /* Yes there are: try to renew the old lease */ 1079 cred = nfs4_get_setclientid_cred(clp);
953 status = nfs4_proc_renew(clp, cred); 1080 if (cred == NULL)
954 put_rpccred(cred); 1081 goto out;
955 switch (status) {
956 case 0:
957 case -NFS4ERR_CB_PATH_DOWN:
958 goto out;
959 case -NFS4ERR_STALE_CLIENTID:
960 case -NFS4ERR_LEASE_MOVED:
961 ops = &nfs4_reboot_recovery_ops;
962 }
963 } else {
964 /* "reboot" to ensure we clear all state on the server */
965 clp->cl_boot_time = CURRENT_TIME;
966 } 1082 }
967 /* We're going to have to re-establish a clientid */ 1083 status = nfs4_proc_renew(clp, cred);
968 nfs4_state_mark_reclaim(clp); 1084 put_rpccred(cred);
969 status = -ENOENT; 1085out:
1086 nfs4_recovery_handle_error(clp, status);
1087 return status;
1088}
1089
1090static int nfs4_reclaim_lease(struct nfs_client *clp)
1091{
1092 struct rpc_cred *cred;
1093 int status = -ENOENT;
1094
970 cred = nfs4_get_setclientid_cred(clp); 1095 cred = nfs4_get_setclientid_cred(clp);
971 if (cred != NULL) { 1096 if (cred != NULL) {
972 status = nfs4_init_client(clp, cred); 1097 status = nfs4_init_client(clp, cred);
@@ -974,42 +1099,90 @@ restart_loop:
974 /* Handle case where the user hasn't set up machine creds */ 1099 /* Handle case where the user hasn't set up machine creds */
975 if (status == -EACCES && cred == clp->cl_machine_cred) { 1100 if (status == -EACCES && cred == clp->cl_machine_cred) {
976 nfs4_clear_machine_cred(clp); 1101 nfs4_clear_machine_cred(clp);
977 goto restart_loop; 1102 status = -EAGAIN;
978 } 1103 }
979 } 1104 }
980 if (status) 1105 return status;
981 goto out_error; 1106}
982 /* Mark all delegations for reclaim */ 1107
983 nfs_delegation_mark_reclaim(clp); 1108static void nfs4_state_manager(struct nfs_client *clp)
984 /* Note: list is protected by exclusive lock on cl->cl_sem */ 1109{
985 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1110 int status = 0;
986 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1111
987 status = nfs4_reclaim_open_state(ops, sp); 1112 /* Ensure exclusive access to NFSv4 state */
988 if (status < 0) { 1113 for(;;) {
989 if (status == -NFS4ERR_NO_GRACE) { 1114 if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
990 ops = &nfs4_network_partition_recovery_ops; 1115 /* We're going to have to re-establish a clientid */
991 status = nfs4_reclaim_open_state(ops, sp); 1116 status = nfs4_reclaim_lease(clp);
1117 if (status) {
1118 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1119 if (status == -EAGAIN)
1120 continue;
1121 goto out_error;
992 } 1122 }
1123 clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
1124 }
1125
1126 if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
1127 status = nfs4_check_lease(clp);
1128 if (status != 0)
1129 continue;
1130 }
1131
1132 /* First recover reboot state... */
1133 if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1134 status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
993 if (status == -NFS4ERR_STALE_CLIENTID) 1135 if (status == -NFS4ERR_STALE_CLIENTID)
994 goto restart_loop; 1136 continue;
995 if (status == -NFS4ERR_EXPIRED) 1137 nfs4_state_end_reclaim_reboot(clp);
996 goto restart_loop; 1138 continue;
1139 }
1140
1141 /* Now recover expired state... */
1142 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1143 status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
1144 if (status < 0) {
1145 set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
1146 if (status == -NFS4ERR_STALE_CLIENTID)
1147 continue;
1148 if (status == -NFS4ERR_EXPIRED)
1149 continue;
1150 goto out_error;
1151 } else
1152 nfs4_state_end_reclaim_nograce(clp);
1153 continue;
997 } 1154 }
1155
1156 if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
1157 nfs_client_return_marked_delegations(clp);
1158 continue;
1159 }
1160
1161 nfs4_clear_state_manager_bit(clp);
1162 /* Did we race with an attempt to give us more work? */
1163 if (clp->cl_state == 0)
1164 break;
1165 if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
1166 break;
998 } 1167 }
999 nfs_delegation_reap_unclaimed(clp); 1168 return;
1000out: 1169out_error:
1001 up_write(&clp->cl_sem); 1170 printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
1002 if (status == -NFS4ERR_CB_PATH_DOWN) 1171 " with error %d\n", clp->cl_hostname, -status);
1003 nfs_handle_cb_pathdown(clp); 1172 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1004 nfs4_clear_recover_bit(clp); 1173 nfs4_state_end_reclaim_reboot(clp);
1174 nfs4_clear_state_manager_bit(clp);
1175}
1176
1177static int nfs4_run_state_manager(void *ptr)
1178{
1179 struct nfs_client *clp = ptr;
1180
1181 allow_signal(SIGKILL);
1182 nfs4_state_manager(clp);
1005 nfs_put_client(clp); 1183 nfs_put_client(clp);
1006 module_put_and_exit(0); 1184 module_put_and_exit(0);
1007 return 0; 1185 return 0;
1008out_error:
1009 printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
1010 " with error %d\n", clp->cl_hostname, -status);
1011 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
1012 goto out;
1013} 1186}
1014 1187
1015/* 1188/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d233..d1e4c8f8a0a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -8,7 +8,7 @@
8 * 8 *
9 * Kendrick Smith <kmsmith@umich.edu> 9 * Kendrick Smith <kmsmith@umich.edu>
10 * Andy Adamson <andros@umich.edu> 10 * Andy Adamson <andros@umich.edu>
11 * 11 *
12 * Redistribution and use in source and binary forms, with or without 12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions 13 * modification, are permitted provided that the following conditions
14 * are met: 14 * are met:
@@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int);
67#define NFS4_MAXTAGLEN 0 67#define NFS4_MAXTAGLEN 0
68#endif 68#endif
69 69
70/* lock,open owner id: 70/* lock,open owner id:
71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) 71 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
72 */ 72 */
73#define open_owner_id_maxsz (1 + 4) 73#define open_owner_id_maxsz (1 + 4)
@@ -541,6 +541,7 @@ static struct {
541struct compound_hdr { 541struct compound_hdr {
542 int32_t status; 542 int32_t status;
543 uint32_t nops; 543 uint32_t nops;
544 __be32 * nops_p;
544 uint32_t taglen; 545 uint32_t taglen;
545 char * tag; 546 char * tag;
546}; 547};
@@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
578 xdr_encode_opaque(p, str, len); 579 xdr_encode_opaque(p, str, len);
579} 580}
580 581
581static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 582static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
582{ 583{
583 __be32 *p; 584 __be32 *p;
584 585
@@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
588 WRITE32(hdr->taglen); 589 WRITE32(hdr->taglen);
589 WRITEMEM(hdr->tag, hdr->taglen); 590 WRITEMEM(hdr->tag, hdr->taglen);
590 WRITE32(NFS4_MINOR_VERSION); 591 WRITE32(NFS4_MINOR_VERSION);
592 hdr->nops_p = p;
591 WRITE32(hdr->nops); 593 WRITE32(hdr->nops);
592 return 0; 594}
595
596static void encode_nops(struct compound_hdr *hdr)
597{
598 *hdr->nops_p = htonl(hdr->nops);
593} 599}
594 600
595static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 601static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
@@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
601 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 607 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
602} 608}
603 609
604static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 610static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
605{ 611{
606 char owner_name[IDMAP_NAMESZ]; 612 char owner_name[IDMAP_NAMESZ];
607 char owner_group[IDMAP_NAMESZ]; 613 char owner_group[IDMAP_NAMESZ];
@@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
612 int len; 618 int len;
613 uint32_t bmval0 = 0; 619 uint32_t bmval0 = 0;
614 uint32_t bmval1 = 0; 620 uint32_t bmval1 = 0;
615 int status;
616 621
617 /* 622 /*
618 * We reserve enough space to write the entire attribute buffer at once. 623 * We reserve enough space to write the entire attribute buffer at once.
@@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
709 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 714 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
710 WRITE32(NFS4_SET_TO_SERVER_TIME); 715 WRITE32(NFS4_SET_TO_SERVER_TIME);
711 } 716 }
712 717
713 /* 718 /*
714 * Now we backfill the bitmap and the attribute buffer length. 719 * Now we backfill the bitmap and the attribute buffer length.
715 */ 720 */
@@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
723 *q++ = htonl(bmval1); 728 *q++ = htonl(bmval1);
724 *q++ = htonl(len); 729 *q++ = htonl(len);
725 730
726 status = 0;
727/* out: */ 731/* out: */
728 return status;
729} 732}
730 733
731static int encode_access(struct xdr_stream *xdr, u32 access) 734static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
732{ 735{
733 __be32 *p; 736 __be32 *p;
734 737
735 RESERVE_SPACE(8); 738 RESERVE_SPACE(8);
736 WRITE32(OP_ACCESS); 739 WRITE32(OP_ACCESS);
737 WRITE32(access); 740 WRITE32(access);
738 741 hdr->nops++;
739 return 0;
740} 742}
741 743
742static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 744static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
743{ 745{
744 __be32 *p; 746 __be32 *p;
745 747
@@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
747 WRITE32(OP_CLOSE); 749 WRITE32(OP_CLOSE);
748 WRITE32(arg->seqid->sequence->counter); 750 WRITE32(arg->seqid->sequence->counter);
749 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 751 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
750 752 hdr->nops++;
751 return 0;
752} 753}
753 754
754static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 755static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
755{ 756{
756 __be32 *p; 757 __be32 *p;
757
758 RESERVE_SPACE(16);
759 WRITE32(OP_COMMIT);
760 WRITE64(args->offset);
761 WRITE32(args->count);
762 758
763 return 0; 759 RESERVE_SPACE(16);
760 WRITE32(OP_COMMIT);
761 WRITE64(args->offset);
762 WRITE32(args->count);
763 hdr->nops++;
764} 764}
765 765
766static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 766static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
767{ 767{
768 __be32 *p; 768 __be32 *p;
769 769
770 RESERVE_SPACE(8); 770 RESERVE_SPACE(8);
771 WRITE32(OP_CREATE); 771 WRITE32(OP_CREATE);
772 WRITE32(create->ftype); 772 WRITE32(create->ftype);
@@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
791 RESERVE_SPACE(4 + create->name->len); 791 RESERVE_SPACE(4 + create->name->len);
792 WRITE32(create->name->len); 792 WRITE32(create->name->len);
793 WRITEMEM(create->name->name, create->name->len); 793 WRITEMEM(create->name->name, create->name->len);
794 hdr->nops++;
794 795
795 return encode_attrs(xdr, create->attrs, create->server); 796 encode_attrs(xdr, create->attrs, create->server);
796} 797}
797 798
798static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 799static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
799{ 800{
800 __be32 *p; 801 __be32 *p;
801 802
802 RESERVE_SPACE(12); 803 RESERVE_SPACE(12);
803 WRITE32(OP_GETATTR); 804 WRITE32(OP_GETATTR);
804 WRITE32(1); 805 WRITE32(1);
805 WRITE32(bitmap); 806 WRITE32(bitmap);
806 return 0; 807 hdr->nops++;
807} 808}
808 809
809static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 810static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
810{ 811{
811 __be32 *p; 812 __be32 *p;
812 813
813 RESERVE_SPACE(16); 814 RESERVE_SPACE(16);
814 WRITE32(OP_GETATTR); 815 WRITE32(OP_GETATTR);
815 WRITE32(2); 816 WRITE32(2);
816 WRITE32(bm0); 817 WRITE32(bm0);
817 WRITE32(bm1); 818 WRITE32(bm1);
818 return 0; 819 hdr->nops++;
819} 820}
820 821
821static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) 822static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
822{ 823{
823 return encode_getattr_two(xdr, 824 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
824 bitmask[0] & nfs4_fattr_bitmap[0], 825 bitmask[1] & nfs4_fattr_bitmap[1], hdr);
825 bitmask[1] & nfs4_fattr_bitmap[1]);
826} 826}
827 827
828static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) 828static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
829{ 829{
830 return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 830 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
831 bitmask[1] & nfs4_fsinfo_bitmap[1]); 831 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
832} 832}
833 833
834static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) 834static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
835{ 835{
836 return encode_getattr_two(xdr, 836 encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
837 bitmask[0] & nfs4_fs_locations_bitmap[0], 837 bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
838 bitmask[1] & nfs4_fs_locations_bitmap[1]);
839} 838}
840 839
841static int encode_getfh(struct xdr_stream *xdr) 840static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
842{ 841{
843 __be32 *p; 842 __be32 *p;
844 843
845 RESERVE_SPACE(4); 844 RESERVE_SPACE(4);
846 WRITE32(OP_GETFH); 845 WRITE32(OP_GETFH);
847 846 hdr->nops++;
848 return 0;
849} 847}
850 848
851static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 849static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
852{ 850{
853 __be32 *p; 851 __be32 *p;
854 852
@@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
856 WRITE32(OP_LINK); 854 WRITE32(OP_LINK);
857 WRITE32(name->len); 855 WRITE32(name->len);
858 WRITEMEM(name->name, name->len); 856 WRITEMEM(name->name, name->len);
859 857 hdr->nops++;
860 return 0;
861} 858}
862 859
863static inline int nfs4_lock_type(struct file_lock *fl, int block) 860static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
878 * opcode,type,reclaim,offset,length,new_lock_owner = 32 875 * opcode,type,reclaim,offset,length,new_lock_owner = 32
879 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 876 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
880 */ 877 */
881static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 878static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
882{ 879{
883 __be32 *p; 880 __be32 *p;
884 881
@@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
904 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 901 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
905 WRITE32(args->lock_seqid->sequence->counter); 902 WRITE32(args->lock_seqid->sequence->counter);
906 } 903 }
907 904 hdr->nops++;
908 return 0;
909} 905}
910 906
911static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 907static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
912{ 908{
913 __be32 *p; 909 __be32 *p;
914 910
@@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
921 WRITE32(16); 917 WRITE32(16);
922 WRITEMEM("lock id:", 8); 918 WRITEMEM("lock id:", 8);
923 WRITE64(args->lock_owner.id); 919 WRITE64(args->lock_owner.id);
924 920 hdr->nops++;
925 return 0;
926} 921}
927 922
928static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 923static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
929{ 924{
930 __be32 *p; 925 __be32 *p;
931 926
@@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
936 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 931 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
937 WRITE64(args->fl->fl_start); 932 WRITE64(args->fl->fl_start);
938 WRITE64(nfs4_lock_length(args->fl)); 933 WRITE64(nfs4_lock_length(args->fl));
939 934 hdr->nops++;
940 return 0;
941} 935}
942 936
943static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 937static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
944{ 938{
945 int len = name->len; 939 int len = name->len;
946 __be32 *p; 940 __be32 *p;
@@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
949 WRITE32(OP_LOOKUP); 943 WRITE32(OP_LOOKUP);
950 WRITE32(len); 944 WRITE32(len);
951 WRITEMEM(name->name, len); 945 WRITEMEM(name->name, len);
952 946 hdr->nops++;
953 return 0;
954} 947}
955 948
956static void encode_share_access(struct xdr_stream *xdr, int open_flags) 949static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
957{ 950{
958 __be32 *p; 951 __be32 *p;
959 952
960 RESERVE_SPACE(8); 953 RESERVE_SPACE(8);
961 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 954 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
962 case FMODE_READ: 955 case FMODE_READ:
963 WRITE32(NFS4_SHARE_ACCESS_READ); 956 WRITE32(NFS4_SHARE_ACCESS_READ);
964 break; 957 break;
965 case FMODE_WRITE: 958 case FMODE_WRITE:
966 WRITE32(NFS4_SHARE_ACCESS_WRITE); 959 WRITE32(NFS4_SHARE_ACCESS_WRITE);
967 break; 960 break;
968 case FMODE_READ|FMODE_WRITE: 961 case FMODE_READ|FMODE_WRITE:
969 WRITE32(NFS4_SHARE_ACCESS_BOTH); 962 WRITE32(NFS4_SHARE_ACCESS_BOTH);
970 break; 963 break;
971 default: 964 default:
972 BUG(); 965 WRITE32(0);
973 } 966 }
974 WRITE32(0); /* for linux, share_deny = 0 always */ 967 WRITE32(0); /* for linux, share_deny = 0 always */
975} 968}
@@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
984 RESERVE_SPACE(8); 977 RESERVE_SPACE(8);
985 WRITE32(OP_OPEN); 978 WRITE32(OP_OPEN);
986 WRITE32(arg->seqid->sequence->counter); 979 WRITE32(arg->seqid->sequence->counter);
987 encode_share_access(xdr, arg->open_flags); 980 encode_share_access(xdr, arg->fmode);
988 RESERVE_SPACE(28); 981 RESERVE_SPACE(28);
989 WRITE64(arg->clientid); 982 WRITE64(arg->clientid);
990 WRITE32(16); 983 WRITE32(16);
@@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
998 991
999 RESERVE_SPACE(4); 992 RESERVE_SPACE(4);
1000 switch(arg->open_flags & O_EXCL) { 993 switch(arg->open_flags & O_EXCL) {
1001 case 0: 994 case 0:
1002 WRITE32(NFS4_CREATE_UNCHECKED); 995 WRITE32(NFS4_CREATE_UNCHECKED);
1003 encode_attrs(xdr, arg->u.attrs, arg->server); 996 encode_attrs(xdr, arg->u.attrs, arg->server);
1004 break; 997 break;
1005 default: 998 default:
1006 WRITE32(NFS4_CREATE_EXCLUSIVE); 999 WRITE32(NFS4_CREATE_EXCLUSIVE);
1007 encode_nfs4_verifier(xdr, &arg->u.verifier); 1000 encode_nfs4_verifier(xdr, &arg->u.verifier);
1008 } 1001 }
1009} 1002}
1010 1003
@@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1014 1007
1015 RESERVE_SPACE(4); 1008 RESERVE_SPACE(4);
1016 switch (arg->open_flags & O_CREAT) { 1009 switch (arg->open_flags & O_CREAT) {
1017 case 0: 1010 case 0:
1018 WRITE32(NFS4_OPEN_NOCREATE); 1011 WRITE32(NFS4_OPEN_NOCREATE);
1019 break; 1012 break;
1020 default: 1013 default:
1021 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1014 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1022 WRITE32(NFS4_OPEN_CREATE); 1015 WRITE32(NFS4_OPEN_CREATE);
1023 encode_createmode(xdr, arg); 1016 encode_createmode(xdr, arg);
1024 } 1017 }
1025} 1018}
1026 1019
1027static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 1020static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
1028{ 1021{
1029 __be32 *p; 1022 __be32 *p;
1030 1023
1031 RESERVE_SPACE(4); 1024 RESERVE_SPACE(4);
1032 switch (delegation_type) { 1025 switch (delegation_type) {
1033 case 0: 1026 case 0:
1034 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1027 WRITE32(NFS4_OPEN_DELEGATE_NONE);
1035 break; 1028 break;
1036 case FMODE_READ: 1029 case FMODE_READ:
1037 WRITE32(NFS4_OPEN_DELEGATE_READ); 1030 WRITE32(NFS4_OPEN_DELEGATE_READ);
1038 break; 1031 break;
1039 case FMODE_WRITE|FMODE_READ: 1032 case FMODE_WRITE|FMODE_READ:
1040 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1033 WRITE32(NFS4_OPEN_DELEGATE_WRITE);
1041 break; 1034 break;
1042 default: 1035 default:
1043 BUG(); 1036 BUG();
1044 } 1037 }
1045} 1038}
1046 1039
@@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1053 encode_string(xdr, name->len, name->name); 1046 encode_string(xdr, name->len, name->name);
1054} 1047}
1055 1048
1056static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 1049static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1057{ 1050{
1058 __be32 *p; 1051 __be32 *p;
1059 1052
@@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1072 encode_string(xdr, name->len, name->name); 1065 encode_string(xdr, name->len, name->name);
1073} 1066}
1074 1067
1075static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1068static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1076{ 1069{
1077 encode_openhdr(xdr, arg); 1070 encode_openhdr(xdr, arg);
1078 encode_opentype(xdr, arg); 1071 encode_opentype(xdr, arg);
1079 switch (arg->claim) { 1072 switch (arg->claim) {
1080 case NFS4_OPEN_CLAIM_NULL: 1073 case NFS4_OPEN_CLAIM_NULL:
1081 encode_claim_null(xdr, arg->name); 1074 encode_claim_null(xdr, arg->name);
1082 break; 1075 break;
1083 case NFS4_OPEN_CLAIM_PREVIOUS: 1076 case NFS4_OPEN_CLAIM_PREVIOUS:
1084 encode_claim_previous(xdr, arg->u.delegation_type); 1077 encode_claim_previous(xdr, arg->u.delegation_type);
1085 break; 1078 break;
1086 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 1079 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
1087 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); 1080 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
1088 break; 1081 break;
1089 default: 1082 default:
1090 BUG(); 1083 BUG();
1091 } 1084 }
1092 return 0; 1085 hdr->nops++;
1093} 1086}
1094 1087
1095static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 1088static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1096{ 1089{
1097 __be32 *p; 1090 __be32 *p;
1098 1091
@@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1100 WRITE32(OP_OPEN_CONFIRM); 1093 WRITE32(OP_OPEN_CONFIRM);
1101 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1094 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1102 WRITE32(arg->seqid->sequence->counter); 1095 WRITE32(arg->seqid->sequence->counter);
1103 1096 hdr->nops++;
1104 return 0;
1105} 1097}
1106 1098
1107static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1099static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1108{ 1100{
1109 __be32 *p; 1101 __be32 *p;
1110 1102
@@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
1112 WRITE32(OP_OPEN_DOWNGRADE); 1104 WRITE32(OP_OPEN_DOWNGRADE);
1113 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1105 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1114 WRITE32(arg->seqid->sequence->counter); 1106 WRITE32(arg->seqid->sequence->counter);
1115 encode_share_access(xdr, arg->open_flags); 1107 encode_share_access(xdr, arg->fmode);
1116 return 0; 1108 hdr->nops++;
1117} 1109}
1118 1110
1119static int 1111static void
1120encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1112encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1121{ 1113{
1122 int len = fh->size; 1114 int len = fh->size;
1123 __be32 *p; 1115 __be32 *p;
@@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1126 WRITE32(OP_PUTFH); 1118 WRITE32(OP_PUTFH);
1127 WRITE32(len); 1119 WRITE32(len);
1128 WRITEMEM(fh->data, len); 1120 WRITEMEM(fh->data, len);
1129 1121 hdr->nops++;
1130 return 0;
1131} 1122}
1132 1123
1133static int encode_putrootfh(struct xdr_stream *xdr) 1124static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1134{ 1125{
1135 __be32 *p; 1126 __be32 *p;
1136
1137 RESERVE_SPACE(4);
1138 WRITE32(OP_PUTROOTFH);
1139 1127
1140 return 0; 1128 RESERVE_SPACE(4);
1129 WRITE32(OP_PUTROOTFH);
1130 hdr->nops++;
1141} 1131}
1142 1132
1143static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1133static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1153 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1143 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
1154} 1144}
1155 1145
1156static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1146static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1157{ 1147{
1158 __be32 *p; 1148 __be32 *p;
1159 1149
@@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1165 RESERVE_SPACE(12); 1155 RESERVE_SPACE(12);
1166 WRITE64(args->offset); 1156 WRITE64(args->offset);
1167 WRITE32(args->count); 1157 WRITE32(args->count);
1168 1158 hdr->nops++;
1169 return 0;
1170} 1159}
1171 1160
1172static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) 1161static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1173{ 1162{
1174 uint32_t attrs[2] = { 1163 uint32_t attrs[2] = {
1175 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1164 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
@@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1191 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1180 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1192 WRITE32(attrs[0] & readdir->bitmask[0]); 1181 WRITE32(attrs[0] & readdir->bitmask[0]);
1193 WRITE32(attrs[1] & readdir->bitmask[1]); 1182 WRITE32(attrs[1] & readdir->bitmask[1]);
1183 hdr->nops++;
1194 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1184 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1195 __func__, 1185 __func__,
1196 (unsigned long long)readdir->cookie, 1186 (unsigned long long)readdir->cookie,
@@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1198 ((u32 *)readdir->verifier.data)[1], 1188 ((u32 *)readdir->verifier.data)[1],
1199 attrs[0] & readdir->bitmask[0], 1189 attrs[0] & readdir->bitmask[0],
1200 attrs[1] & readdir->bitmask[1]); 1190 attrs[1] & readdir->bitmask[1]);
1201
1202 return 0;
1203} 1191}
1204 1192
1205static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) 1193static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1206{ 1194{
1207 __be32 *p; 1195 __be32 *p;
1208 1196
1209 RESERVE_SPACE(4); 1197 RESERVE_SPACE(4);
1210 WRITE32(OP_READLINK); 1198 WRITE32(OP_READLINK);
1211 1199 hdr->nops++;
1212 return 0;
1213} 1200}
1214 1201
1215static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1202static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1216{ 1203{
1217 __be32 *p; 1204 __be32 *p;
1218 1205
@@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1220 WRITE32(OP_REMOVE); 1207 WRITE32(OP_REMOVE);
1221 WRITE32(name->len); 1208 WRITE32(name->len);
1222 WRITEMEM(name->name, name->len); 1209 WRITEMEM(name->name, name->len);
1223 1210 hdr->nops++;
1224 return 0;
1225} 1211}
1226 1212
1227static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1213static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1228{ 1214{
1229 __be32 *p; 1215 __be32 *p;
1230 1216
@@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1232 WRITE32(OP_RENAME); 1218 WRITE32(OP_RENAME);
1233 WRITE32(oldname->len); 1219 WRITE32(oldname->len);
1234 WRITEMEM(oldname->name, oldname->len); 1220 WRITEMEM(oldname->name, oldname->len);
1235 1221
1236 RESERVE_SPACE(4 + newname->len); 1222 RESERVE_SPACE(4 + newname->len);
1237 WRITE32(newname->len); 1223 WRITE32(newname->len);
1238 WRITEMEM(newname->name, newname->len); 1224 WRITEMEM(newname->name, newname->len);
1239 1225 hdr->nops++;
1240 return 0;
1241} 1226}
1242 1227
1243static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1228static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
1244{ 1229{
1245 __be32 *p; 1230 __be32 *p;
1246 1231
1247 RESERVE_SPACE(12); 1232 RESERVE_SPACE(12);
1248 WRITE32(OP_RENEW); 1233 WRITE32(OP_RENEW);
1249 WRITE64(client_stateid->cl_clientid); 1234 WRITE64(client_stateid->cl_clientid);
1250 1235 hdr->nops++;
1251 return 0;
1252} 1236}
1253 1237
1254static int 1238static void
1255encode_restorefh(struct xdr_stream *xdr) 1239encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1256{ 1240{
1257 __be32 *p; 1241 __be32 *p;
1258 1242
1259 RESERVE_SPACE(4); 1243 RESERVE_SPACE(4);
1260 WRITE32(OP_RESTOREFH); 1244 WRITE32(OP_RESTOREFH);
1261 1245 hdr->nops++;
1262 return 0;
1263} 1246}
1264 1247
1265static int 1248static int
1266encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1249encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1267{ 1250{
1268 __be32 *p; 1251 __be32 *p;
1269 1252
@@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1278 RESERVE_SPACE(4); 1261 RESERVE_SPACE(4);
1279 WRITE32(arg->acl_len); 1262 WRITE32(arg->acl_len);
1280 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1263 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1264 hdr->nops++;
1281 return 0; 1265 return 0;
1282} 1266}
1283 1267
1284static int 1268static void
1285encode_savefh(struct xdr_stream *xdr) 1269encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1286{ 1270{
1287 __be32 *p; 1271 __be32 *p;
1288 1272
1289 RESERVE_SPACE(4); 1273 RESERVE_SPACE(4);
1290 WRITE32(OP_SAVEFH); 1274 WRITE32(OP_SAVEFH);
1291 1275 hdr->nops++;
1292 return 0;
1293} 1276}
1294 1277
1295static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1278static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1296{ 1279{
1297 int status;
1298 __be32 *p; 1280 __be32 *p;
1299
1300 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1301 WRITE32(OP_SETATTR);
1302 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1303 1281
1304 if ((status = encode_attrs(xdr, arg->iap, server))) 1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1305 return status; 1283 WRITE32(OP_SETATTR);
1306 1284 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1307 return 0; 1285 hdr->nops++;
1286 encode_attrs(xdr, arg->iap, server);
1308} 1287}
1309 1288
1310static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1289static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
1311{ 1290{
1312 __be32 *p; 1291 __be32 *p;
1313 1292
@@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1322 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1301 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1323 RESERVE_SPACE(4); 1302 RESERVE_SPACE(4);
1324 WRITE32(setclientid->sc_cb_ident); 1303 WRITE32(setclientid->sc_cb_ident);
1325 1304 hdr->nops++;
1326 return 0;
1327} 1305}
1328 1306
1329static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1307static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
1330{ 1308{
1331 __be32 *p; 1309 __be32 *p;
1332
1333 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1334 WRITE32(OP_SETCLIENTID_CONFIRM);
1335 WRITE64(client_state->cl_clientid);
1336 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1337 1310
1338 return 0; 1311 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1312 WRITE32(OP_SETCLIENTID_CONFIRM);
1313 WRITE64(client_state->cl_clientid);
1314 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1315 hdr->nops++;
1339} 1316}
1340 1317
1341static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1318static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1342{ 1319{
1343 __be32 *p; 1320 __be32 *p;
1344 1321
@@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1353 WRITE32(args->count); 1330 WRITE32(args->count);
1354 1331
1355 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1332 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1356 1333 hdr->nops++;
1357 return 0;
1358} 1334}
1359 1335
1360static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1336static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1361{ 1337{
1362 __be32 *p; 1338 __be32 *p;
1363 1339
@@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1365 1341
1366 WRITE32(OP_DELEGRETURN); 1342 WRITE32(OP_DELEGRETURN);
1367 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1343 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1368 return 0; 1344 hdr->nops++;
1369
1370} 1345}
1371/* 1346/*
1372 * END OF "GENERIC" ENCODE ROUTINES. 1347 * END OF "GENERIC" ENCODE ROUTINES.
@@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1379{ 1354{
1380 struct xdr_stream xdr; 1355 struct xdr_stream xdr;
1381 struct compound_hdr hdr = { 1356 struct compound_hdr hdr = {
1382 .nops = 3, 1357 .nops = 0,
1383 }; 1358 };
1384 int status;
1385 1359
1386 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1360 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1387 encode_compound_hdr(&xdr, &hdr); 1361 encode_compound_hdr(&xdr, &hdr);
1388 status = encode_putfh(&xdr, args->fh); 1362 encode_putfh(&xdr, args->fh, &hdr);
1389 if (status != 0) 1363 encode_access(&xdr, args->access, &hdr);
1390 goto out; 1364 encode_getfattr(&xdr, args->bitmask, &hdr);
1391 status = encode_access(&xdr, args->access); 1365 encode_nops(&hdr);
1392 if (status != 0) 1366 return 0;
1393 goto out;
1394 status = encode_getfattr(&xdr, args->bitmask);
1395out:
1396 return status;
1397} 1367}
1398 1368
1399/* 1369/*
@@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1403{ 1373{
1404 struct xdr_stream xdr; 1374 struct xdr_stream xdr;
1405 struct compound_hdr hdr = { 1375 struct compound_hdr hdr = {
1406 .nops = 4, 1376 .nops = 0,
1407 }; 1377 };
1408 int status;
1409 1378
1410 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1411 encode_compound_hdr(&xdr, &hdr); 1380 encode_compound_hdr(&xdr, &hdr);
1412 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1381 encode_putfh(&xdr, args->dir_fh, &hdr);
1413 goto out; 1382 encode_lookup(&xdr, args->name, &hdr);
1414 if ((status = encode_lookup(&xdr, args->name)) != 0) 1383 encode_getfh(&xdr, &hdr);
1415 goto out; 1384 encode_getfattr(&xdr, args->bitmask, &hdr);
1416 if ((status = encode_getfh(&xdr)) != 0) 1385 encode_nops(&hdr);
1417 goto out; 1386 return 0;
1418 status = encode_getfattr(&xdr, args->bitmask);
1419out:
1420 return status;
1421} 1387}
1422 1388
1423/* 1389/*
@@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1427{ 1393{
1428 struct xdr_stream xdr; 1394 struct xdr_stream xdr;
1429 struct compound_hdr hdr = { 1395 struct compound_hdr hdr = {
1430 .nops = 3, 1396 .nops = 0,
1431 }; 1397 };
1432 int status;
1433 1398
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1399 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1400 encode_compound_hdr(&xdr, &hdr);
1436 if ((status = encode_putrootfh(&xdr)) != 0) 1401 encode_putrootfh(&xdr, &hdr);
1437 goto out; 1402 encode_getfh(&xdr, &hdr);
1438 if ((status = encode_getfh(&xdr)) == 0) 1403 encode_getfattr(&xdr, args->bitmask, &hdr);
1439 status = encode_getfattr(&xdr, args->bitmask); 1404 encode_nops(&hdr);
1440out: 1405 return 0;
1441 return status;
1442} 1406}
1443 1407
1444/* 1408/*
@@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1448{ 1412{
1449 struct xdr_stream xdr; 1413 struct xdr_stream xdr;
1450 struct compound_hdr hdr = { 1414 struct compound_hdr hdr = {
1451 .nops = 3, 1415 .nops = 0,
1452 }; 1416 };
1453 int status;
1454 1417
1455 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1418 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1456 encode_compound_hdr(&xdr, &hdr); 1419 encode_compound_hdr(&xdr, &hdr);
1457 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1420 encode_putfh(&xdr, args->fh, &hdr);
1458 goto out; 1421 encode_remove(&xdr, &args->name, &hdr);
1459 if ((status = encode_remove(&xdr, &args->name)) != 0) 1422 encode_getfattr(&xdr, args->bitmask, &hdr);
1460 goto out; 1423 encode_nops(&hdr);
1461 status = encode_getfattr(&xdr, args->bitmask); 1424 return 0;
1462out:
1463 return status;
1464} 1425}
1465 1426
1466/* 1427/*
@@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1470{ 1431{
1471 struct xdr_stream xdr; 1432 struct xdr_stream xdr;
1472 struct compound_hdr hdr = { 1433 struct compound_hdr hdr = {
1473 .nops = 7, 1434 .nops = 0,
1474 }; 1435 };
1475 int status;
1476 1436
1477 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1437 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1478 encode_compound_hdr(&xdr, &hdr); 1438 encode_compound_hdr(&xdr, &hdr);
1479 if ((status = encode_putfh(&xdr, args->old_dir)) != 0) 1439 encode_putfh(&xdr, args->old_dir, &hdr);
1480 goto out; 1440 encode_savefh(&xdr, &hdr);
1481 if ((status = encode_savefh(&xdr)) != 0) 1441 encode_putfh(&xdr, args->new_dir, &hdr);
1482 goto out; 1442 encode_rename(&xdr, args->old_name, args->new_name, &hdr);
1483 if ((status = encode_putfh(&xdr, args->new_dir)) != 0) 1443 encode_getfattr(&xdr, args->bitmask, &hdr);
1484 goto out; 1444 encode_restorefh(&xdr, &hdr);
1485 if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) 1445 encode_getfattr(&xdr, args->bitmask, &hdr);
1486 goto out; 1446 encode_nops(&hdr);
1487 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1447 return 0;
1488 goto out;
1489 if ((status = encode_restorefh(&xdr)) != 0)
1490 goto out;
1491 status = encode_getfattr(&xdr, args->bitmask);
1492out:
1493 return status;
1494} 1448}
1495 1449
1496/* 1450/*
@@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1500{ 1454{
1501 struct xdr_stream xdr; 1455 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1456 struct compound_hdr hdr = {
1503 .nops = 7, 1457 .nops = 0,
1504 }; 1458 };
1505 int status;
1506 1459
1507 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1460 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1508 encode_compound_hdr(&xdr, &hdr); 1461 encode_compound_hdr(&xdr, &hdr);
1509 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1462 encode_putfh(&xdr, args->fh, &hdr);
1510 goto out; 1463 encode_savefh(&xdr, &hdr);
1511 if ((status = encode_savefh(&xdr)) != 0) 1464 encode_putfh(&xdr, args->dir_fh, &hdr);
1512 goto out; 1465 encode_link(&xdr, args->name, &hdr);
1513 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1466 encode_getfattr(&xdr, args->bitmask, &hdr);
1514 goto out; 1467 encode_restorefh(&xdr, &hdr);
1515 if ((status = encode_link(&xdr, args->name)) != 0) 1468 encode_getfattr(&xdr, args->bitmask, &hdr);
1516 goto out; 1469 encode_nops(&hdr);
1517 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1470 return 0;
1518 goto out;
1519 if ((status = encode_restorefh(&xdr)) != 0)
1520 goto out;
1521 status = encode_getfattr(&xdr, args->bitmask);
1522out:
1523 return status;
1524} 1471}
1525 1472
1526/* 1473/*
@@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1530{ 1477{
1531 struct xdr_stream xdr; 1478 struct xdr_stream xdr;
1532 struct compound_hdr hdr = { 1479 struct compound_hdr hdr = {
1533 .nops = 7, 1480 .nops = 0,
1534 }; 1481 };
1535 int status;
1536 1482
1537 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1483 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1538 encode_compound_hdr(&xdr, &hdr); 1484 encode_compound_hdr(&xdr, &hdr);
1539 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1485 encode_putfh(&xdr, args->dir_fh, &hdr);
1540 goto out; 1486 encode_savefh(&xdr, &hdr);
1541 if ((status = encode_savefh(&xdr)) != 0) 1487 encode_create(&xdr, args, &hdr);
1542 goto out; 1488 encode_getfh(&xdr, &hdr);
1543 if ((status = encode_create(&xdr, args)) != 0) 1489 encode_getfattr(&xdr, args->bitmask, &hdr);
1544 goto out; 1490 encode_restorefh(&xdr, &hdr);
1545 if ((status = encode_getfh(&xdr)) != 0) 1491 encode_getfattr(&xdr, args->bitmask, &hdr);
1546 goto out; 1492 encode_nops(&hdr);
1547 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1493 return 0;
1548 goto out;
1549 if ((status = encode_restorefh(&xdr)) != 0)
1550 goto out;
1551 status = encode_getfattr(&xdr, args->bitmask);
1552out:
1553 return status;
1554} 1494}
1555 1495
1556/* 1496/*
@@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1568{ 1508{
1569 struct xdr_stream xdr; 1509 struct xdr_stream xdr;
1570 struct compound_hdr hdr = { 1510 struct compound_hdr hdr = {
1571 .nops = 2, 1511 .nops = 0,
1572 }; 1512 };
1573 int status;
1574 1513
1575 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1514 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1576 encode_compound_hdr(&xdr, &hdr); 1515 encode_compound_hdr(&xdr, &hdr);
1577 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1516 encode_putfh(&xdr, args->fh, &hdr);
1578 status = encode_getfattr(&xdr, args->bitmask); 1517 encode_getfattr(&xdr, args->bitmask, &hdr);
1579 return status; 1518 encode_nops(&hdr);
1519 return 0;
1580} 1520}
1581 1521
1582/* 1522/*
@@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1584 */ 1524 */
1585static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1525static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1586{ 1526{
1587 struct xdr_stream xdr; 1527 struct xdr_stream xdr;
1588 struct compound_hdr hdr = { 1528 struct compound_hdr hdr = {
1589 .nops = 3, 1529 .nops = 0,
1590 }; 1530 };
1591 int status; 1531
1592 1532 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1593 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1533 encode_compound_hdr(&xdr, &hdr);
1594 encode_compound_hdr(&xdr, &hdr); 1534 encode_putfh(&xdr, args->fh, &hdr);
1595 status = encode_putfh(&xdr, args->fh); 1535 encode_close(&xdr, args, &hdr);
1596 if(status) 1536 encode_getfattr(&xdr, args->bitmask, &hdr);
1597 goto out; 1537 encode_nops(&hdr);
1598 status = encode_close(&xdr, args); 1538 return 0;
1599 if (status != 0)
1600 goto out;
1601 status = encode_getfattr(&xdr, args->bitmask);
1602out:
1603 return status;
1604} 1539}
1605 1540
1606/* 1541/*
@@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1610{ 1545{
1611 struct xdr_stream xdr; 1546 struct xdr_stream xdr;
1612 struct compound_hdr hdr = { 1547 struct compound_hdr hdr = {
1613 .nops = 7, 1548 .nops = 0,
1614 }; 1549 };
1615 int status;
1616 1550
1617 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1551 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1618 encode_compound_hdr(&xdr, &hdr); 1552 encode_compound_hdr(&xdr, &hdr);
1619 status = encode_putfh(&xdr, args->fh); 1553 encode_putfh(&xdr, args->fh, &hdr);
1620 if (status) 1554 encode_savefh(&xdr, &hdr);
1621 goto out; 1555 encode_open(&xdr, args, &hdr);
1622 status = encode_savefh(&xdr); 1556 encode_getfh(&xdr, &hdr);
1623 if (status) 1557 encode_getfattr(&xdr, args->bitmask, &hdr);
1624 goto out; 1558 encode_restorefh(&xdr, &hdr);
1625 status = encode_open(&xdr, args); 1559 encode_getfattr(&xdr, args->bitmask, &hdr);
1626 if (status) 1560 encode_nops(&hdr);
1627 goto out; 1561 return 0;
1628 status = encode_getfh(&xdr);
1629 if (status)
1630 goto out;
1631 status = encode_getfattr(&xdr, args->bitmask);
1632 if (status)
1633 goto out;
1634 status = encode_restorefh(&xdr);
1635 if (status)
1636 goto out;
1637 status = encode_getfattr(&xdr, args->bitmask);
1638out:
1639 return status;
1640} 1562}
1641 1563
1642/* 1564/*
@@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1646{ 1568{
1647 struct xdr_stream xdr; 1569 struct xdr_stream xdr;
1648 struct compound_hdr hdr = { 1570 struct compound_hdr hdr = {
1649 .nops = 2, 1571 .nops = 0,
1650 }; 1572 };
1651 int status;
1652 1573
1653 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1574 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1654 encode_compound_hdr(&xdr, &hdr); 1575 encode_compound_hdr(&xdr, &hdr);
1655 status = encode_putfh(&xdr, args->fh); 1576 encode_putfh(&xdr, args->fh, &hdr);
1656 if(status) 1577 encode_open_confirm(&xdr, args, &hdr);
1657 goto out; 1578 encode_nops(&hdr);
1658 status = encode_open_confirm(&xdr, args); 1579 return 0;
1659out:
1660 return status;
1661} 1580}
1662 1581
1663/* 1582/*
@@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1667{ 1586{
1668 struct xdr_stream xdr; 1587 struct xdr_stream xdr;
1669 struct compound_hdr hdr = { 1588 struct compound_hdr hdr = {
1670 .nops = 3, 1589 .nops = 0,
1671 }; 1590 };
1672 int status;
1673 1591
1674 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1592 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1675 encode_compound_hdr(&xdr, &hdr); 1593 encode_compound_hdr(&xdr, &hdr);
1676 status = encode_putfh(&xdr, args->fh); 1594 encode_putfh(&xdr, args->fh, &hdr);
1677 if (status) 1595 encode_open(&xdr, args, &hdr);
1678 goto out; 1596 encode_getfattr(&xdr, args->bitmask, &hdr);
1679 status = encode_open(&xdr, args); 1597 encode_nops(&hdr);
1680 if (status) 1598 return 0;
1681 goto out;
1682 status = encode_getfattr(&xdr, args->bitmask);
1683out:
1684 return status;
1685} 1599}
1686 1600
1687/* 1601/*
@@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1691{ 1605{
1692 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1693 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
1694 .nops = 3, 1608 .nops = 0,
1695 }; 1609 };
1696 int status;
1697 1610
1698 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1611 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1699 encode_compound_hdr(&xdr, &hdr); 1612 encode_compound_hdr(&xdr, &hdr);
1700 status = encode_putfh(&xdr, args->fh); 1613 encode_putfh(&xdr, args->fh, &hdr);
1701 if (status) 1614 encode_open_downgrade(&xdr, args, &hdr);
1702 goto out; 1615 encode_getfattr(&xdr, args->bitmask, &hdr);
1703 status = encode_open_downgrade(&xdr, args); 1616 encode_nops(&hdr);
1704 if (status != 0) 1617 return 0;
1705 goto out;
1706 status = encode_getfattr(&xdr, args->bitmask);
1707out:
1708 return status;
1709} 1618}
1710 1619
1711/* 1620/*
@@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1715{ 1624{
1716 struct xdr_stream xdr; 1625 struct xdr_stream xdr;
1717 struct compound_hdr hdr = { 1626 struct compound_hdr hdr = {
1718 .nops = 2, 1627 .nops = 0,
1719 }; 1628 };
1720 int status;
1721 1629
1722 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1630 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1723 encode_compound_hdr(&xdr, &hdr); 1631 encode_compound_hdr(&xdr, &hdr);
1724 status = encode_putfh(&xdr, args->fh); 1632 encode_putfh(&xdr, args->fh, &hdr);
1725 if(status) 1633 encode_lock(&xdr, args, &hdr);
1726 goto out; 1634 encode_nops(&hdr);
1727 status = encode_lock(&xdr, args); 1635 return 0;
1728out:
1729 return status;
1730} 1636}
1731 1637
1732/* 1638/*
@@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1736{ 1642{
1737 struct xdr_stream xdr; 1643 struct xdr_stream xdr;
1738 struct compound_hdr hdr = { 1644 struct compound_hdr hdr = {
1739 .nops = 2, 1645 .nops = 0,
1740 }; 1646 };
1741 int status;
1742 1647
1743 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1648 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1744 encode_compound_hdr(&xdr, &hdr); 1649 encode_compound_hdr(&xdr, &hdr);
1745 status = encode_putfh(&xdr, args->fh); 1650 encode_putfh(&xdr, args->fh, &hdr);
1746 if(status) 1651 encode_lockt(&xdr, args, &hdr);
1747 goto out; 1652 encode_nops(&hdr);
1748 status = encode_lockt(&xdr, args); 1653 return 0;
1749out:
1750 return status;
1751} 1654}
1752 1655
1753/* 1656/*
@@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1757{ 1660{
1758 struct xdr_stream xdr; 1661 struct xdr_stream xdr;
1759 struct compound_hdr hdr = { 1662 struct compound_hdr hdr = {
1760 .nops = 2, 1663 .nops = 0,
1761 }; 1664 };
1762 int status;
1763 1665
1764 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1666 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1765 encode_compound_hdr(&xdr, &hdr); 1667 encode_compound_hdr(&xdr, &hdr);
1766 status = encode_putfh(&xdr, args->fh); 1668 encode_putfh(&xdr, args->fh, &hdr);
1767 if(status) 1669 encode_locku(&xdr, args, &hdr);
1768 goto out; 1670 encode_nops(&hdr);
1769 status = encode_locku(&xdr, args); 1671 return 0;
1770out:
1771 return status;
1772} 1672}
1773 1673
1774/* 1674/*
@@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1778{ 1678{
1779 struct xdr_stream xdr; 1679 struct xdr_stream xdr;
1780 struct compound_hdr hdr = { 1680 struct compound_hdr hdr = {
1781 .nops = 2, 1681 .nops = 0,
1782 }; 1682 };
1783 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1683 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1784 unsigned int replen; 1684 unsigned int replen;
1785 int status;
1786 1685
1787 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1686 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1788 encode_compound_hdr(&xdr, &hdr); 1687 encode_compound_hdr(&xdr, &hdr);
1789 status = encode_putfh(&xdr, args->fh); 1688 encode_putfh(&xdr, args->fh, &hdr);
1790 if(status) 1689 encode_readlink(&xdr, args, req, &hdr);
1791 goto out;
1792 status = encode_readlink(&xdr, args, req);
1793 1690
1794 /* set up reply kvec 1691 /* set up reply kvec
1795 * toplevel_status + taglen + rescount + OP_PUTFH + status 1692 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1798 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; 1695 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1799 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 1696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1800 args->pgbase, args->pglen); 1697 args->pgbase, args->pglen);
1801 1698 encode_nops(&hdr);
1802out: 1699 return 0;
1803 return status;
1804} 1700}
1805 1701
1806/* 1702/*
@@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1810{ 1706{
1811 struct xdr_stream xdr; 1707 struct xdr_stream xdr;
1812 struct compound_hdr hdr = { 1708 struct compound_hdr hdr = {
1813 .nops = 2, 1709 .nops = 0,
1814 }; 1710 };
1815 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1711 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1816 int replen; 1712 int replen;
1817 int status;
1818 1713
1819 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1714 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1820 encode_compound_hdr(&xdr, &hdr); 1715 encode_compound_hdr(&xdr, &hdr);
1821 status = encode_putfh(&xdr, args->fh); 1716 encode_putfh(&xdr, args->fh, &hdr);
1822 if(status) 1717 encode_readdir(&xdr, args, req, &hdr);
1823 goto out;
1824 status = encode_readdir(&xdr, args, req);
1825 1718
1826 /* set up reply kvec 1719 /* set up reply kvec
1827 * toplevel_status + taglen + rescount + OP_PUTFH + status 1720 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1833 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 1726 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1834 __func__, replen, args->pages, 1727 __func__, replen, args->pages,
1835 args->pgbase, args->count); 1728 args->pgbase, args->count);
1836 1729 encode_nops(&hdr);
1837out: 1730 return 0;
1838 return status;
1839} 1731}
1840 1732
1841/* 1733/*
@@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1846 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1738 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1847 struct xdr_stream xdr; 1739 struct xdr_stream xdr;
1848 struct compound_hdr hdr = { 1740 struct compound_hdr hdr = {
1849 .nops = 2, 1741 .nops = 0,
1850 }; 1742 };
1851 int replen, status; 1743 int replen;
1852 1744
1853 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1745 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1854 encode_compound_hdr(&xdr, &hdr); 1746 encode_compound_hdr(&xdr, &hdr);
1855 status = encode_putfh(&xdr, args->fh); 1747 encode_putfh(&xdr, args->fh, &hdr);
1856 if (status) 1748 encode_read(&xdr, args, &hdr);
1857 goto out;
1858 status = encode_read(&xdr, args);
1859 if (status)
1860 goto out;
1861 1749
1862 /* set up reply kvec 1750 /* set up reply kvec
1863 * toplevel status + taglen=0 + rescount + OP_PUTFH + status 1751 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
@@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1867 xdr_inline_pages(&req->rq_rcv_buf, replen, 1755 xdr_inline_pages(&req->rq_rcv_buf, replen,
1868 args->pages, args->pgbase, args->count); 1756 args->pages, args->pgbase, args->count);
1869 req->rq_rcv_buf.flags |= XDRBUF_READ; 1757 req->rq_rcv_buf.flags |= XDRBUF_READ;
1870out: 1758 encode_nops(&hdr);
1871 return status; 1759 return 0;
1872} 1760}
1873 1761
1874/* 1762/*
1875 * Encode an SETATTR request 1763 * Encode an SETATTR request
1876 */ 1764 */
1877static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1878
1879{ 1766{
1880 struct xdr_stream xdr; 1767 struct xdr_stream xdr;
1881 struct compound_hdr hdr = { 1768 struct compound_hdr hdr = {
1882 .nops = 3, 1769 .nops = 0,
1883 }; 1770 };
1884 int status; 1771
1885 1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1886 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1773 encode_compound_hdr(&xdr, &hdr);
1887 encode_compound_hdr(&xdr, &hdr); 1774 encode_putfh(&xdr, args->fh, &hdr);
1888 status = encode_putfh(&xdr, args->fh); 1775 encode_setattr(&xdr, args, args->server, &hdr);
1889 if(status) 1776 encode_getfattr(&xdr, args->bitmask, &hdr);
1890 goto out; 1777 encode_nops(&hdr);
1891 status = encode_setattr(&xdr, args, args->server); 1778 return 0;
1892 if(status)
1893 goto out;
1894 status = encode_getfattr(&xdr, args->bitmask);
1895out:
1896 return status;
1897} 1779}
1898 1780
1899/* 1781/*
@@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1906 struct xdr_stream xdr; 1788 struct xdr_stream xdr;
1907 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1789 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1908 struct compound_hdr hdr = { 1790 struct compound_hdr hdr = {
1909 .nops = 2, 1791 .nops = 0,
1910 }; 1792 };
1911 int replen, status; 1793 int replen;
1912 1794
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1795 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1914 encode_compound_hdr(&xdr, &hdr); 1796 encode_compound_hdr(&xdr, &hdr);
1915 status = encode_putfh(&xdr, args->fh); 1797 encode_putfh(&xdr, args->fh, &hdr);
1916 if (status) 1798 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
1917 goto out; 1799
1918 status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
1919 /* set up reply buffer: */ 1800 /* set up reply buffer: */
1920 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; 1801 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
1921 xdr_inline_pages(&req->rq_rcv_buf, replen, 1802 xdr_inline_pages(&req->rq_rcv_buf, replen,
1922 args->acl_pages, args->acl_pgbase, args->acl_len); 1803 args->acl_pages, args->acl_pgbase, args->acl_len);
1923out: 1804 encode_nops(&hdr);
1924 return status; 1805 return 0;
1925} 1806}
1926 1807
1927/* 1808/*
@@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
1931{ 1812{
1932 struct xdr_stream xdr; 1813 struct xdr_stream xdr;
1933 struct compound_hdr hdr = { 1814 struct compound_hdr hdr = {
1934 .nops = 3, 1815 .nops = 0,
1935 }; 1816 };
1936 int status;
1937 1817
1938 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1818 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1939 encode_compound_hdr(&xdr, &hdr); 1819 encode_compound_hdr(&xdr, &hdr);
1940 status = encode_putfh(&xdr, args->fh); 1820 encode_putfh(&xdr, args->fh, &hdr);
1941 if (status) 1821 encode_write(&xdr, args, &hdr);
1942 goto out;
1943 status = encode_write(&xdr, args);
1944 if (status)
1945 goto out;
1946 req->rq_snd_buf.flags |= XDRBUF_WRITE; 1822 req->rq_snd_buf.flags |= XDRBUF_WRITE;
1947 status = encode_getfattr(&xdr, args->bitmask); 1823 encode_getfattr(&xdr, args->bitmask, &hdr);
1948out: 1824 encode_nops(&hdr);
1949 return status; 1825 return 0;
1950} 1826}
1951 1827
1952/* 1828/*
@@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
1956{ 1832{
1957 struct xdr_stream xdr; 1833 struct xdr_stream xdr;
1958 struct compound_hdr hdr = { 1834 struct compound_hdr hdr = {
1959 .nops = 3, 1835 .nops = 0,
1960 }; 1836 };
1961 int status;
1962 1837
1963 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1838 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1964 encode_compound_hdr(&xdr, &hdr); 1839 encode_compound_hdr(&xdr, &hdr);
1965 status = encode_putfh(&xdr, args->fh); 1840 encode_putfh(&xdr, args->fh, &hdr);
1966 if (status) 1841 encode_commit(&xdr, args, &hdr);
1967 goto out; 1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1968 status = encode_commit(&xdr, args); 1843 encode_nops(&hdr);
1969 if (status) 1844 return 0;
1970 goto out;
1971 status = encode_getfattr(&xdr, args->bitmask);
1972out:
1973 return status;
1974} 1845}
1975 1846
1976/* 1847/*
@@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
1980{ 1851{
1981 struct xdr_stream xdr; 1852 struct xdr_stream xdr;
1982 struct compound_hdr hdr = { 1853 struct compound_hdr hdr = {
1983 .nops = 2, 1854 .nops = 0,
1984 }; 1855 };
1985 int status;
1986 1856
1987 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1988 encode_compound_hdr(&xdr, &hdr); 1858 encode_compound_hdr(&xdr, &hdr);
1989 status = encode_putfh(&xdr, args->fh); 1859 encode_putfh(&xdr, args->fh, &hdr);
1990 if (!status) 1860 encode_fsinfo(&xdr, args->bitmask, &hdr);
1991 status = encode_fsinfo(&xdr, args->bitmask); 1861 encode_nops(&hdr);
1992 return status; 1862 return 0;
1993} 1863}
1994 1864
1995/* 1865/*
@@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
1999{ 1869{
2000 struct xdr_stream xdr; 1870 struct xdr_stream xdr;
2001 struct compound_hdr hdr = { 1871 struct compound_hdr hdr = {
2002 .nops = 2, 1872 .nops = 0,
2003 }; 1873 };
2004 int status;
2005 1874
2006 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1875 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2007 encode_compound_hdr(&xdr, &hdr); 1876 encode_compound_hdr(&xdr, &hdr);
2008 status = encode_putfh(&xdr, args->fh); 1877 encode_putfh(&xdr, args->fh, &hdr);
2009 if (!status) 1878 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2010 status = encode_getattr_one(&xdr, 1879 &hdr);
2011 args->bitmask[0] & nfs4_pathconf_bitmap[0]); 1880 encode_nops(&hdr);
2012 return status; 1881 return 0;
2013} 1882}
2014 1883
2015/* 1884/*
@@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
2019{ 1888{
2020 struct xdr_stream xdr; 1889 struct xdr_stream xdr;
2021 struct compound_hdr hdr = { 1890 struct compound_hdr hdr = {
2022 .nops = 2, 1891 .nops = 0,
2023 }; 1892 };
2024 int status;
2025 1893
2026 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1894 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2027 encode_compound_hdr(&xdr, &hdr); 1895 encode_compound_hdr(&xdr, &hdr);
2028 status = encode_putfh(&xdr, args->fh); 1896 encode_putfh(&xdr, args->fh, &hdr);
2029 if (status == 0) 1897 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2030 status = encode_getattr_two(&xdr, 1898 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2031 args->bitmask[0] & nfs4_statfs_bitmap[0], 1899 encode_nops(&hdr);
2032 args->bitmask[1] & nfs4_statfs_bitmap[1]); 1900 return 0;
2033 return status;
2034} 1901}
2035 1902
2036/* 1903/*
@@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
2040{ 1907{
2041 struct xdr_stream xdr; 1908 struct xdr_stream xdr;
2042 struct compound_hdr hdr = { 1909 struct compound_hdr hdr = {
2043 .nops = 2, 1910 .nops = 0,
2044 }; 1911 };
2045 int status;
2046 1912
2047 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2048 encode_compound_hdr(&xdr, &hdr); 1914 encode_compound_hdr(&xdr, &hdr);
2049 status = encode_putfh(&xdr, fhandle); 1915 encode_putfh(&xdr, fhandle, &hdr);
2050 if (status == 0) 1916 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2051 status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 1917 FATTR4_WORD0_LINK_SUPPORT|
2052 FATTR4_WORD0_LINK_SUPPORT| 1918 FATTR4_WORD0_SYMLINK_SUPPORT|
2053 FATTR4_WORD0_SYMLINK_SUPPORT| 1919 FATTR4_WORD0_ACLSUPPORT, &hdr);
2054 FATTR4_WORD0_ACLSUPPORT); 1920 encode_nops(&hdr);
2055 return status; 1921 return 0;
2056} 1922}
2057 1923
2058/* 1924/*
@@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
2062{ 1928{
2063 struct xdr_stream xdr; 1929 struct xdr_stream xdr;
2064 struct compound_hdr hdr = { 1930 struct compound_hdr hdr = {
2065 .nops = 1, 1931 .nops = 0,
2066 }; 1932 };
2067 1933
2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1934 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2069 encode_compound_hdr(&xdr, &hdr); 1935 encode_compound_hdr(&xdr, &hdr);
2070 return encode_renew(&xdr, clp); 1936 encode_renew(&xdr, clp, &hdr);
1937 encode_nops(&hdr);
1938 return 0;
2071} 1939}
2072 1940
2073/* 1941/*
@@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2077{ 1945{
2078 struct xdr_stream xdr; 1946 struct xdr_stream xdr;
2079 struct compound_hdr hdr = { 1947 struct compound_hdr hdr = {
2080 .nops = 1, 1948 .nops = 0,
2081 }; 1949 };
2082 1950
2083 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1951 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2084 encode_compound_hdr(&xdr, &hdr); 1952 encode_compound_hdr(&xdr, &hdr);
2085 return encode_setclientid(&xdr, sc); 1953 encode_setclientid(&xdr, sc, &hdr);
1954 encode_nops(&hdr);
1955 return 0;
2086} 1956}
2087 1957
2088/* 1958/*
@@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2092{ 1962{
2093 struct xdr_stream xdr; 1963 struct xdr_stream xdr;
2094 struct compound_hdr hdr = { 1964 struct compound_hdr hdr = {
2095 .nops = 3, 1965 .nops = 0,
2096 }; 1966 };
2097 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 1967 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2098 int status;
2099 1968
2100 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2101 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, &hdr);
2102 status = encode_setclientid_confirm(&xdr, clp); 1971 encode_setclientid_confirm(&xdr, clp, &hdr);
2103 if (!status) 1972 encode_putrootfh(&xdr, &hdr);
2104 status = encode_putrootfh(&xdr); 1973 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2105 if (!status) 1974 encode_nops(&hdr);
2106 status = encode_fsinfo(&xdr, lease_bitmap); 1975 return 0;
2107 return status;
2108} 1976}
2109 1977
2110/* 1978/*
@@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
2114{ 1982{
2115 struct xdr_stream xdr; 1983 struct xdr_stream xdr;
2116 struct compound_hdr hdr = { 1984 struct compound_hdr hdr = {
2117 .nops = 3, 1985 .nops = 0,
2118 }; 1986 };
2119 int status;
2120 1987
2121 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1988 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2122 encode_compound_hdr(&xdr, &hdr); 1989 encode_compound_hdr(&xdr, &hdr);
2123 status = encode_putfh(&xdr, args->fhandle); 1990 encode_putfh(&xdr, args->fhandle, &hdr);
2124 if (status != 0) 1991 encode_delegreturn(&xdr, args->stateid, &hdr);
2125 goto out; 1992 encode_getfattr(&xdr, args->bitmask, &hdr);
2126 status = encode_delegreturn(&xdr, args->stateid); 1993 encode_nops(&hdr);
2127 if (status != 0) 1994 return 0;
2128 goto out;
2129 status = encode_getfattr(&xdr, args->bitmask);
2130out:
2131 return status;
2132} 1995}
2133 1996
2134/* 1997/*
@@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2138{ 2001{
2139 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2140 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
2141 .nops = 3, 2004 .nops = 0,
2142 }; 2005 };
2143 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 2006 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
2144 int replen; 2007 int replen;
2145 int status;
2146 2008
2147 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2009 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2148 encode_compound_hdr(&xdr, &hdr); 2010 encode_compound_hdr(&xdr, &hdr);
2149 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 2011 encode_putfh(&xdr, args->dir_fh, &hdr);
2150 goto out; 2012 encode_lookup(&xdr, args->name, &hdr);
2151 if ((status = encode_lookup(&xdr, args->name)) != 0) 2013 encode_fs_locations(&xdr, args->bitmask, &hdr);
2152 goto out; 2014
2153 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2154 goto out;
2155 /* set up reply 2015 /* set up reply
2156 * toplevel_status + OP_PUTFH + status 2016 * toplevel_status + OP_PUTFH + status
2157 * + OP_LOOKUP + status + OP_GETATTR + status = 7 2017 * + OP_LOOKUP + status + OP_GETATTR + status = 7
@@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2159 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; 2019 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2160 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, 2020 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2161 0, PAGE_SIZE); 2021 0, PAGE_SIZE);
2162out: 2022 encode_nops(&hdr);
2163 return status; 2023 return 0;
2164} 2024}
2165 2025
2166/* 2026/*
@@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2217 READ_BUF(8); 2077 READ_BUF(8);
2218 READ32(hdr->status); 2078 READ32(hdr->status);
2219 READ32(hdr->taglen); 2079 READ32(hdr->taglen);
2220 2080
2221 READ_BUF(hdr->taglen + 4); 2081 READ_BUF(hdr->taglen + 4);
2222 hdr->tag = (char *)p; 2082 hdr->tag = (char *)p;
2223 p += XDR_QUADLEN(hdr->taglen); 2083 p += XDR_QUADLEN(hdr->taglen);
2224 READ32(hdr->nops); 2084 READ32(hdr->nops);
2085 if (unlikely(hdr->nops < 1))
2086 return nfs4_stat_to_errno(hdr->status);
2225 return 0; 2087 return 0;
2226} 2088}
2227 2089
@@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3047static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 2909static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3048{ 2910{
3049 __be32 *savep; 2911 __be32 *savep;
3050 uint32_t attrlen, 2912 uint32_t attrlen, bitmap[2] = {0};
3051 bitmap[2] = {0};
3052 int status; 2913 int status;
3053 2914
3054 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2915 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3070,14 +2931,13 @@ xdr_error:
3070 dprintk("%s: xdr returned %d!\n", __func__, -status); 2931 dprintk("%s: xdr returned %d!\n", __func__, -status);
3071 return status; 2932 return status;
3072} 2933}
3073 2934
3074static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 2935static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
3075{ 2936{
3076 __be32 *savep; 2937 __be32 *savep;
3077 uint32_t attrlen, 2938 uint32_t attrlen, bitmap[2] = {0};
3078 bitmap[2] = {0};
3079 int status; 2939 int status;
3080 2940
3081 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2941 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3082 goto xdr_error; 2942 goto xdr_error;
3083 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2943 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3107,10 +2967,9 @@ xdr_error:
3107static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 2967static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
3108{ 2968{
3109 __be32 *savep; 2969 __be32 *savep;
3110 uint32_t attrlen, 2970 uint32_t attrlen, bitmap[2] = {0};
3111 bitmap[2] = {0};
3112 int status; 2971 int status;
3113 2972
3114 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2973 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3115 goto xdr_error; 2974 goto xdr_error;
3116 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2975 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3256static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3115static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3257{ 3116{
3258 int status; 3117 int status;
3259 3118
3260 status = decode_op_hdr(xdr, OP_LINK); 3119 status = decode_op_hdr(xdr, OP_LINK);
3261 if (status) 3120 if (status)
3262 return status; 3121 return status;
@@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr)
3344/* This is too sick! */ 3203/* This is too sick! */
3345static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) 3204static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3346{ 3205{
3347 __be32 *p; 3206 __be32 *p;
3348 uint32_t limit_type, nblocks, blocksize; 3207 uint32_t limit_type, nblocks, blocksize;
3349 3208
3350 READ_BUF(12); 3209 READ_BUF(12);
3351 READ32(limit_type); 3210 READ32(limit_type);
3352 switch (limit_type) { 3211 switch (limit_type) {
3353 case 1: 3212 case 1:
3354 READ64(*maxsize); 3213 READ64(*maxsize);
3355 break; 3214 break;
3356 case 2: 3215 case 2:
3357 READ32(nblocks); 3216 READ32(nblocks);
3358 READ32(blocksize); 3217 READ32(blocksize);
3359 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3218 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3360 } 3219 }
3361 return 0; 3220 return 0;
3362} 3221}
3363 3222
3364static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3223static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3365{ 3224{
3366 __be32 *p; 3225 __be32 *p;
3367 uint32_t delegation_type; 3226 uint32_t delegation_type;
3368 3227
3369 READ_BUF(4); 3228 READ_BUF(4);
3370 READ32(delegation_type); 3229 READ32(delegation_type);
@@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3375 READ_BUF(NFS4_STATEID_SIZE+4); 3234 READ_BUF(NFS4_STATEID_SIZE+4);
3376 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3235 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
3377 READ32(res->do_recall); 3236 READ32(res->do_recall);
3237
3378 switch (delegation_type) { 3238 switch (delegation_type) {
3379 case NFS4_OPEN_DELEGATE_READ: 3239 case NFS4_OPEN_DELEGATE_READ:
3380 res->delegation_type = FMODE_READ; 3240 res->delegation_type = FMODE_READ;
3381 break; 3241 break;
3382 case NFS4_OPEN_DELEGATE_WRITE: 3242 case NFS4_OPEN_DELEGATE_WRITE:
3383 res->delegation_type = FMODE_WRITE|FMODE_READ; 3243 res->delegation_type = FMODE_WRITE|FMODE_READ;
3384 if (decode_space_limit(xdr, &res->maxsize) < 0) 3244 if (decode_space_limit(xdr, &res->maxsize) < 0)
3385 return -EIO; 3245 return -EIO;
3386 } 3246 }
3387 return decode_ace(xdr, NULL, res->server->nfs_client); 3247 return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3389 3249
3390static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3250static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3391{ 3251{
3392 __be32 *p; 3252 __be32 *p;
3393 uint32_t savewords, bmlen, i; 3253 uint32_t savewords, bmlen, i;
3394 int status; 3254 int status;
3395 3255
3396 status = decode_op_hdr(xdr, OP_OPEN); 3256 status = decode_op_hdr(xdr, OP_OPEN);
3397 if (status != -EIO) 3257 if (status != -EIO)
3398 nfs_increment_open_seqid(status, res->seqid); 3258 nfs_increment_open_seqid(status, res->seqid);
3399 if (status) 3259 if (status)
3400 return status; 3260 return status;
3401 READ_BUF(NFS4_STATEID_SIZE); 3261 READ_BUF(NFS4_STATEID_SIZE);
3402 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3262 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3403 3263
3404 decode_change_info(xdr, &res->cinfo); 3264 decode_change_info(xdr, &res->cinfo);
3405 3265
3406 READ_BUF(8); 3266 READ_BUF(8);
3407 READ32(res->rflags); 3267 READ32(res->rflags);
3408 READ32(bmlen); 3268 READ32(bmlen);
3409 if (bmlen > 10) 3269 if (bmlen > 10)
3410 goto xdr_error; 3270 goto xdr_error;
3411 3271
3412 READ_BUF(bmlen << 2); 3272 READ_BUF(bmlen << 2);
3413 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3273 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3414 for (i = 0; i < savewords; ++i) 3274 for (i = 0; i < savewords; ++i)
3415 READ32(res->attrset[i]); 3275 READ32(res->attrset[i]);
@@ -3424,17 +3284,17 @@ xdr_error:
3424 3284
3425static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 3285static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3426{ 3286{
3427 __be32 *p; 3287 __be32 *p;
3428 int status; 3288 int status;
3429 3289
3430 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 3290 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3431 if (status != -EIO) 3291 if (status != -EIO)
3432 nfs_increment_open_seqid(status, res->seqid); 3292 nfs_increment_open_seqid(status, res->seqid);
3433 if (status) 3293 if (status)
3434 return status; 3294 return status;
3435 READ_BUF(NFS4_STATEID_SIZE); 3295 READ_BUF(NFS4_STATEID_SIZE);
3436 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3296 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3437 return 0; 3297 return 0;
3438} 3298}
3439 3299
3440static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 3300static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3562 dprintk("NFS: readdir reply truncated!\n"); 3422 dprintk("NFS: readdir reply truncated!\n");
3563 entry[1] = 1; 3423 entry[1] = 1;
3564 } 3424 }
3565out: 3425out:
3566 kunmap_atomic(kaddr, KM_USER0); 3426 kunmap_atomic(kaddr, KM_USER0);
3567 return 0; 3427 return 0;
3568short_pkt: 3428short_pkt:
@@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
3718 uint32_t bmlen; 3578 uint32_t bmlen;
3719 int status; 3579 int status;
3720 3580
3721
3722 status = decode_op_hdr(xdr, OP_SETATTR); 3581 status = decode_op_hdr(xdr, OP_SETATTR);
3723 if (status) 3582 if (status)
3724 return status; 3583 return status;
@@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3738 READ32(opnum); 3597 READ32(opnum);
3739 if (opnum != OP_SETCLIENTID) { 3598 if (opnum != OP_SETCLIENTID) {
3740 dprintk("nfs: decode_setclientid: Server returned operation" 3599 dprintk("nfs: decode_setclientid: Server returned operation"
3741 " %d\n", opnum); 3600 " %d\n", opnum);
3742 return -EIO; 3601 return -EIO;
3743 } 3602 }
3744 READ32(nfserr); 3603 READ32(nfserr);
@@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr)
3792} 3651}
3793 3652
3794/* 3653/*
3654 * END OF "GENERIC" DECODE ROUTINES.
3655 */
3656
3657/*
3795 * Decode OPEN_DOWNGRADE response 3658 * Decode OPEN_DOWNGRADE response
3796 */ 3659 */
3797static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3660static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
3798{ 3661{
3799 struct xdr_stream xdr; 3662 struct xdr_stream xdr;
3800 struct compound_hdr hdr; 3663 struct compound_hdr hdr;
3801 int status; 3664 int status;
3802 3665
3803 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3666 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3804 status = decode_compound_hdr(&xdr, &hdr); 3667 status = decode_compound_hdr(&xdr, &hdr);
3805 if (status) 3668 if (status)
3806 goto out; 3669 goto out;
3807 status = decode_putfh(&xdr); 3670 status = decode_putfh(&xdr);
3808 if (status) 3671 if (status)
3809 goto out; 3672 goto out;
3810 status = decode_open_downgrade(&xdr, res); 3673 status = decode_open_downgrade(&xdr, res);
3811 if (status != 0) 3674 if (status != 0)
3812 goto out; 3675 goto out;
3813 decode_getfattr(&xdr, res->fattr, res->server); 3676 decode_getfattr(&xdr, res->fattr, res->server);
3814out: 3677out:
3815 return status; 3678 return status;
3816} 3679}
3817 3680
3818/* 3681/*
3819 * END OF "GENERIC" DECODE ROUTINES.
3820 */
3821
3822/*
3823 * Decode ACCESS response 3682 * Decode ACCESS response
3824 */ 3683 */
3825static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) 3684static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
@@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
3827 struct xdr_stream xdr; 3686 struct xdr_stream xdr;
3828 struct compound_hdr hdr; 3687 struct compound_hdr hdr;
3829 int status; 3688 int status;
3830 3689
3831 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3690 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3832 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3691 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3833 goto out; 3692 goto out;
@@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
3850 struct xdr_stream xdr; 3709 struct xdr_stream xdr;
3851 struct compound_hdr hdr; 3710 struct compound_hdr hdr;
3852 int status; 3711 int status;
3853 3712
3854 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3713 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3855 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3714 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3856 goto out; 3715 goto out;
@@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
3873 struct xdr_stream xdr; 3732 struct xdr_stream xdr;
3874 struct compound_hdr hdr; 3733 struct compound_hdr hdr;
3875 int status; 3734 int status;
3876 3735
3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3736 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3878 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3737 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3879 goto out; 3738 goto out;
@@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
3893 struct xdr_stream xdr; 3752 struct xdr_stream xdr;
3894 struct compound_hdr hdr; 3753 struct compound_hdr hdr;
3895 int status; 3754 int status;
3896 3755
3897 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3756 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3898 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3757 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3899 goto out; 3758 goto out;
@@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
3914 struct xdr_stream xdr; 3773 struct xdr_stream xdr;
3915 struct compound_hdr hdr; 3774 struct compound_hdr hdr;
3916 int status; 3775 int status;
3917 3776
3918 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3777 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3919 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3778 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3920 goto out; 3779 goto out;
@@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
3944 struct xdr_stream xdr; 3803 struct xdr_stream xdr;
3945 struct compound_hdr hdr; 3804 struct compound_hdr hdr;
3946 int status; 3805 int status;
3947 3806
3948 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3807 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3949 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3808 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3950 goto out; 3809 goto out;
@@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
3977 struct xdr_stream xdr; 3836 struct xdr_stream xdr;
3978 struct compound_hdr hdr; 3837 struct compound_hdr hdr;
3979 int status; 3838 int status;
3980 3839
3981 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3840 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3982 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3841 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3983 goto out; 3842 goto out;
@@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4014 struct xdr_stream xdr; 3873 struct xdr_stream xdr;
4015 struct compound_hdr hdr; 3874 struct compound_hdr hdr;
4016 int status; 3875 int status;
4017 3876
4018 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4019 status = decode_compound_hdr(&xdr, &hdr); 3878 status = decode_compound_hdr(&xdr, &hdr);
4020 if (status) 3879 if (status)
@@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4025 status = decode_getfattr(&xdr, res->fattr, res->server); 3884 status = decode_getfattr(&xdr, res->fattr, res->server);
4026out: 3885out:
4027 return status; 3886 return status;
4028
4029} 3887}
4030 3888
4031/* 3889/*
@@ -4034,21 +3892,20 @@ out:
4034static int 3892static int
4035nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 3893nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
4036{ 3894{
4037 struct xdr_stream xdr; 3895 struct xdr_stream xdr;
4038 struct compound_hdr hdr = { 3896 struct compound_hdr hdr = {
4039 .nops = 2, 3897 .nops = 0,
4040 }; 3898 };
4041 int status; 3899 int status;
4042 3900
4043 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 3901 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
4044 encode_compound_hdr(&xdr, &hdr); 3902 encode_compound_hdr(&xdr, &hdr);
4045 status = encode_putfh(&xdr, args->fh); 3903 encode_putfh(&xdr, args->fh, &hdr);
4046 if (status) 3904 status = encode_setacl(&xdr, args, &hdr);
4047 goto out; 3905 encode_nops(&hdr);
4048 status = encode_setacl(&xdr, args); 3906 return status;
4049out:
4050 return status;
4051} 3907}
3908
4052/* 3909/*
4053 * Decode SETACL response 3910 * Decode SETACL response
4054 */ 3911 */
@@ -4099,18 +3956,18 @@ out:
4099 */ 3956 */
4100static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3957static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
4101{ 3958{
4102 struct xdr_stream xdr; 3959 struct xdr_stream xdr;
4103 struct compound_hdr hdr; 3960 struct compound_hdr hdr;
4104 int status; 3961 int status;
4105 3962
4106 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3963 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4107 status = decode_compound_hdr(&xdr, &hdr); 3964 status = decode_compound_hdr(&xdr, &hdr);
4108 if (status) 3965 if (status)
4109 goto out; 3966 goto out;
4110 status = decode_putfh(&xdr); 3967 status = decode_putfh(&xdr);
4111 if (status) 3968 if (status)
4112 goto out; 3969 goto out;
4113 status = decode_close(&xdr, res); 3970 status = decode_close(&xdr, res);
4114 if (status != 0) 3971 if (status != 0)
4115 goto out; 3972 goto out;
4116 /* 3973 /*
@@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4121 */ 3978 */
4122 decode_getfattr(&xdr, res->fattr, res->server); 3979 decode_getfattr(&xdr, res->fattr, res->server);
4123out: 3980out:
4124 return status; 3981 return status;
4125} 3982}
4126 3983
4127/* 3984/*
@@ -4129,23 +3986,23 @@ out:
4129 */ 3986 */
4130static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 3987static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4131{ 3988{
4132 struct xdr_stream xdr; 3989 struct xdr_stream xdr;
4133 struct compound_hdr hdr; 3990 struct compound_hdr hdr;
4134 int status; 3991 int status;
4135 3992
4136 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3993 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4137 status = decode_compound_hdr(&xdr, &hdr); 3994 status = decode_compound_hdr(&xdr, &hdr);
4138 if (status) 3995 if (status)
4139 goto out; 3996 goto out;
4140 status = decode_putfh(&xdr); 3997 status = decode_putfh(&xdr);
4141 if (status) 3998 if (status)
4142 goto out; 3999 goto out;
4143 status = decode_savefh(&xdr); 4000 status = decode_savefh(&xdr);
4001 if (status)
4002 goto out;
4003 status = decode_open(&xdr, res);
4144 if (status) 4004 if (status)
4145 goto out; 4005 goto out;
4146 status = decode_open(&xdr, res);
4147 if (status)
4148 goto out;
4149 if (decode_getfh(&xdr, &res->fh) != 0) 4006 if (decode_getfh(&xdr, &res->fh) != 0)
4150 goto out; 4007 goto out;
4151 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 4008 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
@@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4154 goto out; 4011 goto out;
4155 decode_getfattr(&xdr, res->dir_attr, res->server); 4012 decode_getfattr(&xdr, res->dir_attr, res->server);
4156out: 4013out:
4157 return status; 4014 return status;
4158} 4015}
4159 4016
4160/* 4017/*
@@ -4162,20 +4019,20 @@ out:
4162 */ 4019 */
4163static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 4020static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
4164{ 4021{
4165 struct xdr_stream xdr; 4022 struct xdr_stream xdr;
4166 struct compound_hdr hdr; 4023 struct compound_hdr hdr;
4167 int status; 4024 int status;
4168 4025
4169 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4026 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4170 status = decode_compound_hdr(&xdr, &hdr); 4027 status = decode_compound_hdr(&xdr, &hdr);
4171 if (status) 4028 if (status)
4172 goto out; 4029 goto out;
4173 status = decode_putfh(&xdr); 4030 status = decode_putfh(&xdr);
4174 if (status) 4031 if (status)
4175 goto out; 4032 goto out;
4176 status = decode_open_confirm(&xdr, res); 4033 status = decode_open_confirm(&xdr, res);
4177out: 4034out:
4178 return status; 4035 return status;
4179} 4036}
4180 4037
4181/* 4038/*
@@ -4183,23 +4040,23 @@ out:
4183 */ 4040 */
4184static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 4041static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4185{ 4042{
4186 struct xdr_stream xdr; 4043 struct xdr_stream xdr;
4187 struct compound_hdr hdr; 4044 struct compound_hdr hdr;
4188 int status; 4045 int status;
4189 4046
4190 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4047 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4191 status = decode_compound_hdr(&xdr, &hdr); 4048 status = decode_compound_hdr(&xdr, &hdr);
4192 if (status) 4049 if (status)
4193 goto out; 4050 goto out;
4194 status = decode_putfh(&xdr); 4051 status = decode_putfh(&xdr);
4195 if (status) 4052 if (status)
4196 goto out; 4053 goto out;
4197 status = decode_open(&xdr, res); 4054 status = decode_open(&xdr, res);
4198 if (status) 4055 if (status)
4199 goto out; 4056 goto out;
4200 decode_getfattr(&xdr, res->f_attr, res->server); 4057 decode_getfattr(&xdr, res->f_attr, res->server);
4201out: 4058out:
4202 return status; 4059 return status;
4203} 4060}
4204 4061
4205/* 4062/*
@@ -4207,25 +4064,25 @@ out:
4207 */ 4064 */
4208static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 4065static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
4209{ 4066{
4210 struct xdr_stream xdr; 4067 struct xdr_stream xdr;
4211 struct compound_hdr hdr; 4068 struct compound_hdr hdr;
4212 int status; 4069 int status;
4213 4070
4214 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4071 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4215 status = decode_compound_hdr(&xdr, &hdr); 4072 status = decode_compound_hdr(&xdr, &hdr);
4216 if (status) 4073 if (status)
4217 goto out; 4074 goto out;
4218 status = decode_putfh(&xdr); 4075 status = decode_putfh(&xdr);
4219 if (status) 4076 if (status)
4220 goto out; 4077 goto out;
4221 status = decode_setattr(&xdr, res); 4078 status = decode_setattr(&xdr, res);
4222 if (status) 4079 if (status)
4223 goto out; 4080 goto out;
4224 status = decode_getfattr(&xdr, res->fattr, res->server); 4081 status = decode_getfattr(&xdr, res->fattr, res->server);
4225 if (status == NFS4ERR_DELAY) 4082 if (status == NFS4ERR_DELAY)
4226 status = 0; 4083 status = 0;
4227out: 4084out:
4228 return status; 4085 return status;
4229} 4086}
4230 4087
4231/* 4088/*
@@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
4421 status = decode_putfh(&xdr); 4278 status = decode_putfh(&xdr);
4422 if (!status) 4279 if (!status)
4423 status = decode_fsinfo(&xdr, fsinfo); 4280 status = decode_fsinfo(&xdr, fsinfo);
4424 if (!status)
4425 status = nfs4_stat_to_errno(hdr.status);
4426 return status; 4281 return status;
4427} 4282}
4428 4283
@@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
4511 status = decode_compound_hdr(&xdr, &hdr); 4366 status = decode_compound_hdr(&xdr, &hdr);
4512 if (!status) 4367 if (!status)
4513 status = decode_setclientid(&xdr, clp); 4368 status = decode_setclientid(&xdr, clp);
4514 if (!status)
4515 status = nfs4_stat_to_errno(hdr.status);
4516 return status; 4369 return status;
4517} 4370}
4518 4371
@@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
4533 status = decode_putrootfh(&xdr); 4386 status = decode_putrootfh(&xdr);
4534 if (!status) 4387 if (!status)
4535 status = decode_fsinfo(&xdr, fsinfo); 4388 status = decode_fsinfo(&xdr, fsinfo);
4536 if (!status)
4537 status = nfs4_stat_to_errno(hdr.status);
4538 return status; 4389 return status;
4539} 4390}
4540 4391
@@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat)
4715 .p_replen = NFS4_##restype##_sz, \ 4566 .p_replen = NFS4_##restype##_sz, \
4716 .p_statidx = NFSPROC4_CLNT_##proc, \ 4567 .p_statidx = NFSPROC4_CLNT_##proc, \
4717 .p_name = #proc, \ 4568 .p_name = #proc, \
4718 } 4569}
4719 4570
4720struct rpc_procinfo nfs4_procedures[] = { 4571struct rpc_procinfo nfs4_procedures[] = {
4721 PROC(READ, enc_read, dec_read), 4572 PROC(READ, enc_read, dec_read),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 8478fc25dae..d9ef602fbc5 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
86#include <net/ipconfig.h> 86#include <net/ipconfig.h>
87#include <linux/parser.h> 87#include <linux/parser.h>
88 88
89#include "internal.h"
90
89/* Define this to allow debugging output */ 91/* Define this to allow debugging output */
90#undef NFSROOT_DEBUG 92#undef NFSROOT_DEBUG
91#define NFSDBG_FACILITY NFSDBG_ROOT 93#define NFSDBG_FACILITY NFSDBG_ROOT
@@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = "";
100static __be32 servaddr __initdata = 0; 102static __be32 servaddr __initdata = 0;
101 103
102/* Name of directory to mount */ 104/* Name of directory to mount */
103static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; 105static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
104 106
105/* NFS-related data */ 107/* NFS-related data */
106static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 108static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name)
312 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); 314 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
313 return -1; 315 return -1;
314 } 316 }
315 sprintf(nfs_path, buf, cp); 317 sprintf(nfs_export_path, buf, cp);
316 318
317 return 1; 319 return 1;
318} 320}
@@ -329,7 +331,7 @@ static int __init root_nfs_addr(void)
329 } 331 }
330 332
331 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname), 333 snprintf(nfs_data.hostname, sizeof(nfs_data.hostname),
332 "%u.%u.%u.%u", NIPQUAD(servaddr)); 334 "%pI4", &servaddr);
333 return 0; 335 return 0;
334} 336}
335 337
@@ -340,7 +342,7 @@ static int __init root_nfs_addr(void)
340static void __init root_nfs_print(void) 342static void __init root_nfs_print(void)
341{ 343{
342 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 344 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
343 nfs_path, nfs_data.hostname); 345 nfs_export_path, nfs_data.hostname);
344 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", 346 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
345 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); 347 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
346 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", 348 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
@@ -421,8 +423,8 @@ static int __init root_nfs_getport(int program, int version, int proto)
421{ 423{
422 struct sockaddr_in sin; 424 struct sockaddr_in sin;
423 425
424 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %u.%u.%u.%u\n", 426 printk(KERN_NOTICE "Looking up port of RPC %d/%d on %pI4\n",
425 program, version, NIPQUAD(servaddr)); 427 program, version, &servaddr);
426 set_sockaddr(&sin, servaddr, 0); 428 set_sockaddr(&sin, servaddr, 0);
427 return rpcb_getport_sync(&sin, program, version, proto); 429 return rpcb_getport_sync(&sin, program, version, proto);
428} 430}
@@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void)
485{ 487{
486 struct nfs_fh fh; 488 struct nfs_fh fh;
487 struct sockaddr_in sin; 489 struct sockaddr_in sin;
490 struct nfs_mount_request request = {
491 .sap = (struct sockaddr *)&sin,
492 .salen = sizeof(sin),
493 .dirpath = nfs_export_path,
494 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
495 NFS_MNT3_VERSION : NFS_MNT_VERSION,
496 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
497 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
498 .fh = &fh,
499 };
488 int status; 500 int status;
489 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
490 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
491 int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
492 NFS_MNT3_VERSION : NFS_MNT_VERSION;
493 501
494 set_sockaddr(&sin, servaddr, htons(mount_port)); 502 set_sockaddr(&sin, servaddr, htons(mount_port));
495 status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, 503 status = nfs_mount(&request);
496 nfs_path, version, protocol, &fh);
497 if (status < 0) 504 if (status < 0)
498 printk(KERN_ERR "Root-NFS: Server returned error %d " 505 printk(KERN_ERR "Root-NFS: Server returned error %d "
499 "while mounting %s\n", status, nfs_path); 506 "while mounting %s\n", status, nfs_export_path);
500 else { 507 else {
501 nfs_data.root.size = fh.size; 508 nfs_data.root.size = fh.size;
502 memcpy(nfs_data.root.data, fh.data, fh.size); 509 memcpy(nfs_data.root.data, fh.data, fh.size);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 40d17987d0e..f856004bb7f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page)
533 unsigned int len; 533 unsigned int len;
534 int error; 534 int error;
535 535
536 error = nfs_wb_page(inode, page);
537 if (error)
538 goto out_unlock;
539 if (PageUptodate(page))
540 goto out_unlock;
541
542 len = nfs_page_length(page); 536 len = nfs_page_length(page);
543 if (len == 0) 537 if (len == 0)
544 return nfs_return_empty_page(page); 538 return nfs_return_empty_page(page);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f48db679a1c..d6686f4786d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
75 Opt_acl, Opt_noacl, 75 Opt_acl, Opt_noacl,
76 Opt_rdirplus, Opt_nordirplus, 76 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 77 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport,
78 79
79 /* Mount options that take integer arguments */ 80 /* Mount options that take integer arguments */
80 Opt_port, 81 Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
129 { Opt_nordirplus, "nordirplus" }, 130 { Opt_nordirplus, "nordirplus" },
130 { Opt_sharecache, "sharecache" }, 131 { Opt_sharecache, "sharecache" },
131 { Opt_nosharecache, "nosharecache" }, 132 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" },
132 135
133 { Opt_port, "port=%u" }, 136 { Opt_port, "port=%u" },
134 { Opt_rsize, "rsize=%u" }, 137 { Opt_rsize, "rsize=%u" },
@@ -462,14 +465,12 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
462 switch (sap->sa_family) { 465 switch (sap->sa_family) {
463 case AF_INET: { 466 case AF_INET: {
464 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 467 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
465 seq_printf(m, ",mountaddr=" NIPQUAD_FMT, 468 seq_printf(m, ",mountaddr=%pI4", &sin->sin_addr.s_addr);
466 NIPQUAD(sin->sin_addr.s_addr));
467 break; 469 break;
468 } 470 }
469 case AF_INET6: { 471 case AF_INET6: {
470 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; 472 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
471 seq_printf(m, ",mountaddr=" NIP6_FMT, 473 seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr);
472 NIP6(sin6->sin6_addr));
473 break; 474 break;
474 } 475 }
475 default: 476 default:
@@ -514,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
514 { NFS_MOUNT_NONLM, ",nolock", "" }, 515 { NFS_MOUNT_NONLM, ",nolock", "" },
515 { NFS_MOUNT_NOACL, ",noacl", "" }, 516 { NFS_MOUNT_NOACL, ",noacl", "" },
516 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, 517 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
517 { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, 518 { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
519 { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
518 { 0, NULL, NULL } 520 { 0, NULL, NULL }
519 }; 521 };
520 const struct proc_nfs_info *nfs_infop; 522 const struct proc_nfs_info *nfs_infop;
@@ -1035,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw,
1035 case Opt_nosharecache: 1037 case Opt_nosharecache:
1036 mnt->flags |= NFS_MOUNT_UNSHARED; 1038 mnt->flags |= NFS_MOUNT_UNSHARED;
1037 break; 1039 break;
1040 case Opt_resvport:
1041 mnt->flags &= ~NFS_MOUNT_NORESVPORT;
1042 break;
1043 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break;
1038 1046
1039 /* 1047 /*
1040 * options that take numeric values 1048 * options that take numeric values
@@ -1329,8 +1337,14 @@ out_security_failure:
1329static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1337static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1330 struct nfs_fh *root_fh) 1338 struct nfs_fh *root_fh)
1331{ 1339{
1332 struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; 1340 struct nfs_mount_request request = {
1333 char *hostname; 1341 .sap = (struct sockaddr *)
1342 &args->mount_server.address,
1343 .dirpath = args->nfs_server.export_path,
1344 .protocol = args->mount_server.protocol,
1345 .fh = root_fh,
1346 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1347 };
1334 int status; 1348 int status;
1335 1349
1336 if (args->mount_server.version == 0) { 1350 if (args->mount_server.version == 0) {
@@ -1339,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1339 else 1353 else
1340 args->mount_server.version = NFS_MNT_VERSION; 1354 args->mount_server.version = NFS_MNT_VERSION;
1341 } 1355 }
1356 request.version = args->mount_server.version;
1342 1357
1343 if (args->mount_server.hostname) 1358 if (args->mount_server.hostname)
1344 hostname = args->mount_server.hostname; 1359 request.hostname = args->mount_server.hostname;
1345 else 1360 else
1346 hostname = args->nfs_server.hostname; 1361 request.hostname = args->nfs_server.hostname;
1347 1362
1348 /* 1363 /*
1349 * Construct the mount server's address. 1364 * Construct the mount server's address.
1350 */ 1365 */
1351 if (args->mount_server.address.ss_family == AF_UNSPEC) { 1366 if (args->mount_server.address.ss_family == AF_UNSPEC) {
1352 memcpy(sap, &args->nfs_server.address, 1367 memcpy(request.sap, &args->nfs_server.address,
1353 args->nfs_server.addrlen); 1368 args->nfs_server.addrlen);
1354 args->mount_server.addrlen = args->nfs_server.addrlen; 1369 args->mount_server.addrlen = args->nfs_server.addrlen;
1355 } 1370 }
1371 request.salen = args->mount_server.addrlen;
1356 1372
1357 /* 1373 /*
1358 * autobind will be used if mount_server.port == 0 1374 * autobind will be used if mount_server.port == 0
1359 */ 1375 */
1360 nfs_set_port(sap, args->mount_server.port); 1376 nfs_set_port(request.sap, args->mount_server.port);
1361 1377
1362 /* 1378 /*
1363 * Now ask the mount server to map our export path 1379 * Now ask the mount server to map our export path
1364 * to a file handle. 1380 * to a file handle.
1365 */ 1381 */
1366 status = nfs_mount(sap, 1382 status = nfs_mount(&request);
1367 args->mount_server.addrlen,
1368 hostname,
1369 args->nfs_server.export_path,
1370 args->mount_server.version,
1371 args->mount_server.protocol,
1372 root_fh);
1373 if (status == 0) 1383 if (status == 0)
1374 return 0; 1384 return 0;
1375 1385
1376 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1386 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1377 hostname, status); 1387 request.hostname, status);
1378 return status; 1388 return status;
1379} 1389}
1380 1390
@@ -2421,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb)
2421{ 2431{
2422 struct nfs_server *server = NFS_SB(sb); 2432 struct nfs_server *server = NFS_SB(sb);
2423 2433
2424 nfs_return_all_delegations(sb); 2434 nfs_super_return_all_delegations(sb);
2425 kill_anon_super(sb); 2435 kill_anon_super(sb);
2426 2436
2427 nfs4_renewd_prepare_shutdown(server); 2437 nfs4_renewd_prepare_shutdown(server);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index c11f5375d7c..04133aacb1e 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,8 +29,8 @@
29 29
30MODULE_LICENSE("GPL"); 30MODULE_LICENSE("GPL");
31 31
32EXPORT_SYMBOL(nfsacl_encode); 32EXPORT_SYMBOL_GPL(nfsacl_encode);
33EXPORT_SYMBOL(nfsacl_decode); 33EXPORT_SYMBOL_GPL(nfsacl_decode);
34 34
35struct nfsacl_encode_desc { 35struct nfsacl_encode_desc {
36 struct xdr_array2_desc desc; 36 struct xdr_array2_desc desc;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index aed8145d908..b27451909df 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -10,6 +10,8 @@
10#include <linux/sunrpc/svc.h> 10#include <linux/sunrpc/svc.h>
11#include <linux/nfsd/nfsd.h> 11#include <linux/nfsd/nfsd.h>
12#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
13#include <linux/cred.h>
14#include <linux/sched.h>
13#include <linux/linkage.h> 15#include <linux/linkage.h>
14#include <linux/namei.h> 16#include <linux/namei.h>
15#include <linux/mount.h> 17#include <linux/mount.h>
@@ -36,12 +38,14 @@ static struct file *do_open(char *name, int flags)
36 return ERR_PTR(error); 38 return ERR_PTR(error);
37 39
38 if (flags == O_RDWR) 40 if (flags == O_RDWR)
39 error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); 41 error = may_open(&nd.path, MAY_READ|MAY_WRITE,
42 FMODE_READ|FMODE_WRITE);
40 else 43 else
41 error = may_open(&nd, MAY_WRITE, FMODE_WRITE); 44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
42 45
43 if (!error) 46 if (!error)
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags); 47 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
48 current_cred());
45 49
46 path_put(&nd.path); 50 path_put(&nd.path);
47 return ERR_PTR(error); 51 return ERR_PTR(error);
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 294992e9bf6..0184fe9b514 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -27,53 +27,70 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
27 27
28int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) 28int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
29{ 29{
30 struct svc_cred cred = rqstp->rq_cred; 30 struct group_info *rqgi;
31 struct group_info *gi;
32 struct cred *new;
31 int i; 33 int i;
32 int flags = nfsexp_flags(rqstp, exp); 34 int flags = nfsexp_flags(rqstp, exp);
33 int ret; 35 int ret;
34 36
37 /* discard any old override before preparing the new set */
38 revert_creds(get_cred(current->real_cred));
39 new = prepare_creds();
40 if (!new)
41 return -ENOMEM;
42
43 new->fsuid = rqstp->rq_cred.cr_uid;
44 new->fsgid = rqstp->rq_cred.cr_gid;
45
46 rqgi = rqstp->rq_cred.cr_group_info;
47
35 if (flags & NFSEXP_ALLSQUASH) { 48 if (flags & NFSEXP_ALLSQUASH) {
36 cred.cr_uid = exp->ex_anon_uid; 49 new->fsuid = exp->ex_anon_uid;
37 cred.cr_gid = exp->ex_anon_gid; 50 new->fsgid = exp->ex_anon_gid;
38 cred.cr_group_info = groups_alloc(0); 51 gi = groups_alloc(0);
39 } else if (flags & NFSEXP_ROOTSQUASH) { 52 } else if (flags & NFSEXP_ROOTSQUASH) {
40 struct group_info *gi; 53 if (!new->fsuid)
41 if (!cred.cr_uid) 54 new->fsuid = exp->ex_anon_uid;
42 cred.cr_uid = exp->ex_anon_uid; 55 if (!new->fsgid)
43 if (!cred.cr_gid) 56 new->fsgid = exp->ex_anon_gid;
44 cred.cr_gid = exp->ex_anon_gid;
45 gi = groups_alloc(cred.cr_group_info->ngroups);
46 if (gi)
47 for (i = 0; i < cred.cr_group_info->ngroups; i++) {
48 if (!GROUP_AT(cred.cr_group_info, i))
49 GROUP_AT(gi, i) = exp->ex_anon_gid;
50 else
51 GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i);
52 }
53 cred.cr_group_info = gi;
54 } else
55 get_group_info(cred.cr_group_info);
56
57 if (cred.cr_uid != (uid_t) -1)
58 current->fsuid = cred.cr_uid;
59 else
60 current->fsuid = exp->ex_anon_uid;
61 if (cred.cr_gid != (gid_t) -1)
62 current->fsgid = cred.cr_gid;
63 else
64 current->fsgid = exp->ex_anon_gid;
65 57
66 if (!cred.cr_group_info) 58 gi = groups_alloc(rqgi->ngroups);
67 return -ENOMEM; 59 if (!gi)
68 ret = set_current_groups(cred.cr_group_info); 60 goto oom;
69 put_group_info(cred.cr_group_info); 61
70 if ((cred.cr_uid)) { 62 for (i = 0; i < rqgi->ngroups; i++) {
71 current->cap_effective = 63 if (!GROUP_AT(rqgi, i))
72 cap_drop_nfsd_set(current->cap_effective); 64 GROUP_AT(gi, i) = exp->ex_anon_gid;
65 else
66 GROUP_AT(gi, i) = GROUP_AT(rqgi, i);
67 }
73 } else { 68 } else {
74 current->cap_effective = 69 gi = get_group_info(rqgi);
75 cap_raise_nfsd_set(current->cap_effective,
76 current->cap_permitted);
77 } 70 }
71
72 if (new->fsuid == (uid_t) -1)
73 new->fsuid = exp->ex_anon_uid;
74 if (new->fsgid == (gid_t) -1)
75 new->fsgid = exp->ex_anon_gid;
76
77 ret = set_groups(new, gi);
78 put_group_info(gi);
79 if (!ret)
80 goto error;
81
82 if (new->uid)
83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
84 else
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
86 new->cap_permitted);
87 put_cred(override_creds(new));
88 return 0;
89
90oom:
91 ret = -ENOMEM;
92error:
93 abort_creds(new);
78 return ret; 94 return ret;
79} 95}
96
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227..6d7d8c02c19 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -358,6 +358,7 @@ static struct rpc_program cb_program = {
358 .nrvers = ARRAY_SIZE(nfs_cb_version), 358 .nrvers = ARRAY_SIZE(nfs_cb_version),
359 .version = nfs_cb_version, 359 .version = nfs_cb_version,
360 .stats = &cb_stats, 360 .stats = &cb_stats,
361 .pipe_dir_name = "/nfsd4_cb",
361}; 362};
362 363
363/* Reference counting, callback cleanup, etc., all look racy as heck. 364/* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,8 +383,9 @@ static int do_probe_callback(void *data)
382 .program = &cb_program, 383 .program = &cb_program,
383 .prognumber = cb->cb_prog, 384 .prognumber = cb->cb_prog,
384 .version = nfs_cb_version[1]->number, 385 .version = nfs_cb_version[1]->number,
385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 386 .authflavor = clp->cl_flavor,
386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 387 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
388 .client_name = clp->cl_principal,
387 }; 389 };
388 struct rpc_message msg = { 390 struct rpc_message msg = {
389 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 391 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +394,11 @@ static int do_probe_callback(void *data)
392 struct rpc_clnt *client; 394 struct rpc_clnt *client;
393 int status; 395 int status;
394 396
397 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
398 status = nfserr_cb_path_down;
399 goto out_err;
400 }
401
395 /* Initialize address */ 402 /* Initialize address */
396 memset(&addr, 0, sizeof(addr)); 403 memset(&addr, 0, sizeof(addr));
397 addr.sin_family = AF_INET; 404 addr.sin_family = AF_INET;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index b79ec930d9f..0f9d6efaa62 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -54,20 +54,26 @@
54static struct path rec_dir; 54static struct path rec_dir;
55static int rec_dir_init = 0; 55static int rec_dir_init = 0;
56 56
57static void 57static int
58nfs4_save_user(uid_t *saveuid, gid_t *savegid) 58nfs4_save_creds(const struct cred **original_creds)
59{ 59{
60 *saveuid = current->fsuid; 60 struct cred *new;
61 *savegid = current->fsgid; 61
62 current->fsuid = 0; 62 new = prepare_creds();
63 current->fsgid = 0; 63 if (!new)
64 return -ENOMEM;
65
66 new->fsuid = 0;
67 new->fsgid = 0;
68 *original_creds = override_creds(new);
69 put_cred(new);
70 return 0;
64} 71}
65 72
66static void 73static void
67nfs4_reset_user(uid_t saveuid, gid_t savegid) 74nfs4_reset_creds(const struct cred *original)
68{ 75{
69 current->fsuid = saveuid; 76 revert_creds(original);
70 current->fsgid = savegid;
71} 77}
72 78
73static void 79static void
@@ -129,10 +135,9 @@ nfsd4_sync_rec_dir(void)
129int 135int
130nfsd4_create_clid_dir(struct nfs4_client *clp) 136nfsd4_create_clid_dir(struct nfs4_client *clp)
131{ 137{
138 const struct cred *original_cred;
132 char *dname = clp->cl_recdir; 139 char *dname = clp->cl_recdir;
133 struct dentry *dentry; 140 struct dentry *dentry;
134 uid_t uid;
135 gid_t gid;
136 int status; 141 int status;
137 142
138 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 143 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
@@ -140,7 +145,9 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
140 if (!rec_dir_init || clp->cl_firststate) 145 if (!rec_dir_init || clp->cl_firststate)
141 return 0; 146 return 0;
142 147
143 nfs4_save_user(&uid, &gid); 148 status = nfs4_save_creds(&original_cred);
149 if (status < 0)
150 return status;
144 151
145 /* lock the parent */ 152 /* lock the parent */
146 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 153 mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
@@ -168,7 +175,7 @@ out_unlock:
168 clp->cl_firststate = 1; 175 clp->cl_firststate = 1;
169 nfsd4_sync_rec_dir(); 176 nfsd4_sync_rec_dir();
170 } 177 }
171 nfs4_reset_user(uid, gid); 178 nfs4_reset_creds(original_cred);
172 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); 179 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
173 return status; 180 return status;
174} 181}
@@ -211,26 +218,29 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
211static int 218static int
212nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) 219nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
213{ 220{
221 const struct cred *original_cred;
214 struct file *filp; 222 struct file *filp;
215 struct dentry_list_arg dla = { 223 struct dentry_list_arg dla = {
216 .parent = dir, 224 .parent = dir,
217 }; 225 };
218 struct list_head *dentries = &dla.dentries; 226 struct list_head *dentries = &dla.dentries;
219 struct dentry_list *child; 227 struct dentry_list *child;
220 uid_t uid;
221 gid_t gid;
222 int status; 228 int status;
223 229
224 if (!rec_dir_init) 230 if (!rec_dir_init)
225 return 0; 231 return 0;
226 232
227 nfs4_save_user(&uid, &gid); 233 status = nfs4_save_creds(&original_cred);
234 if (status < 0)
235 return status;
228 INIT_LIST_HEAD(dentries); 236 INIT_LIST_HEAD(dentries);
229 237
230 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY); 238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred());
231 status = PTR_ERR(filp); 240 status = PTR_ERR(filp);
232 if (IS_ERR(filp)) 241 if (IS_ERR(filp))
233 goto out; 242 goto out;
243 INIT_LIST_HEAD(dentries);
234 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla); 244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
235 fput(filp); 245 fput(filp);
236 while (!list_empty(dentries)) { 246 while (!list_empty(dentries)) {
@@ -249,7 +259,7 @@ out:
249 dput(child->dentry); 259 dput(child->dentry);
250 kfree(child); 260 kfree(child);
251 } 261 }
252 nfs4_reset_user(uid, gid); 262 nfs4_reset_creds(original_cred);
253 return status; 263 return status;
254} 264}
255 265
@@ -311,8 +321,7 @@ out:
311void 321void
312nfsd4_remove_clid_dir(struct nfs4_client *clp) 322nfsd4_remove_clid_dir(struct nfs4_client *clp)
313{ 323{
314 uid_t uid; 324 const struct cred *original_cred;
315 gid_t gid;
316 int status; 325 int status;
317 326
318 if (!rec_dir_init || !clp->cl_firststate) 327 if (!rec_dir_init || !clp->cl_firststate)
@@ -322,9 +331,13 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
322 if (status) 331 if (status)
323 goto out; 332 goto out;
324 clp->cl_firststate = 0; 333 clp->cl_firststate = 0;
325 nfs4_save_user(&uid, &gid); 334
335 status = nfs4_save_creds(&original_cred);
336 if (status < 0)
337 goto out;
338
326 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 339 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
327 nfs4_reset_user(uid, gid); 340 nfs4_reset_creds(original_cred);
328 if (status == 0) 341 if (status == 0)
329 nfsd4_sync_rec_dir(); 342 nfsd4_sync_rec_dir();
330 mnt_drop_write(rec_dir.mnt); 343 mnt_drop_write(rec_dir.mnt);
@@ -401,16 +414,21 @@ nfsd4_recdir_load(void) {
401void 414void
402nfsd4_init_recdir(char *rec_dirname) 415nfsd4_init_recdir(char *rec_dirname)
403{ 416{
404 uid_t uid = 0; 417 const struct cred *original_cred;
405 gid_t gid = 0; 418 int status;
406 int status;
407 419
408 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 420 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
409 rec_dirname); 421 rec_dirname);
410 422
411 BUG_ON(rec_dir_init); 423 BUG_ON(rec_dir_init);
412 424
413 nfs4_save_user(&uid, &gid); 425 status = nfs4_save_creds(&original_cred);
426 if (status < 0) {
427 printk("NFSD: Unable to change credentials to find recovery"
428 " directory: error %d\n",
429 status);
430 return;
431 }
414 432
415 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 433 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
416 &rec_dir); 434 &rec_dir);
@@ -420,7 +438,7 @@ nfsd4_init_recdir(char *rec_dirname)
420 438
421 if (!status) 439 if (!status)
422 rec_dir_init = 1; 440 rec_dir_init = 1;
423 nfs4_reset_user(uid, gid); 441 nfs4_reset_creds(original_cred);
424} 442}
425 443
426void 444void
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1a052ac2bde..13e0e074dbb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
54#include <linux/mutex.h> 54#include <linux/mutex.h>
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h>
57 58
58#define NFSDDBG_FACILITY NFSDDBG_PROC 59#define NFSDDBG_FACILITY NFSDDBG_PROC
59 60
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
377 shutdown_callback_client(clp); 378 shutdown_callback_client(clp);
378 if (clp->cl_cred.cr_group_info) 379 if (clp->cl_cred.cr_group_info)
379 put_group_info(clp->cl_cred.cr_group_info); 380 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal);
380 kfree(clp->cl_name.data); 382 kfree(clp->cl_name.data);
381 kfree(clp); 383 kfree(clp);
382} 384}
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
696 unsigned int strhashval; 698 unsigned int strhashval;
697 struct nfs4_client *conf, *unconf, *new; 699 struct nfs4_client *conf, *unconf, *new;
698 __be32 status; 700 __be32 status;
701 char *princ;
699 char dname[HEXDIR_LEN]; 702 char dname[HEXDIR_LEN];
700 703
701 if (!check_name(clname)) 704 if (!check_name(clname))
@@ -719,8 +722,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
719 status = nfserr_clid_inuse; 722 status = nfserr_clid_inuse;
720 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
721 || conf->cl_addr != sin->sin_addr.s_addr) { 724 || conf->cl_addr != sin->sin_addr.s_addr) {
722 dprintk("NFSD: setclientid: string in use by client" 725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
723 "at %u.%u.%u.%u\n", NIPQUAD(conf->cl_addr)); 726 &conf->cl_addr);
724 goto out; 727 goto out;
725 } 728 }
726 } 729 }
@@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
783 } 786 }
784 copy_verf(new, &clverifier); 787 copy_verf(new, &clverifier);
785 new->cl_addr = sin->sin_addr.s_addr; 788 new->cl_addr = sin->sin_addr.s_addr;
789 new->cl_flavor = rqstp->rq_flavor;
790 princ = svc_gss_principal(rqstp);
791 if (princ) {
792 new->cl_principal = kstrdup(princ, GFP_KERNEL);
793 if (new->cl_principal == NULL) {
794 free_client(new);
795 goto out;
796 }
797 }
786 copy_cred(&new->cl_cred, &rqstp->rq_cred); 798 copy_cred(&new->cl_cred, &rqstp->rq_cred);
787 gen_confirm(new); 799 gen_confirm(new);
788 gen_callback(new, setclid); 800 gen_callback(new, setclid);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index e3f9783fdcf..77d7b8c531a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -330,7 +330,7 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
330 return -EINVAL; 330 return -EINVAL;
331 331
332 /* get ipv4 address */ 332 /* get ipv4 address */
333 if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4) 333 if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
334 return -EINVAL; 334 return -EINVAL;
335 if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) 335 if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
336 return -EINVAL; 336 return -EINVAL;
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cd25d91895a..f0da7d9c3a9 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -186,9 +186,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
186 * access control settings being in effect, we cannot 186 * access control settings being in effect, we cannot
187 * fix that case easily. 187 * fix that case easily.
188 */ 188 */
189 current->cap_effective = 189 struct cred *new = prepare_creds();
190 cap_raise_nfsd_set(current->cap_effective, 190 if (!new)
191 current->cap_permitted); 191 return nfserrno(-ENOMEM);
192 new->cap_effective =
193 cap_raise_nfsd_set(new->cap_effective,
194 new->cap_permitted);
195 put_cred(override_creds(new));
196 put_cred(new);
192 } else { 197 } else {
193 error = nfsd_setuser_and_check_port(rqstp, exp); 198 error = nfsd_setuser_and_check_port(rqstp, exp);
194 if (error) 199 if (error)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4433c8f0016..d1c5f787b36 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -671,6 +671,7 @@ __be32
671nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 671nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
672 int access, struct file **filp) 672 int access, struct file **filp)
673{ 673{
674 const struct cred *cred = current_cred();
674 struct dentry *dentry; 675 struct dentry *dentry;
675 struct inode *inode; 676 struct inode *inode;
676 int flags = O_RDONLY|O_LARGEFILE; 677 int flags = O_RDONLY|O_LARGEFILE;
@@ -725,7 +726,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
725 DQUOT_INIT(inode); 726 DQUOT_INIT(inode);
726 } 727 }
727 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), 728 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
728 flags); 729 flags, cred);
729 if (IS_ERR(*filp)) 730 if (IS_ERR(*filp))
730 host_err = PTR_ERR(*filp); 731 host_err = PTR_ERR(*filp);
731out_nfserr: 732out_nfserr:
@@ -1169,7 +1170,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1169 * send along the gid on create when it tries to implement 1170 * send along the gid on create when it tries to implement
1170 * setgid directories via NFS: 1171 * setgid directories via NFS:
1171 */ 1172 */
1172 if (current->fsuid != 0) 1173 if (current_fsuid() != 0)
1173 iap->ia_valid &= ~(ATTR_UID|ATTR_GID); 1174 iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
1174 if (iap->ia_valid) 1175 if (iap->ia_valid)
1175 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1176 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
@@ -2001,7 +2002,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2001 IS_APPEND(inode)? " append" : "", 2002 IS_APPEND(inode)? " append" : "",
2002 __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); 2003 __mnt_is_readonly(exp->ex_path.mnt)? " ro" : "");
2003 dprintk(" owner %d/%d user %d/%d\n", 2004 dprintk(" owner %d/%d user %d/%d\n",
2004 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); 2005 inode->i_uid, inode->i_gid, current_fsuid(), current_fsgid());
2005#endif 2006#endif
2006 2007
2007 /* Normally we reject any write/sattr etc access on a read-only file 2008 /* Normally we reject any write/sattr etc access on a read-only file
@@ -2044,7 +2045,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2044 * with NFSv3. 2045 * with NFSv3.
2045 */ 2046 */
2046 if ((acc & NFSD_MAY_OWNER_OVERRIDE) && 2047 if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
2047 inode->i_uid == current->fsuid) 2048 inode->i_uid == current_fsuid())
2048 return 0; 2049 return 0;
2049 2050
2050 /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ 2051 /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 00000000000..50914d7303c
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
1source "fs/notify/dnotify/Kconfig"
2source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 00000000000..5a95b6010ce
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
1obj-y += dnotify/
2obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 00000000000..26adf5dfa64
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
1config DNOTIFY
2 bool "Dnotify support"
3 default y
4 help
5 Dnotify is a directory-based per-fd file change notification system
6 that uses signals to communicate events to user-space. There exist
7 superior alternatives, but some applications may still rely on
8 dnotify.
9
10 If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 00000000000..f145251dcad
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_DNOTIFY) += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda..b0aa2cde80b 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
115 dn->dn_next = inode->i_dnotify; 115 dn->dn_next = inode->i_dnotify;
116 inode->i_dnotify = dn; 116 inode->i_dnotify = dn;
117 spin_unlock(&inode->i_lock); 117 spin_unlock(&inode->i_lock);
118
119 if (filp->f_op && filp->f_op->dir_notify)
120 return filp->f_op->dir_notify(filp, arg);
121 return 0; 118 return 0;
122 119
123out_free: 120out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 00000000000..44679284102
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
1config INOTIFY
2 bool "Inotify file change notification support"
3 default y
4 ---help---
5 Say Y here to enable inotify support. Inotify is a file change
6 notification system and a replacement for dnotify. Inotify fixes
7 numerous shortcomings in dnotify and introduces several new features
8 including multiple file events, one-shot support, and unmount
9 notification.
10
11 For more information, see <file:Documentation/filesystems/inotify.txt>
12
13 If unsure, say Y.
14
15config INOTIFY_USER
16 bool "Inotify support for userspace"
17 depends on INOTIFY
18 default y
19 ---help---
20 Say Y here to enable inotify support for userspace, including the
21 associated system calls. Inotify allows monitoring of both files and
22 directories via a single open fd. Events are read from the file
23 descriptor, which is also select()- and poll()-able.
24
25 For more information, see <file:Documentation/filesystems/inotify.txt>
26
27 If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 00000000000..e290f3bb9d8
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_INOTIFY) += inotify.o
2obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index 7bbed1b8982..dae3f28f30d 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -428,11 +428,13 @@ void inotify_unmount_inodes(struct list_head *list)
428 watches = &inode->inotify_watches; 428 watches = &inode->inotify_watches;
429 list_for_each_entry_safe(watch, next_w, watches, i_list) { 429 list_for_each_entry_safe(watch, next_w, watches, i_list) {
430 struct inotify_handle *ih= watch->ih; 430 struct inotify_handle *ih= watch->ih;
431 get_inotify_watch(watch);
431 mutex_lock(&ih->mutex); 432 mutex_lock(&ih->mutex);
432 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0, 433 ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
433 NULL, NULL); 434 NULL, NULL);
434 inotify_remove_watch_locked(ih, watch); 435 inotify_remove_watch_locked(ih, watch);
435 mutex_unlock(&ih->mutex); 436 mutex_unlock(&ih->mutex);
437 put_inotify_watch(watch);
436 } 438 }
437 mutex_unlock(&inode->inotify_mutex); 439 mutex_unlock(&inode->inotify_mutex);
438 iput(inode); 440 iput(inode);
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d367e9b9286..400f8064a54 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
76 struct mutex ev_mutex; /* protects event queue */ 76 struct mutex ev_mutex; /* protects event queue */
77 struct mutex up_mutex; /* synchronizes watch updates */ 77 struct mutex up_mutex; /* synchronizes watch updates */
78 struct list_head events; /* list of queued events */ 78 struct list_head events; /* list of queued events */
79 atomic_t count; /* reference count */
80 struct user_struct *user; /* user who opened this dev */ 79 struct user_struct *user; /* user who opened this dev */
81 struct inotify_handle *ih; /* inotify handle */ 80 struct inotify_handle *ih; /* inotify handle */
82 struct fasync_struct *fa; /* async notification */ 81 struct fasync_struct *fa; /* async notification */
82 atomic_t count; /* reference count */
83 unsigned int queue_size; /* size of the queue (bytes) */ 83 unsigned int queue_size; /* size of the queue (bytes) */
84 unsigned int event_count; /* number of pending events */ 84 unsigned int event_count; /* number of pending events */
85 unsigned int max_events; /* maximum number of events */ 85 unsigned int max_events; /* maximum number of events */
@@ -601,7 +601,7 @@ asmlinkage long sys_inotify_init1(int flags)
601 goto out_put_fd; 601 goto out_put_fd;
602 } 602 }
603 603
604 user = get_uid(current->user); 604 user = get_current_user();
605 if (unlikely(atomic_read(&user->inotify_devs) >= 605 if (unlikely(atomic_read(&user->inotify_devs) >=
606 inotify_max_user_instances)) { 606 inotify_max_user_instances)) {
607 ret = -EMFILE; 607 ret = -EMFILE;
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 52276c02f71..f8424874fa0 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -304,8 +304,8 @@ static int sc_seq_show(struct seq_file *seq, void *v)
304 * use of it here generates a warning with -Wbitwise */ 304 * use of it here generates a warning with -Wbitwise */
305 seq_printf(seq, "%p:\n" 305 seq_printf(seq, "%p:\n"
306 " krefs: %d\n" 306 " krefs: %d\n"
307 " sock: %u.%u.%u.%u:%u -> " 307 " sock: %pI4:%u -> "
308 "%u.%u.%u.%u:%u\n" 308 "%pI4:%u\n"
309 " remote node: %s\n" 309 " remote node: %s\n"
310 " page off: %zu\n" 310 " page off: %zu\n"
311 " handshake ok: %u\n" 311 " handshake ok: %u\n"
@@ -319,8 +319,8 @@ static int sc_seq_show(struct seq_file *seq, void *v)
319 " func type: %u\n", 319 " func type: %u\n",
320 sc, 320 sc,
321 atomic_read(&sc->sc_kref.refcount), 321 atomic_read(&sc->sc_kref.refcount),
322 NIPQUAD(saddr), inet ? ntohs(sport) : 0, 322 &saddr, inet ? ntohs(sport) : 0,
323 NIPQUAD(daddr), inet ? ntohs(dport) : 0, 323 &daddr, inet ? ntohs(dport) : 0,
324 sc->sc_node->nd_name, 324 sc->sc_node->nd_name,
325 sc->sc_page_off, 325 sc->sc_page_off,
326 sc->sc_handshake_ok, 326 sc->sc_handshake_ok,
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 816a3f61330..70e8fa9e253 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -250,7 +250,7 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
250 250
251static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) 251static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
252{ 252{
253 return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address)); 253 return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
254} 254}
255 255
256static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, 256static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2bcf706d9dd..9fbe849f634 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1597,8 +1597,8 @@ static void o2net_start_connect(struct work_struct *work)
1597 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, 1597 ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
1598 sizeof(myaddr)); 1598 sizeof(myaddr));
1599 if (ret) { 1599 if (ret) {
1600 mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n", 1600 mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
1601 ret, NIPQUAD(mynode->nd_ipv4_address)); 1601 ret, &mynode->nd_ipv4_address);
1602 goto out; 1602 goto out;
1603 } 1603 }
1604 1604
@@ -1790,17 +1790,16 @@ static int o2net_accept_one(struct socket *sock)
1790 1790
1791 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1791 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1792 if (node == NULL) { 1792 if (node == NULL) {
1793 mlog(ML_NOTICE, "attempt to connect from unknown node at " 1793 mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
1794 "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr), 1794 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
1795 ntohs(sin.sin_port));
1796 ret = -EINVAL; 1795 ret = -EINVAL;
1797 goto out; 1796 goto out;
1798 } 1797 }
1799 1798
1800 if (o2nm_this_node() > node->nd_num) { 1799 if (o2nm_this_node() > node->nd_num) {
1801 mlog(ML_NOTICE, "unexpected connect attempted from a lower " 1800 mlog(ML_NOTICE, "unexpected connect attempted from a lower "
1802 "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n", 1801 "numbered node '%s' at " "%pI4:%d with num %u\n",
1803 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1802 node->nd_name, &sin.sin_addr.s_addr,
1804 ntohs(sin.sin_port), node->nd_num); 1803 ntohs(sin.sin_port), node->nd_num);
1805 ret = -EINVAL; 1804 ret = -EINVAL;
1806 goto out; 1805 goto out;
@@ -1810,8 +1809,8 @@ static int o2net_accept_one(struct socket *sock)
1810 * and tries to connect before we see their heartbeat */ 1809 * and tries to connect before we see their heartbeat */
1811 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { 1810 if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) {
1812 mlog(ML_CONN, "attempt to connect from node '%s' at " 1811 mlog(ML_CONN, "attempt to connect from node '%s' at "
1813 "%u.%u.%u.%u:%d but it isn't heartbeating\n", 1812 "%pI4:%d but it isn't heartbeating\n",
1814 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1813 node->nd_name, &sin.sin_addr.s_addr,
1815 ntohs(sin.sin_port)); 1814 ntohs(sin.sin_port));
1816 ret = -EINVAL; 1815 ret = -EINVAL;
1817 goto out; 1816 goto out;
@@ -1827,8 +1826,8 @@ static int o2net_accept_one(struct socket *sock)
1827 spin_unlock(&nn->nn_lock); 1826 spin_unlock(&nn->nn_lock);
1828 if (ret) { 1827 if (ret) {
1829 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1828 mlog(ML_NOTICE, "attempt to connect from node '%s' at "
1830 "%u.%u.%u.%u:%d but it already has an open connection\n", 1829 "%pI4:%d but it already has an open connection\n",
1831 node->nd_name, NIPQUAD(sin.sin_addr.s_addr), 1830 node->nd_name, &sin.sin_addr.s_addr,
1832 ntohs(sin.sin_port)); 1831 ntohs(sin.sin_port));
1833 goto out; 1832 goto out;
1834 } 1833 }
@@ -1924,15 +1923,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1924 sock->sk->sk_reuse = 1; 1923 sock->sk->sk_reuse = 1;
1925 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 1924 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
1926 if (ret < 0) { 1925 if (ret < 0) {
1927 mlog(ML_ERROR, "unable to bind socket at %u.%u.%u.%u:%u, " 1926 mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
1928 "ret=%d\n", NIPQUAD(addr), ntohs(port), ret); 1927 "ret=%d\n", &addr, ntohs(port), ret);
1929 goto out; 1928 goto out;
1930 } 1929 }
1931 1930
1932 ret = sock->ops->listen(sock, 64); 1931 ret = sock->ops->listen(sock, 64);
1933 if (ret < 0) { 1932 if (ret < 0) {
1934 mlog(ML_ERROR, "unable to listen on %u.%u.%u.%u:%u, ret=%d\n", 1933 mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
1935 NIPQUAD(addr), ntohs(port), ret); 1934 &addr, ntohs(port), ret);
1936 } 1935 }
1937 1936
1938out: 1937out:
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index ba962d71b34..6f7a77d5402 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -339,8 +339,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
339 ip = DLMFS_I(inode); 339 ip = DLMFS_I(inode);
340 340
341 inode->i_mode = mode; 341 inode->i_mode = mode;
342 inode->i_uid = current->fsuid; 342 inode->i_uid = current_fsuid();
343 inode->i_gid = current->fsgid; 343 inode->i_gid = current_fsgid();
344 inode->i_blocks = 0; 344 inode->i_blocks = 0;
345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -365,8 +365,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
365 return NULL; 365 return NULL;
366 366
367 inode->i_mode = mode; 367 inode->i_mode = mode;
368 inode->i_uid = current->fsuid; 368 inode->i_uid = current_fsuid();
369 inode->i_gid = current->fsgid; 369 inode->i_gid = current_fsgid();
370 inode->i_blocks = 0; 370 inode->i_blocks = 0;
371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f4967e634ff..2545e7402ef 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -421,13 +421,13 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
421 fe->i_blkno = cpu_to_le64(fe_blkno); 421 fe->i_blkno = cpu_to_le64(fe_blkno);
422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
424 fe->i_uid = cpu_to_le32(current->fsuid); 424 fe->i_uid = cpu_to_le32(current_fsuid());
425 if (dir->i_mode & S_ISGID) { 425 if (dir->i_mode & S_ISGID) {
426 fe->i_gid = cpu_to_le32(dir->i_gid); 426 fe->i_gid = cpu_to_le32(dir->i_gid);
427 if (S_ISDIR(mode)) 427 if (S_ISDIR(mode))
428 mode |= S_ISGID; 428 mode |= S_ISGID;
429 } else 429 } else
430 fe->i_gid = cpu_to_le32(current->fsgid); 430 fe->i_gid = cpu_to_le32(current_fsgid());
431 fe->i_mode = cpu_to_le16(mode); 431 fe->i_mode = cpu_to_le16(mode);
432 if (S_ISCHR(mode) || S_ISBLK(mode)) 432 if (S_ISCHR(mode) || S_ISBLK(mode))
433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5f180cf7abb..5e0c0d0aef7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
86#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ 86#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
87 OCFS2_SB(sb)->s_feature_incompat &= ~(mask) 87 OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
88 88
89#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 89#define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \
90 | OCFS2_FEATURE_COMPAT_JBD2_SB)
90#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 91#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
91 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 92 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
92 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 93 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
@@ -153,6 +154,11 @@
153#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 154#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
154 155
155/* 156/*
157 * The filesystem will correctly handle journal feature bits.
158 */
159#define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002
160
161/*
156 * Unwritten extents support. 162 * Unwritten extents support.
157 */ 163 */
158#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 164#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 054e2efb0b7..74d7367ade1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2645,9 +2645,9 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
2645 return ret; 2645 return ret;
2646 } 2646 }
2647 2647
2648 i = xs->here - old_xh->xh_entries;
2649 xs->here = &xs->header->xh_entries[i];
2650 } 2648 }
2649 i = xs->here - old_xh->xh_entries;
2650 xs->here = &xs->header->xh_entries[i];
2651 } 2651 }
2652 2652
2653 return ret; 2653 return ret;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index cbf047a847c..6afe57c84f8 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,8 +37,8 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
37 37
38 inode->i_ino = new_block; 38 inode->i_ino = new_block;
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = current->fsuid; 40 inode->i_uid = current_fsuid();
41 inode->i_gid = current->fsgid; 41 inode->i_gid = current_fsgid();
42 inode->i_blocks = 0; 42 inode->i_blocks = 0;
43 inode->i_mapping->a_ops = &omfs_aops; 43 inode->i_mapping->a_ops = &omfs_aops;
44 44
@@ -420,8 +420,8 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
420 420
421 sb->s_fs_info = sbi; 421 sb->s_fs_info = sbi;
422 422
423 sbi->s_uid = current->uid; 423 sbi->s_uid = current_uid();
424 sbi->s_gid = current->gid; 424 sbi->s_gid = current_gid();
425 sbi->s_dmask = sbi->s_fmask = current->fs->umask; 425 sbi->s_dmask = sbi->s_fmask = current->fs->umask;
426 426
427 if (!parse_options((char *) data, sbi)) 427 if (!parse_options((char *) data, sbi))
diff --git a/fs/open.c b/fs/open.c
index 83cdb9dee0c..1cd7d40e999 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
272 goto put_write_and_out; 272 goto put_write_and_out;
273 273
274 error = locks_verify_truncate(inode, NULL, length); 274 error = locks_verify_truncate(inode, NULL, length);
275 if (!error)
276 error = security_path_truncate(&path, length, 0);
275 if (!error) { 277 if (!error) {
276 DQUOT_INIT(inode); 278 DQUOT_INIT(inode);
277 error = do_truncate(path.dentry, length, 0, NULL); 279 error = do_truncate(path.dentry, length, 0, NULL);
@@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
329 331
330 error = locks_verify_truncate(inode, file, length); 332 error = locks_verify_truncate(inode, file, length);
331 if (!error) 333 if (!error)
334 error = security_path_truncate(&file->f_path, length,
335 ATTR_MTIME|ATTR_CTIME);
336 if (!error)
332 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 337 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
333out_putf: 338out_putf:
334 fput(file); 339 fput(file);
@@ -425,39 +430,33 @@ out:
425 */ 430 */
426asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) 431asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
427{ 432{
433 const struct cred *old_cred;
434 struct cred *override_cred;
428 struct path path; 435 struct path path;
429 struct inode *inode; 436 struct inode *inode;
430 int old_fsuid, old_fsgid;
431 kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
432 int res; 437 int res;
433 438
434 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ 439 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
435 return -EINVAL; 440 return -EINVAL;
436 441
437 old_fsuid = current->fsuid; 442 override_cred = prepare_creds();
438 old_fsgid = current->fsgid; 443 if (!override_cred)
444 return -ENOMEM;
439 445
440 current->fsuid = current->uid; 446 override_cred->fsuid = override_cred->uid;
441 current->fsgid = current->gid; 447 override_cred->fsgid = override_cred->gid;
442 448
443 if (!issecure(SECURE_NO_SETUID_FIXUP)) { 449 if (!issecure(SECURE_NO_SETUID_FIXUP)) {
444 /* 450 /* Clear the capabilities if we switch to a non-root user */
445 * Clear the capabilities if we switch to a non-root user 451 if (override_cred->uid)
446 */ 452 cap_clear(override_cred->cap_effective);
447#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
448 /*
449 * FIXME: There is a race here against sys_capset. The
450 * capabilities can change yet we will restore the old
451 * value below. We should hold task_capabilities_lock,
452 * but we cannot because user_path_at can sleep.
453 */
454#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
455 if (current->uid)
456 old_cap = cap_set_effective(__cap_empty_set);
457 else 453 else
458 old_cap = cap_set_effective(current->cap_permitted); 454 override_cred->cap_effective =
455 override_cred->cap_permitted;
459 } 456 }
460 457
458 old_cred = override_creds(override_cred);
459
461 res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 460 res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
462 if (res) 461 if (res)
463 goto out; 462 goto out;
@@ -494,12 +493,8 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
494out_path_release: 493out_path_release:
495 path_put(&path); 494 path_put(&path);
496out: 495out:
497 current->fsuid = old_fsuid; 496 revert_creds(old_cred);
498 current->fsgid = old_fsgid; 497 put_cred(override_cred);
499
500 if (!issecure(SECURE_NO_SETUID_FIXUP))
501 cap_set_effective(old_cap);
502
503 return res; 498 return res;
504} 499}
505 500
@@ -792,7 +787,8 @@ static inline int __get_file_write_access(struct inode *inode,
792 787
793static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 788static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
794 int flags, struct file *f, 789 int flags, struct file *f,
795 int (*open)(struct inode *, struct file *)) 790 int (*open)(struct inode *, struct file *),
791 const struct cred *cred)
796{ 792{
797 struct inode *inode; 793 struct inode *inode;
798 int error; 794 int error;
@@ -816,7 +812,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
816 f->f_op = fops_get(inode->i_fop); 812 f->f_op = fops_get(inode->i_fop);
817 file_move(f, &inode->i_sb->s_files); 813 file_move(f, &inode->i_sb->s_files);
818 814
819 error = security_dentry_open(f); 815 error = security_dentry_open(f, cred);
820 if (error) 816 if (error)
821 goto cleanup_all; 817 goto cleanup_all;
822 818
@@ -891,6 +887,8 @@ cleanup_file:
891struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry, 887struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
892 int (*open)(struct inode *, struct file *)) 888 int (*open)(struct inode *, struct file *))
893{ 889{
890 const struct cred *cred = current_cred();
891
894 if (IS_ERR(nd->intent.open.file)) 892 if (IS_ERR(nd->intent.open.file))
895 goto out; 893 goto out;
896 if (IS_ERR(dentry)) 894 if (IS_ERR(dentry))
@@ -898,7 +896,7 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry
898 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), 896 nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
899 nd->intent.open.flags - 1, 897 nd->intent.open.flags - 1,
900 nd->intent.open.file, 898 nd->intent.open.file,
901 open); 899 open, cred);
902out: 900out:
903 return nd->intent.open.file; 901 return nd->intent.open.file;
904out_err: 902out_err:
@@ -917,6 +915,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
917 */ 915 */
918struct file *nameidata_to_filp(struct nameidata *nd, int flags) 916struct file *nameidata_to_filp(struct nameidata *nd, int flags)
919{ 917{
918 const struct cred *cred = current_cred();
920 struct file *filp; 919 struct file *filp;
921 920
922 /* Pick up the filp from the open intent */ 921 /* Pick up the filp from the open intent */
@@ -924,7 +923,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
924 /* Has the filesystem initialised the file for us? */ 923 /* Has the filesystem initialised the file for us? */
925 if (filp->f_path.dentry == NULL) 924 if (filp->f_path.dentry == NULL)
926 filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, 925 filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
927 NULL); 926 NULL, cred);
928 else 927 else
929 path_put(&nd->path); 928 path_put(&nd->path);
930 return filp; 929 return filp;
@@ -934,7 +933,8 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
934 * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an 933 * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an
935 * error. 934 * error.
936 */ 935 */
937struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) 936struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
937 const struct cred *cred)
938{ 938{
939 int error; 939 int error;
940 struct file *f; 940 struct file *f;
@@ -959,7 +959,7 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
959 return ERR_PTR(error); 959 return ERR_PTR(error);
960 } 960 }
961 961
962 return __dentry_open(dentry, mnt, flags, f, NULL); 962 return __dentry_open(dentry, mnt, flags, f, NULL, cred);
963} 963}
964EXPORT_SYMBOL(dentry_open); 964EXPORT_SYMBOL(dentry_open);
965 965
diff --git a/fs/pipe.c b/fs/pipe.c
index 7aea8b89baa..891697112f6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -899,8 +899,8 @@ static struct inode * get_pipe_inode(void)
899 */ 899 */
900 inode->i_state = I_DIRTY; 900 inode->i_state = I_DIRTY;
901 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 901 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
902 inode->i_uid = current->fsuid; 902 inode->i_uid = current_fsuid();
903 inode->i_gid = current->fsgid; 903 inode->i_gid = current_fsgid();
904 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 904 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
905 905
906 return inode; 906 return inode;
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
1016 goto err_fdr; 1016 goto err_fdr;
1017 fdw = error; 1017 fdw = error;
1018 1018
1019 error = audit_fd_pair(fdr, fdw); 1019 audit_fd_pair(fdr, fdw);
1020 if (error < 0)
1021 goto err_fdw;
1022
1023 fd_install(fdr, fr); 1020 fd_install(fdr, fr);
1024 fd_install(fdw, fw); 1021 fd_install(fdw, fw);
1025 fd[0] = fdr; 1022 fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
1027 1024
1028 return 0; 1025 return 0;
1029 1026
1030 err_fdw:
1031 put_unused_fd(fdw);
1032 err_fdr: 1027 err_fdr:
1033 put_unused_fd(fdr); 1028 put_unused_fd(fdr);
1034 err_read_pipe: 1029 err_read_pipe:
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index aec931e0997..39df95a0ec2 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -217,11 +217,11 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
217 switch(pa->e_tag) { 217 switch(pa->e_tag) {
218 case ACL_USER_OBJ: 218 case ACL_USER_OBJ:
219 /* (May have been checked already) */ 219 /* (May have been checked already) */
220 if (inode->i_uid == current->fsuid) 220 if (inode->i_uid == current_fsuid())
221 goto check_perm; 221 goto check_perm;
222 break; 222 break;
223 case ACL_USER: 223 case ACL_USER:
224 if (pa->e_id == current->fsuid) 224 if (pa->e_id == current_fsuid())
225 goto mask; 225 goto mask;
226 break; 226 break;
227 case ACL_GROUP_OBJ: 227 case ACL_GROUP_OBJ:
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6af7fba7abb..7e4877d9dcb 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -159,6 +159,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
159 struct group_info *group_info; 159 struct group_info *group_info;
160 int g; 160 int g;
161 struct fdtable *fdt = NULL; 161 struct fdtable *fdt = NULL;
162 const struct cred *cred;
162 pid_t ppid, tpid; 163 pid_t ppid, tpid;
163 164
164 rcu_read_lock(); 165 rcu_read_lock();
@@ -170,6 +171,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
170 if (tracer) 171 if (tracer)
171 tpid = task_pid_nr_ns(tracer, ns); 172 tpid = task_pid_nr_ns(tracer, ns);
172 } 173 }
174 cred = get_cred((struct cred *) __task_cred(p));
173 seq_printf(m, 175 seq_printf(m,
174 "State:\t%s\n" 176 "State:\t%s\n"
175 "Tgid:\t%d\n" 177 "Tgid:\t%d\n"
@@ -182,8 +184,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
182 task_tgid_nr_ns(p, ns), 184 task_tgid_nr_ns(p, ns),
183 pid_nr_ns(pid, ns), 185 pid_nr_ns(pid, ns),
184 ppid, tpid, 186 ppid, tpid,
185 p->uid, p->euid, p->suid, p->fsuid, 187 cred->uid, cred->euid, cred->suid, cred->fsuid,
186 p->gid, p->egid, p->sgid, p->fsgid); 188 cred->gid, cred->egid, cred->sgid, cred->fsgid);
187 189
188 task_lock(p); 190 task_lock(p);
189 if (p->files) 191 if (p->files)
@@ -194,13 +196,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
194 fdt ? fdt->max_fds : 0); 196 fdt ? fdt->max_fds : 0);
195 rcu_read_unlock(); 197 rcu_read_unlock();
196 198
197 group_info = p->group_info; 199 group_info = cred->group_info;
198 get_group_info(group_info);
199 task_unlock(p); 200 task_unlock(p);
200 201
201 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++) 202 for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
202 seq_printf(m, "%d ", GROUP_AT(group_info, g)); 203 seq_printf(m, "%d ", GROUP_AT(group_info, g));
203 put_group_info(group_info); 204 put_cred(cred);
204 205
205 seq_printf(m, "\n"); 206 seq_printf(m, "\n");
206} 207}
@@ -262,7 +263,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
262 blocked = p->blocked; 263 blocked = p->blocked;
263 collect_sigign_sigcatch(p, &ignored, &caught); 264 collect_sigign_sigcatch(p, &ignored, &caught);
264 num_threads = atomic_read(&p->signal->count); 265 num_threads = atomic_read(&p->signal->count);
265 qsize = atomic_read(&p->user->sigpending); 266 qsize = atomic_read(&__task_cred(p)->user->sigpending);
266 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; 267 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
267 unlock_task_sighand(p, &flags); 268 unlock_task_sighand(p, &flags);
268 } 269 }
@@ -293,10 +294,21 @@ static void render_cap_t(struct seq_file *m, const char *header,
293 294
294static inline void task_cap(struct seq_file *m, struct task_struct *p) 295static inline void task_cap(struct seq_file *m, struct task_struct *p)
295{ 296{
296 render_cap_t(m, "CapInh:\t", &p->cap_inheritable); 297 const struct cred *cred;
297 render_cap_t(m, "CapPrm:\t", &p->cap_permitted); 298 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
298 render_cap_t(m, "CapEff:\t", &p->cap_effective); 299
299 render_cap_t(m, "CapBnd:\t", &p->cap_bset); 300 rcu_read_lock();
301 cred = __task_cred(p);
302 cap_inheritable = cred->cap_inheritable;
303 cap_permitted = cred->cap_permitted;
304 cap_effective = cred->cap_effective;
305 cap_bset = cred->cap_bset;
306 rcu_read_unlock();
307
308 render_cap_t(m, "CapInh:\t", &cap_inheritable);
309 render_cap_t(m, "CapPrm:\t", &cap_permitted);
310 render_cap_t(m, "CapEff:\t", &cap_effective);
311 render_cap_t(m, "CapBnd:\t", &cap_bset);
300} 312}
301 313
302static inline void task_context_switch_counts(struct seq_file *m, 314static inline void task_context_switch_counts(struct seq_file *m,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 486cf3fe713..cad92c1ac2b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
347static int proc_pid_schedstat(struct task_struct *task, char *buffer) 347static int proc_pid_schedstat(struct task_struct *task, char *buffer)
348{ 348{
349 return sprintf(buffer, "%llu %llu %lu\n", 349 return sprintf(buffer, "%llu %llu %lu\n",
350 task->sched_info.cpu_time, 350 (unsigned long long)task->se.sum_exec_runtime,
351 task->sched_info.run_delay, 351 (unsigned long long)task->sched_info.run_delay,
352 task->sched_info.pcount); 352 task->sched_info.pcount);
353} 353}
354#endif 354#endif
@@ -371,7 +371,7 @@ static int lstats_show_proc(struct seq_file *m, void *v)
371 task->latency_record[i].time, 371 task->latency_record[i].time,
372 task->latency_record[i].max); 372 task->latency_record[i].max);
373 for (q = 0; q < LT_BACKTRACEDEPTH; q++) { 373 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
374 char sym[KSYM_NAME_LEN]; 374 char sym[KSYM_SYMBOL_LEN];
375 char *c; 375 char *c;
376 if (!task->latency_record[i].backtrace[q]) 376 if (!task->latency_record[i].backtrace[q])
377 break; 377 break;
@@ -1406,6 +1406,7 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1406{ 1406{
1407 struct inode * inode; 1407 struct inode * inode;
1408 struct proc_inode *ei; 1408 struct proc_inode *ei;
1409 const struct cred *cred;
1409 1410
1410 /* We need a new inode */ 1411 /* We need a new inode */
1411 1412
@@ -1428,8 +1429,11 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1428 inode->i_uid = 0; 1429 inode->i_uid = 0;
1429 inode->i_gid = 0; 1430 inode->i_gid = 0;
1430 if (task_dumpable(task)) { 1431 if (task_dumpable(task)) {
1431 inode->i_uid = task->euid; 1432 rcu_read_lock();
1432 inode->i_gid = task->egid; 1433 cred = __task_cred(task);
1434 inode->i_uid = cred->euid;
1435 inode->i_gid = cred->egid;
1436 rcu_read_unlock();
1433 } 1437 }
1434 security_task_to_inode(task, inode); 1438 security_task_to_inode(task, inode);
1435 1439
@@ -1445,6 +1449,8 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1445{ 1449{
1446 struct inode *inode = dentry->d_inode; 1450 struct inode *inode = dentry->d_inode;
1447 struct task_struct *task; 1451 struct task_struct *task;
1452 const struct cred *cred;
1453
1448 generic_fillattr(inode, stat); 1454 generic_fillattr(inode, stat);
1449 1455
1450 rcu_read_lock(); 1456 rcu_read_lock();
@@ -1454,8 +1460,9 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1454 if (task) { 1460 if (task) {
1455 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1461 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1456 task_dumpable(task)) { 1462 task_dumpable(task)) {
1457 stat->uid = task->euid; 1463 cred = __task_cred(task);
1458 stat->gid = task->egid; 1464 stat->uid = cred->euid;
1465 stat->gid = cred->egid;
1459 } 1466 }
1460 } 1467 }
1461 rcu_read_unlock(); 1468 rcu_read_unlock();
@@ -1483,11 +1490,16 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1483{ 1490{
1484 struct inode *inode = dentry->d_inode; 1491 struct inode *inode = dentry->d_inode;
1485 struct task_struct *task = get_proc_task(inode); 1492 struct task_struct *task = get_proc_task(inode);
1493 const struct cred *cred;
1494
1486 if (task) { 1495 if (task) {
1487 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1496 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1488 task_dumpable(task)) { 1497 task_dumpable(task)) {
1489 inode->i_uid = task->euid; 1498 rcu_read_lock();
1490 inode->i_gid = task->egid; 1499 cred = __task_cred(task);
1500 inode->i_uid = cred->euid;
1501 inode->i_gid = cred->egid;
1502 rcu_read_unlock();
1491 } else { 1503 } else {
1492 inode->i_uid = 0; 1504 inode->i_uid = 0;
1493 inode->i_gid = 0; 1505 inode->i_gid = 0;
@@ -1649,6 +1661,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1649 struct task_struct *task = get_proc_task(inode); 1661 struct task_struct *task = get_proc_task(inode);
1650 int fd = proc_fd(inode); 1662 int fd = proc_fd(inode);
1651 struct files_struct *files; 1663 struct files_struct *files;
1664 const struct cred *cred;
1652 1665
1653 if (task) { 1666 if (task) {
1654 files = get_files_struct(task); 1667 files = get_files_struct(task);
@@ -1658,8 +1671,11 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1658 rcu_read_unlock(); 1671 rcu_read_unlock();
1659 put_files_struct(files); 1672 put_files_struct(files);
1660 if (task_dumpable(task)) { 1673 if (task_dumpable(task)) {
1661 inode->i_uid = task->euid; 1674 rcu_read_lock();
1662 inode->i_gid = task->egid; 1675 cred = __task_cred(task);
1676 inode->i_uid = cred->euid;
1677 inode->i_gid = cred->egid;
1678 rcu_read_unlock();
1663 } else { 1679 } else {
1664 inode->i_uid = 0; 1680 inode->i_uid = 0;
1665 inode->i_gid = 0; 1681 inode->i_gid = 0;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d777789b7a8..de2bba5a344 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -218,8 +218,7 @@ void proc_device_tree_add_node(struct device_node *np,
218void __init proc_device_tree_init(void) 218void __init proc_device_tree_init(void)
219{ 219{
220 struct device_node *root; 220 struct device_node *root;
221 if ( !have_of ) 221
222 return;
223 proc_device_tree = proc_mkdir("device-tree", NULL); 222 proc_device_tree = proc_mkdir("device-tree", NULL);
224 if (proc_device_tree == 0) 223 if (proc_device_tree == 0)
225 return; 224 return;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679..f75efa22df5 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/time.h> 11#include <linux/time.h>
12#include <linux/irqnr.h>
12#include <asm/cputime.h> 13#include <asm/cputime.h>
13 14
14#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
@@ -44,10 +45,9 @@ static int show_stat(struct seq_file *p, void *v)
44 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 45 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
45 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 46 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
46 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 47 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
47 48 for_each_irq_nr(j) {
48 for_each_irq_nr(j)
49 sum += kstat_irqs_cpu(j, i); 49 sum += kstat_irqs_cpu(j, i);
50 50 }
51 sum += arch_irq_stat_cpu(i); 51 sum += arch_irq_stat_cpu(i);
52 } 52 }
53 sum += arch_irq_stat(); 53 sum += arch_irq_stat();
@@ -92,7 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
92 /* sum again ? it could be updated? */ 92 /* sum again ? it could be updated? */
93 for_each_irq_nr(j) { 93 for_each_irq_nr(j) {
94 per_irq_sum = 0; 94 per_irq_sum = 0;
95
96 for_each_possible_cpu(i) 95 for_each_possible_cpu(i)
97 per_irq_sum += kstat_irqs_cpu(j, i); 96 per_irq_sum += kstat_irqs_cpu(j, i);
98 97
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b770c095e45..3a8bdd7f575 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -557,9 +557,9 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
557 return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); 557 return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
558} 558}
559 559
560static unsigned long pte_to_pagemap_entry(pte_t pte) 560static u64 pte_to_pagemap_entry(pte_t pte)
561{ 561{
562 unsigned long pme = 0; 562 u64 pme = 0;
563 if (is_swap_pte(pte)) 563 if (is_swap_pte(pte))
564 pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte)) 564 pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
565 | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP; 565 | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
diff --git a/fs/quota.c b/fs/quota.c
index 7f4386ebc23..b7fe44e0161 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -79,7 +79,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
79 79
80 /* Check privileges */ 80 /* Check privileges */
81 if (cmd == Q_GETQUOTA) { 81 if (cmd == Q_GETQUOTA) {
82 if (((type == USRQUOTA && current->euid != id) || 82 if (((type == USRQUOTA && current_euid() != id) ||
83 (type == GRPQUOTA && !in_egroup_p(id))) && 83 (type == GRPQUOTA && !in_egroup_p(id))) &&
84 !capable(CAP_SYS_ADMIN)) 84 !capable(CAP_SYS_ADMIN))
85 return -EPERM; 85 return -EPERM;
@@ -130,7 +130,7 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
130 130
131 /* Check privileges */ 131 /* Check privileges */
132 if (cmd == Q_XGETQUOTA) { 132 if (cmd == Q_XGETQUOTA) {
133 if (((type == XQM_USRQUOTA && current->euid != id) || 133 if (((type == XQM_USRQUOTA && current_euid() != id) ||
134 (type == XQM_GRPQUOTA && !in_egroup_p(id))) && 134 (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
135 !capable(CAP_SYS_ADMIN)) 135 !capable(CAP_SYS_ADMIN))
136 return -EPERM; 136 return -EPERM;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f031d1c925f..a83a3518ae3 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -55,8 +55,8 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
55 55
56 if (inode) { 56 if (inode) {
57 inode->i_mode = mode; 57 inode->i_mode = mode;
58 inode->i_uid = current->fsuid; 58 inode->i_uid = current_fsuid();
59 inode->i_gid = current->fsgid; 59 inode->i_gid = current_fsgid();
60 inode->i_blocks = 0; 60 inode->i_blocks = 0;
61 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 62 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449..ed04f47007f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1753 struct inode *inode) 1753 struct inode *inode)
1754{ 1754{
1755 struct super_block *sb; 1755 struct super_block *sb;
1756 struct reiserfs_iget_args args;
1756 INITIALIZE_PATH(path_to_key); 1757 INITIALIZE_PATH(path_to_key);
1757 struct cpu_key key; 1758 struct cpu_key key;
1758 struct item_head ih; 1759 struct item_head ih;
@@ -1780,6 +1781,14 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1780 err = -ENOMEM; 1781 err = -ENOMEM;
1781 goto out_bad_inode; 1782 goto out_bad_inode;
1782 } 1783 }
1784 args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1785 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1786 args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
1787 if (insert_inode_locked4(inode, args.objectid,
1788 reiserfs_find_actor, &args) < 0) {
1789 err = -EINVAL;
1790 goto out_bad_inode;
1791 }
1783 if (old_format_only(sb)) 1792 if (old_format_only(sb))
1784 /* not a perfect generation count, as object ids can be reused, but 1793 /* not a perfect generation count, as object ids can be reused, but
1785 ** this is as good as reiserfs can do right now. 1794 ** this is as good as reiserfs can do right now.
@@ -1859,13 +1868,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1859 } else { 1868 } else {
1860 inode2sd(&sd, inode, inode->i_size); 1869 inode2sd(&sd, inode, inode->i_size);
1861 } 1870 }
1862 // these do not go to on-disk stat data
1863 inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
1864
1865 // store in in-core inode the key of stat data and version all 1871 // store in in-core inode the key of stat data and version all
1866 // object items will have (directory items will have old offset 1872 // object items will have (directory items will have old offset
1867 // format, other new objects will consist of new items) 1873 // format, other new objects will consist of new items)
1868 memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
1869 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) 1874 if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
1870 set_inode_item_key_version(inode, KEY_FORMAT_3_5); 1875 set_inode_item_key_version(inode, KEY_FORMAT_3_5);
1871 else 1876 else
@@ -1929,7 +1934,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1929 reiserfs_mark_inode_private(inode); 1934 reiserfs_mark_inode_private(inode);
1930 } 1935 }
1931 1936
1932 insert_inode_hash(inode);
1933 reiserfs_update_sd(th, inode); 1937 reiserfs_update_sd(th, inode);
1934 reiserfs_check_path(&path_to_key); 1938 reiserfs_check_path(&path_to_key);
1935 1939
@@ -1956,6 +1960,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
1956 out_inserted_sd: 1960 out_inserted_sd:
1957 inode->i_nlink = 0; 1961 inode->i_nlink = 0;
1958 th->t_trans_id = 0; /* so the caller can't use this handle later */ 1962 th->t_trans_id = 0; /* so the caller can't use this handle later */
1963 unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
1959 1964
1960 /* If we were inheriting an ACL, we need to release the lock so that 1965 /* If we were inheriting an ACL, we need to release the lock so that
1961 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking 1966 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2561,7 @@ static int reiserfs_write_begin(struct file *file,
2556 } 2561 }
2557 2562
2558 index = pos >> PAGE_CACHE_SHIFT; 2563 index = pos >> PAGE_CACHE_SHIFT;
2559 page = __grab_cache_page(mapping, index); 2564 page = grab_cache_page_write_begin(mapping, index, flags);
2560 if (!page) 2565 if (!page)
2561 return -ENOMEM; 2566 return -ENOMEM;
2562 *pagep = page; 2567 *pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index f89ebb943f3..738967f6c8e 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -573,7 +573,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
573 /* the quota init calls have to know who to charge the quota to, so 573 /* the quota init calls have to know who to charge the quota to, so
574 ** we have to set uid and gid here 574 ** we have to set uid and gid here
575 */ 575 */
576 inode->i_uid = current->fsuid; 576 inode->i_uid = current_fsuid();
577 inode->i_mode = mode; 577 inode->i_mode = mode;
578 /* Make inode invalid - just in case we are going to drop it before 578 /* Make inode invalid - just in case we are going to drop it before
579 * the initialization happens */ 579 * the initialization happens */
@@ -584,7 +584,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
584 if (S_ISDIR(mode)) 584 if (S_ISDIR(mode))
585 inode->i_mode |= S_ISGID; 585 inode->i_mode |= S_ISGID;
586 } else { 586 } else {
587 inode->i_gid = current->fsgid; 587 inode->i_gid = current_fsgid();
588 } 588 }
589 DQUOT_INIT(inode); 589 DQUOT_INIT(inode);
590 return 0; 590 return 0;
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
646 err = journal_end(&th, dir->i_sb, jbegin_count); 646 err = journal_end(&th, dir->i_sb, jbegin_count);
647 if (err) 647 if (err)
648 retval = err; 648 retval = err;
649 unlock_new_inode(inode);
649 iput(inode); 650 iput(inode);
650 goto out_failed; 651 goto out_failed;
651 } 652 }
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
653 reiserfs_update_inode_transaction(dir); 654 reiserfs_update_inode_transaction(dir);
654 655
655 d_instantiate(dentry, inode); 656 d_instantiate(dentry, inode);
657 unlock_new_inode(inode);
656 retval = journal_end(&th, dir->i_sb, jbegin_count); 658 retval = journal_end(&th, dir->i_sb, jbegin_count);
657 659
658 out_failed: 660 out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
727 err = journal_end(&th, dir->i_sb, jbegin_count); 729 err = journal_end(&th, dir->i_sb, jbegin_count);
728 if (err) 730 if (err)
729 retval = err; 731 retval = err;
732 unlock_new_inode(inode);
730 iput(inode); 733 iput(inode);
731 goto out_failed; 734 goto out_failed;
732 } 735 }
733 736
734 d_instantiate(dentry, inode); 737 d_instantiate(dentry, inode);
738 unlock_new_inode(inode);
735 retval = journal_end(&th, dir->i_sb, jbegin_count); 739 retval = journal_end(&th, dir->i_sb, jbegin_count);
736 740
737 out_failed: 741 out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
812 err = journal_end(&th, dir->i_sb, jbegin_count); 816 err = journal_end(&th, dir->i_sb, jbegin_count);
813 if (err) 817 if (err)
814 retval = err; 818 retval = err;
819 unlock_new_inode(inode);
815 iput(inode); 820 iput(inode);
816 goto out_failed; 821 goto out_failed;
817 } 822 }
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
819 reiserfs_update_sd(&th, dir); 824 reiserfs_update_sd(&th, dir);
820 825
821 d_instantiate(dentry, inode); 826 d_instantiate(dentry, inode);
827 unlock_new_inode(inode);
822 retval = journal_end(&th, dir->i_sb, jbegin_count); 828 retval = journal_end(&th, dir->i_sb, jbegin_count);
823 out_failed: 829 out_failed:
824 if (locked) 830 if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
1096 err = journal_end(&th, parent_dir->i_sb, jbegin_count); 1102 err = journal_end(&th, parent_dir->i_sb, jbegin_count);
1097 if (err) 1103 if (err)
1098 retval = err; 1104 retval = err;
1105 unlock_new_inode(inode);
1099 iput(inode); 1106 iput(inode);
1100 goto out_failed; 1107 goto out_failed;
1101 } 1108 }
1102 1109
1103 d_instantiate(dentry, inode); 1110 d_instantiate(dentry, inode);
1111 unlock_new_inode(inode);
1104 retval = journal_end(&th, parent_dir->i_sb, jbegin_count); 1112 retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
1105 out_failed: 1113 out_failed:
1106 reiserfs_write_unlock(parent_dir->i_sb); 1114 reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eba2eabcd2b..b569ff1c4dc 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
357} 357}
358EXPORT_SYMBOL(seq_printf); 358EXPORT_SYMBOL(seq_printf);
359 359
360static char *mangle_path(char *s, char *p, char *esc) 360/**
361 * mangle_path - mangle and copy path to buffer beginning
362 * @s: buffer start
363 * @p: beginning of path in above buffer
364 * @esc: set of characters that need escaping
365 *
366 * Copy the path from @p to @s, replacing each occurrence of character from
367 * @esc with usual octal escape.
368 * Returns pointer past last written character in @s, or NULL in case of
369 * failure.
370 */
371char *mangle_path(char *s, char *p, char *esc)
361{ 372{
362 while (s <= p) { 373 while (s <= p) {
363 char c = *p++; 374 char c = *p++;
@@ -376,9 +387,16 @@ static char *mangle_path(char *s, char *p, char *esc)
376 } 387 }
377 return NULL; 388 return NULL;
378} 389}
390EXPORT_SYMBOL(mangle_path);
379 391
380/* 392/**
381 * return the absolute path of 'dentry' residing in mount 'mnt'. 393 * seq_path - seq_file interface to print a pathname
394 * @m: the seq_file handle
395 * @path: the struct path to print
396 * @esc: set of characters to escape in the output
397 *
398 * return the absolute path of 'path', as represented by the
399 * dentry / mnt pair in the path parameter.
382 */ 400 */
383int seq_path(struct seq_file *m, struct path *path, char *esc) 401int seq_path(struct seq_file *m, struct path *path, char *esc)
384{ 402{
@@ -450,7 +468,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
450 return -1; 468 return -1;
451} 469}
452 470
453int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits) 471int seq_bitmap(struct seq_file *m, const unsigned long *bits,
472 unsigned int nr_bits)
454{ 473{
455 if (m->count < m->size) { 474 if (m->count < m->size) {
456 int len = bitmap_scnprintf(m->buf + m->count, 475 int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 48da4fa6b7d..e7ddd0328dd 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -667,8 +667,7 @@ smb_make_node(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
667 667
668 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID; 668 attr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID;
669 attr.ia_mode = mode; 669 attr.ia_mode = mode;
670 attr.ia_uid = current->euid; 670 current_euid_egid(&attr.ia_uid, &attr.ia_gid);
671 attr.ia_gid = current->egid;
672 671
673 if (!new_valid_dev(dev)) 672 if (!new_valid_dev(dev))
674 return -EINVAL; 673 return -EINVAL;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a555..92d5e8ffb63 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
297 struct page **pagep, void **fsdata) 297 struct page **pagep, void **fsdata)
298{ 298{
299 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 299 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
300 *pagep = __grab_cache_page(mapping, index); 300 *pagep = grab_cache_page_write_begin(mapping, index, flags);
301 if (!*pagep) 301 if (!*pagep)
302 return -ENOMEM; 302 return -ENOMEM;
303 return 0; 303 return 0;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 3528f40ffb0..fc27fbfc539 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -586,7 +586,7 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
586 if (parse_options(mnt, raw_data)) 586 if (parse_options(mnt, raw_data))
587 goto out_bad_option; 587 goto out_bad_option;
588 } 588 }
589 mnt->mounted_uid = current->uid; 589 mnt->mounted_uid = current_uid();
590 smb_setcodepage(server, &mnt->codepage); 590 smb_setcodepage(server, &mnt->codepage);
591 591
592 /* 592 /*
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index ee536e8a649..9468168b9af 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -864,7 +864,7 @@ smb_newconn(struct smb_sb_info *server, struct smb_conn_opt *opt)
864 goto out; 864 goto out;
865 865
866 error = -EACCES; 866 error = -EACCES;
867 if (current->uid != server->mnt->mounted_uid && 867 if (current_uid() != server->mnt->mounted_uid &&
868 !capable(CAP_SYS_ADMIN)) 868 !capable(CAP_SYS_ADMIN))
869 goto out; 869 goto out;
870 870
diff --git a/fs/super.c b/fs/super.c
index 400a7608f15..ddba069d7a9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -914,7 +914,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
914 goto out_free_secdata; 914 goto out_free_secdata;
915 BUG_ON(!mnt->mnt_sb); 915 BUG_ON(!mnt->mnt_sb);
916 916
917 error = security_sb_kern_mount(mnt->mnt_sb, secdata); 917 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
918 if (error) 918 if (error)
919 goto out_sb; 919 goto out_sb;
920 920
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 115ab0d6f4b..241e9765cfa 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,9 +165,9 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
165 if (S_ISDIR(mode)) 165 if (S_ISDIR(mode))
166 mode |= S_ISGID; 166 mode |= S_ISGID;
167 } else 167 } else
168 inode->i_gid = current->fsgid; 168 inode->i_gid = current_fsgid();
169 169
170 inode->i_uid = current->fsuid; 170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 171 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 173 inode->i_blocks = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa4..3d81bf58dae 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
32 33
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
163 if (inode->i_blocks) { 164 if (inode->i_blocks) {
164 inode->i_op = &sysv_symlink_inode_operations; 165 inode->i_op = &sysv_symlink_inode_operations;
165 inode->i_mapping->a_ops = &sysv_aops; 166 inode->i_mapping->a_ops = &sysv_aops;
166 } else 167 } else {
167 inode->i_op = &sysv_fast_symlink_inode_operations; 168 inode->i_op = &sysv_fast_symlink_inode_operations;
169 nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
170 sizeof(SYSV_I(inode)->i_data) - 1);
171 }
168 } else 172 } else
169 init_special_inode(inode, inode->i_mode, rdev); 173 init_special_inode(inode, inode->i_mode, rdev);
170} 174}
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1a4973e1066..0e5e54d8292 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
32 32
33#include "ubifs.h" 33#include "ubifs.h"
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <asm/div64.h> 35#include <linux/math64.h>
36 36
37/* 37/*
38 * When pessimistic budget calculations say that there is no enough space, 38 * When pessimistic budget calculations say that there is no enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection, 39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS 40 * or committing. The below constant defines maximum number of times UBIFS
41 * repeats the operations. 41 * repeats the operations.
42 */ 42 */
43#define MAX_SHRINK_RETRIES 8 43#define MAX_MKSPC_RETRIES 3
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47 44
48/* 45/*
49 * The below constant defines amount of dirty pages which should be written 46 * The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
52#define NR_TO_WRITE 16 49#define NR_TO_WRITE 16
53 50
54/** 51/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrinked
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes. 52 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object 53 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
@@ -147,9 +120,25 @@ static int run_gc(struct ubifs_info *c)
147} 120}
148 121
149/** 122/**
123 * get_liability - calculate current liability.
124 * @c: UBIFS file-system description object
125 *
126 * This function calculates and returns current UBIFS liability, i.e. the
127 * amount of bytes UBIFS has "promised" to write to the media.
128 */
129static long long get_liability(struct ubifs_info *c)
130{
131 long long liab;
132
133 spin_lock(&c->space_lock);
134 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
135 spin_unlock(&c->space_lock);
136 return liab;
137}
138
139/**
150 * make_free_space - make more free space on the file-system. 140 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object 141 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 * 142 *
154 * This function is called when an operation cannot be budgeted because there 143 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space: 144 * is supposedly no free space. But in most cases there is some free space:
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
165 * Returns %-ENOSPC if it couldn't do more free space, and other negative error 154 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
166 * codes on failures. 155 * codes on failures.
167 */ 156 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri) 157static int make_free_space(struct ubifs_info *c)
169{ 158{
170 int err; 159 int err, retries = 0;
171 160 long long liab1, liab2;
172 /*
173 * If we have some dirty pages and inodes (liability), try to write
174 * them back unless this was tried too many times without effect
175 * already.
176 */
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
178 long long liability;
179
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184 161
185 if (ri->prev_liability >= liability) { 162 do {
186 /* Liability does not shrink, next time try GC then */ 163 liab1 = get_liability(c);
187 ri->shrink_retries += 1; 164 /*
188 if (ri->gc_retries < MAX_GC_RETRIES) 165 * We probably have some dirty pages or inodes (liability), try
189 ri->try_gc = 1; 166 * to write them back.
190 dbg_budg("liability did not shrink: retries %d of %d", 167 */
191 ri->shrink_retries, MAX_SHRINK_RETRIES); 168 dbg_budg("liability %lld, run write-back", liab1);
192 } 169 shrink_liability(c, NR_TO_WRITE);
193 170
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt); 171 liab2 = get_liability(c);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); 172 if (liab2 < liab1)
173 return -EAGAIN;
196 174
197 ri->prev_liability = liability; 175 dbg_budg("new liability %lld (not shrinked)", liab2);
198 ri->shrink_cnt += 1;
199 return -EAGAIN;
200 }
201 176
202 /* 177 /* Liability did not shrink again, try GC */
203 * Try to run garbage collector unless it was already tried too many 178 dbg_budg("Run GC");
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210
211 ri->try_gc = 0;
212 err = run_gc(c); 179 err = run_gc(c);
213 if (!err) 180 if (!err)
214 return -EAGAIN; 181 return -EAGAIN;
215 182
216 if (err == -EAGAIN) { 183 if (err != -EAGAIN && err != -ENOSPC)
217 dbg_budg("GC asked to commit"); 184 /* Some real error happened */
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err; 185 return err;
236 ri->nospc_retries += 1;
237 }
238 186
239 /* Neither GC nor write-back helped, try to commit */ 187 dbg_budg("Run commit (retries %d)", retries);
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c); 188 err = ubifs_run_commit(c);
245 if (err) 189 if (err)
246 return err; 190 return err;
247 return -EAGAIN; 191 } while (retries++ < MAX_MKSPC_RETRIES);
248 } 192
249 return -ENOSPC; 193 return -ENOSPC;
250} 194}
251 195
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
258 */ 202 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{ 204{
261 int ret; 205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
262 uint64_t idx_size; 206 long long idx_size;
263 207
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 209
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
271 * pair, nor similarly the two variables for the new index size, so we 215 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path. 216 * have to do this costly 64-bit division on fast-path.
273 */ 217 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) 218 idx_size += eff_leb_size - 1;
275 ret = idx_size + 1; 219 idx_lebs = div_u64(idx_size, eff_leb_size);
276 else
277 ret = idx_size;
278 /* 220 /*
279 * The index head is not available for the in-the-gaps method, so add an 221 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate. 222 * extra LEB to compensate.
281 */ 223 */
282 ret += 1; 224 idx_lebs += 1;
283 /* 225 if (idx_lebs < MIN_INDEX_LEBS)
284 * At present the index needs at least 2 LEBs: one for the index head 226 idx_lebs = MIN_INDEX_LEBS;
285 * and one for in-the-gaps method (which currently does not cater for 227 return idx_lebs;
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291} 228}
292 229
293/** 230/**
@@ -363,7 +300,7 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
363 */ 300 */
364static int can_use_rp(struct ubifs_info *c) 301static int can_use_rp(struct ubifs_info *c)
365{ 302{
366 if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || 303 if (current_fsuid() == c->rp_uid || capable(CAP_SYS_RESOURCE) ||
367 (c->rp_gid != 0 && in_group_p(c->rp_gid))) 304 (c->rp_gid != 0 && in_group_p(c->rp_gid)))
368 return 1; 305 return 1;
369 return 0; 306 return 0;
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
530int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) 467int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
531{ 468{
532 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); 469 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
533 int err, idx_growth, data_growth, dd_growth; 470 int err, idx_growth, data_growth, dd_growth, retried = 0;
534 struct retries_info ri;
535 471
536 ubifs_assert(req->new_page <= 1); 472 ubifs_assert(req->new_page <= 1);
537 ubifs_assert(req->dirtied_page <= 1); 473 ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
549 if (!data_growth && !dd_growth) 485 if (!data_growth && !dd_growth)
550 return 0; 486 return 0;
551 idx_growth = calc_idx_growth(c, req); 487 idx_growth = calc_idx_growth(c, req);
552 memset(&ri, 0, sizeof(struct retries_info));
553 488
554again: 489again:
555 spin_lock(&c->space_lock); 490 spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
587 return err; 522 return err;
588 } 523 }
589 524
590 err = make_free_space(c, &ri); 525 err = make_free_space(c);
526 cond_resched();
591 if (err == -EAGAIN) { 527 if (err == -EAGAIN) {
592 dbg_budg("try again"); 528 dbg_budg("try again");
593 cond_resched();
594 goto again; 529 goto again;
595 } else if (err == -ENOSPC) { 530 } else if (err == -ENOSPC) {
531 if (!retried) {
532 retried = 1;
533 dbg_budg("-ENOSPC, but anyway try once again");
534 goto again;
535 }
596 dbg_budg("FS is full, -ENOSPC"); 536 dbg_budg("FS is full, -ENOSPC");
597 c->nospace = 1; 537 c->nospace = 1;
598 if (can_use_rp(c) || c->rp_size == 0) 538 if (can_use_rp(c) || c->rp_size == 0)
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
712 * user-space. User-space application tend to expect that if the file-system 652 * user-space. User-space application tend to expect that if the file-system
713 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they 653 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
714 * are able to write a file of size N. UBIFS attaches node headers to each data 654 * are able to write a file of size N. UBIFS attaches node headers to each data
715 * node and it has to write indexind nodes as well. This introduces additional 655 * node and it has to write indexing nodes as well. This introduces additional
716 * overhead, and UBIFS it has to report sligtly less free space to meet the 656 * overhead, and UBIFS has to report slightly less free space to meet the above
717 * above expectetion. 657 * expectations.
718 * 658 *
719 * This function assumes free space is made up of uncompressed data nodes and 659 * This function assumes free space is made up of uncompressed data nodes and
720 * full index nodes (one per data node, tripled because we always allow enough 660 * full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
723 * Note, the calculation is pessimistic, which means that most of the time 663 * Note, the calculation is pessimistic, which means that most of the time
724 * UBIFS reports less space than it actually has. 664 * UBIFS reports less space than it actually has.
725 */ 665 */
726long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) 666long long ubifs_reported_space(const struct ubifs_info *c, long long free)
727{ 667{
728 int divisor, factor, f; 668 int divisor, factor, f;
729 669
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
737 * of data nodes, f - fanout. Because effective UBIFS fanout is twice 677 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
738 * as less than maximum fanout, we assume that each data node 678 * as less than maximum fanout, we assume that each data node
739 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. 679 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
740 * Note, the multiplier 3 is because UBIFS reseves thrice as more space 680 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
741 * for the index. 681 * for the index.
742 */ 682 */
743 f = c->fanout > 3 ? c->fanout >> 1 : 2; 683 f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,8 +685,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
745 divisor = UBIFS_MAX_DATA_NODE_SZ; 685 divisor = UBIFS_MAX_DATA_NODE_SZ;
746 divisor += (c->max_idx_node_sz * 3) / (f - 1); 686 divisor += (c->max_idx_node_sz * 3) / (f - 1);
747 free *= factor; 687 free *= factor;
748 do_div(free, divisor); 688 return div_u64(free, divisor);
749 return free;
750} 689}
751 690
752/** 691/**
@@ -756,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
756 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
757 * 696 *
758 * Because UBIFS may introduce substantial overhead (the index, node headers, 697 * Because UBIFS may introduce substantial overhead (the index, node headers,
759 * alighment, wastage at the end of eraseblocks, etc), it cannot report real 698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real
760 * amount of free flash space it has (well, because not all dirty space is 699 * amount of free flash space it has (well, because not all dirty space is
761 * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, 700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
762 * it would bread user expectetion about what free space is. Users seem to 701 * it would bread user expectations about what free space is. Users seem to
763 * accustomed to assume that if the file-system reports N bytes of free space, 702 * accustomed to assume that if the file-system reports N bytes of free space,
764 * they would be able to fit a file of N bytes to the FS. This almost works for 703 * they would be able to fit a file of N bytes to the FS. This almost works for
765 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
@@ -771,18 +710,9 @@ long long ubifs_get_free_space(struct ubifs_info *c)
771 long long available, outstanding, free; 710 long long available, outstanding, free;
772 711
773 spin_lock(&c->space_lock); 712 spin_lock(&c->space_lock);
774 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
775 outstanding = c->budg_data_growth + c->budg_dd_growth; 715 outstanding = c->budg_data_growth + c->budg_dd_growth;
776
777 /*
778 * Force the amount available to the total size reported if the used
779 * space is zero.
780 */
781 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
782 spin_unlock(&c->space_lock);
783 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
784 }
785
786 available = ubifs_calc_available(c, min_idx_lebs); 716 available = ubifs_calc_available(c, min_idx_lebs);
787 717
788 /* 718 /*
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10..f3a7945527f 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
470{ 470{
471 struct ubifs_idx_node *idx; 471 struct ubifs_idx_node *idx;
472 int lnum, offs, len, err = 0; 472 int lnum, offs, len, err = 0;
473 struct ubifs_debug_info *d = c->dbg;
473 474
474 c->old_zroot = *zroot; 475 d->old_zroot = *zroot;
475 476 lnum = d->old_zroot.lnum;
476 lnum = c->old_zroot.lnum; 477 offs = d->old_zroot.offs;
477 offs = c->old_zroot.offs; 478 len = d->old_zroot.len;
478 len = c->old_zroot.len;
479 479
480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); 480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
481 if (!idx) 481 if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
485 if (err) 485 if (err)
486 goto out; 486 goto out;
487 487
488 c->old_zroot_level = le16_to_cpu(idx->level); 488 d->old_zroot_level = le16_to_cpu(idx->level);
489 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); 489 d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
490out: 490out:
491 kfree(idx); 491 kfree(idx);
492 return err; 492 return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
509{ 509{
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg;
512 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key lower_key, upper_key, l_key, u_key;
513 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
514 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
525 UBIFS_IDX_NODE_SZ; 526 UBIFS_IDX_NODE_SZ;
526 527
527 /* Start at the old zroot */ 528 /* Start at the old zroot */
528 lnum = c->old_zroot.lnum; 529 lnum = d->old_zroot.lnum;
529 offs = c->old_zroot.offs; 530 offs = d->old_zroot.offs;
530 len = c->old_zroot.len; 531 len = d->old_zroot.len;
531 iip = 0; 532 iip = 0;
532 533
533 /* 534 /*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
560 if (first) { 561 if (first) {
561 first = 0; 562 first = 0;
562 /* Check root level and sqnum */ 563 /* Check root level and sqnum */
563 if (le16_to_cpu(idx->level) != c->old_zroot_level) { 564 if (le16_to_cpu(idx->level) != d->old_zroot_level) {
564 err = 2; 565 err = 2;
565 goto out_dump; 566 goto out_dump;
566 } 567 }
567 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { 568 if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
568 err = 3; 569 err = 3;
569 goto out_dump; 570 goto out_dump;
570 } 571 }
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17..11e4132f314 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
33/* Fake description object for the "none" compressor */ 33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = { 34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE, 35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression", 36 .name = "none",
37 .capi_name = "", 37 .capi_name = "",
38}; 38};
39 39
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
43static struct ubifs_compressor lzo_compr = { 43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO, 44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex, 45 .comp_mutex = &lzo_mutex,
46 .name = "LZO", 46 .name = "lzo",
47 .capi_name = "lzo", 47 .capi_name = "lzo",
48}; 48};
49#else 49#else
50static struct ubifs_compressor lzo_compr = { 50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO, 51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO", 52 .name = "lzo",
53}; 53};
54#endif 54#endif
55 55
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
108 if (compr->comp_mutex) 108 if (compr->comp_mutex)
109 mutex_lock(compr->comp_mutex); 109 mutex_lock(compr->comp_mutex);
110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, 110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
111 out_len); 111 (unsigned int *)out_len);
112 if (compr->comp_mutex) 112 if (compr->comp_mutex)
113 mutex_unlock(compr->comp_mutex); 113 mutex_unlock(compr->comp_mutex);
114 if (unlikely(err)) { 114 if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
119 } 119 }
120 120
121 /* 121 /*
122 * Presently, we just require that compression results in less data, 122 * If the data compressed only slightly, it is better to leave it
123 * rather than any defined minimum compression ratio or amount. 123 * uncompressed to improve read speed.
124 */ 124 */
125 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) 125 if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
126 goto no_compr; 126 goto no_compr;
127 127
128 return; 128 return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
172 if (compr->decomp_mutex) 172 if (compr->decomp_mutex)
173 mutex_lock(compr->decomp_mutex); 173 mutex_lock(compr->decomp_mutex);
174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, 174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
175 out_len); 175 (unsigned int *)out_len);
176 if (compr->decomp_mutex) 176 if (compr->decomp_mutex)
177 mutex_unlock(compr->decomp_mutex); 177 mutex_unlock(compr->decomp_mutex);
178 if (err) 178 if (err)
@@ -244,7 +244,7 @@ out_lzo:
244/** 244/**
245 * ubifs_compressors_exit - de-initialize UBIFS compressors. 245 * ubifs_compressors_exit - de-initialize UBIFS compressors.
246 */ 246 */
247void __exit ubifs_compressors_exit(void) 247void ubifs_compressors_exit(void)
248{ 248{
249 compr_exit(&lzo_compr); 249 compr_exit(&lzo_compr);
250 compr_exit(&zlib_compr); 250 compr_exit(&zlib_compr);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda..792c5a16c18 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
32#include "ubifs.h" 32#include "ubifs.h"
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h>
36#include <linux/math64.h>
35 37
36#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
37 39
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
596 struct rb_node *rb; 598 struct rb_node *rb;
597 struct ubifs_bud *bud; 599 struct ubifs_bud *bud;
598 struct ubifs_gced_idx_leb *idx_gc; 600 struct ubifs_gced_idx_leb *idx_gc;
601 long long available, outstanding, free;
599 602
603 ubifs_assert(spin_is_locked(&c->space_lock));
600 spin_lock(&dbg_lock); 604 spin_lock(&dbg_lock);
601 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 605 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
602 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 606 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -629,6 +633,17 @@ void dbg_dump_budg(struct ubifs_info *c)
629 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 633 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
630 idx_gc->lnum, idx_gc->unmap); 634 idx_gc->lnum, idx_gc->unmap);
631 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 635 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
636
637 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding)
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free);
632 spin_unlock(&dbg_lock); 647 spin_unlock(&dbg_lock);
633} 648}
634 649
@@ -645,7 +660,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
645 struct ubifs_lprops lp; 660 struct ubifs_lprops lp;
646 struct ubifs_lp_stats lst; 661 struct ubifs_lp_stats lst;
647 662
648 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); 663 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
664 current->pid);
649 ubifs_get_lp_stats(c, &lst); 665 ubifs_get_lp_stats(c, &lst);
650 dbg_dump_lstats(&lst); 666 dbg_dump_lstats(&lst);
651 667
@@ -656,6 +672,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
656 672
657 dbg_dump_lprop(c, &lp); 673 dbg_dump_lprop(c, &lp);
658 } 674 }
675 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
676 current->pid);
659} 677}
660 678
661void dbg_dump_lpt_info(struct ubifs_info *c) 679void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +681,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
663 int i; 681 int i;
664 682
665 spin_lock(&dbg_lock); 683 spin_lock(&dbg_lock);
684 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
666 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 685 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
667 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 686 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
668 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 687 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
@@ -684,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
684 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 703 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
685 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 704 printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
686 c->nhead_lnum, c->nhead_offs); 705 c->nhead_lnum, c->nhead_offs);
687 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); 706 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
707 c->ltab_lnum, c->ltab_offs);
688 if (c->big_lpt) 708 if (c->big_lpt)
689 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 709 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
690 c->lsave_lnum, c->lsave_offs); 710 c->lsave_lnum, c->lsave_offs);
@@ -703,9 +723,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
703 if (dbg_failure_mode) 723 if (dbg_failure_mode)
704 return; 724 return;
705 725
706 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); 726 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
707 727 current->pid, lnum);
708 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 728 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
709 if (IS_ERR(sleb)) { 729 if (IS_ERR(sleb)) {
710 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 730 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
711 return; 731 return;
@@ -721,6 +741,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
721 dbg_dump_node(c, snod->node); 741 dbg_dump_node(c, snod->node);
722 } 742 }
723 743
744 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
745 current->pid, lnum);
724 ubifs_scan_destroy(sleb); 746 ubifs_scan_destroy(sleb);
725 return; 747 return;
726} 748}
@@ -768,7 +790,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
768{ 790{
769 int i; 791 int i;
770 792
771 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", 793 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
772 current->pid, cat, heap->cnt); 794 current->pid, cat, heap->cnt);
773 for (i = 0; i < heap->cnt; i++) { 795 for (i = 0; i < heap->cnt; i++) {
774 struct ubifs_lprops *lprops = heap->arr[i]; 796 struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +799,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
777 "flags %d\n", i, lprops->lnum, lprops->hpos, 799 "flags %d\n", i, lprops->lnum, lprops->hpos,
778 lprops->free, lprops->dirty, lprops->flags); 800 lprops->free, lprops->dirty, lprops->flags);
779 } 801 }
802 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
780} 803}
781 804
782void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 805void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +807,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
784{ 807{
785 int i; 808 int i;
786 809
787 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); 810 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
788 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 811 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
789 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 812 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
790 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 813 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +826,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
803 int level; 826 int level;
804 827
805 printk(KERN_DEBUG "\n"); 828 printk(KERN_DEBUG "\n");
806 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); 829 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
807 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 830 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
808 level = znode->level; 831 level = znode->level;
809 printk(KERN_DEBUG "== Level %d ==\n", level); 832 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +838,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
815 dbg_dump_znode(c, znode); 838 dbg_dump_znode(c, znode);
816 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 839 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
817 } 840 }
818 841 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
819 printk(KERN_DEBUG "\n");
820} 842}
821 843
822static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 844static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -992,8 +1014,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
992 zbr1->offs, DBGKEY(&key)); 1014 zbr1->offs, DBGKEY(&key));
993 dbg_err("but it should have key %s according to tnc", 1015 dbg_err("but it should have key %s according to tnc",
994 DBGKEY(&zbr1->key)); 1016 DBGKEY(&zbr1->key));
995 dbg_dump_node(c, dent1); 1017 dbg_dump_node(c, dent1);
996 goto out_free; 1018 goto out_free;
997 } 1019 }
998 1020
999 key_read(c, &dent2->key, &key); 1021 key_read(c, &dent2->key, &key);
@@ -1002,8 +1024,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1002 zbr1->offs, DBGKEY(&key)); 1024 zbr1->offs, DBGKEY(&key));
1003 dbg_err("but it should have key %s according to tnc", 1025 dbg_err("but it should have key %s according to tnc",
1004 DBGKEY(&zbr2->key)); 1026 DBGKEY(&zbr2->key));
1005 dbg_dump_node(c, dent2); 1027 dbg_dump_node(c, dent2);
1006 goto out_free; 1028 goto out_free;
1007 } 1029 }
1008 1030
1009 nlen1 = le16_to_cpu(dent1->nlen); 1031 nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1042,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1020 dbg_err("bad order of colliding key %s", 1042 dbg_err("bad order of colliding key %s",
1021 DBGKEY(&key)); 1043 DBGKEY(&key));
1022 1044
1023 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1045 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1024 dbg_dump_node(c, dent1); 1046 dbg_dump_node(c, dent1);
1025 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); 1047 ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
1026 dbg_dump_node(c, dent2); 1048 dbg_dump_node(c, dent2);
1027 1049
1028out_free: 1050out_free:
@@ -2097,13 +2119,13 @@ static int simple_rand(void)
2097 return (next >> 16) & 32767; 2119 return (next >> 16) & 32767;
2098} 2120}
2099 2121
2100void dbg_failure_mode_registration(struct ubifs_info *c) 2122static void failure_mode_init(struct ubifs_info *c)
2101{ 2123{
2102 struct failure_mode_info *fmi; 2124 struct failure_mode_info *fmi;
2103 2125
2104 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); 2126 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2105 if (!fmi) { 2127 if (!fmi) {
2106 dbg_err("Failed to register failure mode - no memory"); 2128 ubifs_err("Failed to register failure mode - no memory");
2107 return; 2129 return;
2108 } 2130 }
2109 fmi->c = c; 2131 fmi->c = c;
@@ -2112,7 +2134,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
2112 spin_unlock(&fmi_lock); 2134 spin_unlock(&fmi_lock);
2113} 2135}
2114 2136
2115void dbg_failure_mode_deregistration(struct ubifs_info *c) 2137static void failure_mode_exit(struct ubifs_info *c)
2116{ 2138{
2117 struct failure_mode_info *fmi, *tmp; 2139 struct failure_mode_info *fmi, *tmp;
2118 2140
@@ -2146,42 +2168,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
2146 struct ubifs_info *c = dbg_find_info(desc); 2168 struct ubifs_info *c = dbg_find_info(desc);
2147 2169
2148 if (c && dbg_failure_mode) 2170 if (c && dbg_failure_mode)
2149 return c->failure_mode; 2171 return c->dbg->failure_mode;
2150 return 0; 2172 return 0;
2151} 2173}
2152 2174
2153static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) 2175static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2154{ 2176{
2155 struct ubifs_info *c = dbg_find_info(desc); 2177 struct ubifs_info *c = dbg_find_info(desc);
2178 struct ubifs_debug_info *d;
2156 2179
2157 if (!c || !dbg_failure_mode) 2180 if (!c || !dbg_failure_mode)
2158 return 0; 2181 return 0;
2159 if (c->failure_mode) 2182 d = c->dbg;
2183 if (d->failure_mode)
2160 return 1; 2184 return 1;
2161 if (!c->fail_cnt) { 2185 if (!d->fail_cnt) {
2162 /* First call - decide delay to failure */ 2186 /* First call - decide delay to failure */
2163 if (chance(1, 2)) { 2187 if (chance(1, 2)) {
2164 unsigned int delay = 1 << (simple_rand() >> 11); 2188 unsigned int delay = 1 << (simple_rand() >> 11);
2165 2189
2166 if (chance(1, 2)) { 2190 if (chance(1, 2)) {
2167 c->fail_delay = 1; 2191 d->fail_delay = 1;
2168 c->fail_timeout = jiffies + 2192 d->fail_timeout = jiffies +
2169 msecs_to_jiffies(delay); 2193 msecs_to_jiffies(delay);
2170 dbg_rcvry("failing after %ums", delay); 2194 dbg_rcvry("failing after %ums", delay);
2171 } else { 2195 } else {
2172 c->fail_delay = 2; 2196 d->fail_delay = 2;
2173 c->fail_cnt_max = delay; 2197 d->fail_cnt_max = delay;
2174 dbg_rcvry("failing after %u calls", delay); 2198 dbg_rcvry("failing after %u calls", delay);
2175 } 2199 }
2176 } 2200 }
2177 c->fail_cnt += 1; 2201 d->fail_cnt += 1;
2178 } 2202 }
2179 /* Determine if failure delay has expired */ 2203 /* Determine if failure delay has expired */
2180 if (c->fail_delay == 1) { 2204 if (d->fail_delay == 1) {
2181 if (time_before(jiffies, c->fail_timeout)) 2205 if (time_before(jiffies, d->fail_timeout))
2182 return 0; 2206 return 0;
2183 } else if (c->fail_delay == 2) 2207 } else if (d->fail_delay == 2)
2184 if (c->fail_cnt++ < c->fail_cnt_max) 2208 if (d->fail_cnt++ < d->fail_cnt_max)
2185 return 0; 2209 return 0;
2186 if (lnum == UBIFS_SB_LNUM) { 2210 if (lnum == UBIFS_SB_LNUM) {
2187 if (write) { 2211 if (write) {
@@ -2239,7 +2263,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2239 dbg_rcvry("failing in bud LEB %d commit not running", lnum); 2263 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2240 } 2264 }
2241 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); 2265 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2242 c->failure_mode = 1; 2266 d->failure_mode = 1;
2243 dump_stack(); 2267 dump_stack();
2244 return 1; 2268 return 1;
2245} 2269}
@@ -2344,4 +2368,181 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2344 return 0; 2368 return 0;
2345} 2369}
2346 2370
2371/**
2372 * ubifs_debugging_init - initialize UBIFS debugging.
2373 * @c: UBIFS file-system description object
2374 *
2375 * This function initializes debugging-related data for the file system.
2376 * Returns zero in case of success and a negative error code in case of
2377 * failure.
2378 */
2379int ubifs_debugging_init(struct ubifs_info *c)
2380{
2381 c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
2382 if (!c->dbg)
2383 return -ENOMEM;
2384
2385 c->dbg->buf = vmalloc(c->leb_size);
2386 if (!c->dbg->buf)
2387 goto out;
2388
2389 failure_mode_init(c);
2390 return 0;
2391
2392out:
2393 kfree(c->dbg);
2394 return -ENOMEM;
2395}
2396
2397/**
2398 * ubifs_debugging_exit - free debugging data.
2399 * @c: UBIFS file-system description object
2400 */
2401void ubifs_debugging_exit(struct ubifs_info *c)
2402{
2403 failure_mode_exit(c);
2404 vfree(c->dbg->buf);
2405 kfree(c->dbg);
2406}
2407
2408/*
2409 * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
2410 * contain the stuff specific to particular file-system mounts.
2411 */
2412static struct dentry *debugfs_rootdir;
2413
2414/**
2415 * dbg_debugfs_init - initialize debugfs file-system.
2416 *
2417 * UBIFS uses debugfs file-system to expose various debugging knobs to
2418 * user-space. This function creates "ubifs" directory in the debugfs
2419 * file-system. Returns zero in case of success and a negative error code in
2420 * case of failure.
2421 */
2422int dbg_debugfs_init(void)
2423{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err);
2429 return err;
2430 }
2431
2432 return 0;
2433}
2434
2435/**
2436 * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
2437 */
2438void dbg_debugfs_exit(void)
2439{
2440 debugfs_remove(debugfs_rootdir);
2441}
2442
2443static int open_debugfs_file(struct inode *inode, struct file *file)
2444{
2445 file->private_data = inode->i_private;
2446 return 0;
2447}
2448
2449static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2450 size_t count, loff_t *ppos)
2451{
2452 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg;
2454
2455 if (file->f_path.dentry == d->dump_lprops)
2456 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) {
2458 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) {
2462 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex);
2465 } else
2466 return -EINVAL;
2467
2468 *ppos += count;
2469 return count;
2470}
2471
2472static const struct file_operations debugfs_fops = {
2473 .open = open_debugfs_file,
2474 .write = write_debugfs_file,
2475 .owner = THIS_MODULE,
2476};
2477
2478/**
2479 * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
2480 * @c: UBIFS file-system description object
2481 *
2482 * This function creates all debugfs files for this instance of UBIFS. Returns
2483 * zero in case of success and a negative error code in case of failure.
2484 *
2485 * Note, the only reason we have not merged this function with the
2486 * 'ubifs_debugging_init()' function is because it is better to initialize
2487 * debugfs interfaces at the very end of the mount process, and remove them at
2488 * the very beginning of the mount process.
2489 */
2490int dbg_debugfs_init_fs(struct ubifs_info *c)
2491{
2492 int err;
2493 const char *fname;
2494 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg;
2496
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
2499 debugfs_rootdir);
2500 if (IS_ERR(d->debugfs_dir)) {
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err);
2504 goto out;
2505 }
2506
2507 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2509 &debugfs_fops);
2510 if (IS_ERR(dent))
2511 goto out_remove;
2512 d->dump_lprops = dent;
2513
2514 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2516 &debugfs_fops);
2517 if (IS_ERR(dent))
2518 goto out_remove;
2519 d->dump_budg = dent;
2520
2521 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2523 &debugfs_fops);
2524 if (IS_ERR(dent))
2525 goto out_remove;
2526 d->dump_tnc = dent;
2527
2528 return 0;
2529
2530out_remove:
2531 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2533 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir);
2535out:
2536 return err;
2537}
2538
2539/**
2540 * dbg_debugfs_exit_fs - remove all debugfs files.
2541 * @c: UBIFS file-system description object
2542 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir);
2546}
2547
2347#endif /* CONFIG_UBIFS_FS_DEBUG */ 2548#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e..9820d6999f7 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,56 @@
25 25
26#ifdef CONFIG_UBIFS_FS_DEBUG 26#ifdef CONFIG_UBIFS_FS_DEBUG
27 27
28#define UBIFS_DBG(op) op 28/**
29 * ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
34 * @failure_mode: failure mode for recovery testing
35 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
36 * @fail_timeout: time in jiffies when delay of failure mode expires
37 * @fail_cnt: current number of calls to failure mode I/O functions
38 * @fail_cnt_max: number of calls by which to delay failure mode
39 * @chk_lpt_sz: used by LPT tree size checker
40 * @chk_lpt_sz2: used by LPT tree size checker
41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs
46 *
47 * debugfs_dir_name: name of debugfs directory containing this file-system's
48 * files
49 * debugfs_dir: direntry object of the file-system debugfs directory
50 * dump_lprops: "dump lprops" debugfs knob
51 * dump_budg: "dump budgeting information" debugfs knob
52 * dump_tnc: "dump TNC" debugfs knob
53 */
54struct ubifs_debug_info {
55 void *buf;
56 struct ubifs_zbranch old_zroot;
57 int old_zroot_level;
58 unsigned long long old_zroot_sqnum;
59 int failure_mode;
60 int fail_delay;
61 unsigned long fail_timeout;
62 unsigned int fail_cnt;
63 unsigned int fail_cnt_max;
64 long long chk_lpt_sz;
65 long long chk_lpt_sz2;
66 long long chk_lpt_wastage;
67 int chk_lpt_lebs;
68 int new_nhead_offs;
69 int new_ihead_lnum;
70 int new_ihead_offs;
71
72 char debugfs_dir_name[100];
73 struct dentry *debugfs_dir;
74 struct dentry *dump_lprops;
75 struct dentry *dump_budg;
76 struct dentry *dump_tnc;
77};
29 78
30#define ubifs_assert(expr) do { \ 79#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 80 if (unlikely(!(expr))) { \
@@ -211,14 +260,18 @@ extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags; 260extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags; 261extern unsigned int ubifs_tst_flags;
213 262
214/* Dump functions */ 263int ubifs_debugging_init(struct ubifs_info *c);
264void ubifs_debugging_exit(struct ubifs_info *c);
215 265
266/* Dump functions */
216const char *dbg_ntype(int type); 267const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state); 268const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c, 269const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key); 270 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 271void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node); 272void dbg_dump_node(const struct ubifs_info *c, const void *node);
273void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
274 int offs);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req); 275void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 276void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c); 277void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +286,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
233 struct ubifs_nnode *parent, int iip); 286 struct ubifs_nnode *parent, int iip);
234void dbg_dump_tnc(struct ubifs_info *c); 287void dbg_dump_tnc(struct ubifs_info *c);
235void dbg_dump_index(struct ubifs_info *c); 288void dbg_dump_index(struct ubifs_info *c);
289void dbg_dump_lpt_lebs(const struct ubifs_info *c);
236 290
237/* Checking helper functions */ 291/* Checking helper functions */
238
239typedef int (*dbg_leaf_callback)(struct ubifs_info *c, 292typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
240 struct ubifs_zbranch *zbr, void *priv); 293 struct ubifs_zbranch *zbr, void *priv);
241typedef int (*dbg_znode_callback)(struct ubifs_info *c, 294typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -274,9 +327,6 @@ int dbg_force_in_the_gaps(void);
274 327
275#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 328#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
276 329
277void dbg_failure_mode_registration(struct ubifs_info *c);
278void dbg_failure_mode_deregistration(struct ubifs_info *c);
279
280#ifndef UBIFS_DBG_PRESERVE_UBI 330#ifndef UBIFS_DBG_PRESERVE_UBI
281 331
282#define ubi_leb_read dbg_leb_read 332#define ubi_leb_read dbg_leb_read
@@ -318,9 +368,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
318 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); 368 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
319} 369}
320 370
321#else /* !CONFIG_UBIFS_FS_DEBUG */ 371/* Debugfs-related stuff */
372int dbg_debugfs_init(void);
373void dbg_debugfs_exit(void);
374int dbg_debugfs_init_fs(struct ubifs_info *c);
375void dbg_debugfs_exit_fs(struct ubifs_info *c);
322 376
323#define UBIFS_DBG(op) 377#else /* !CONFIG_UBIFS_FS_DEBUG */
324 378
325/* Use "if (0)" to make compiler check arguments even if debugging is off */ 379/* Use "if (0)" to make compiler check arguments even if debugging is off */
326#define ubifs_assert(expr) do { \ 380#define ubifs_assert(expr) do { \
@@ -360,23 +414,28 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
360#define DBGKEY(key) ((char *)(key)) 414#define DBGKEY(key) ((char *)(key))
361#define DBGKEY1(key) ((char *)(key)) 415#define DBGKEY1(key) ((char *)(key))
362 416
363#define dbg_ntype(type) "" 417#define ubifs_debugging_init(c) 0
364#define dbg_cstate(cmt_state) "" 418#define ubifs_debugging_exit(c) ({})
365#define dbg_get_key_dump(c, key) ({}) 419
366#define dbg_dump_inode(c, inode) ({}) 420#define dbg_ntype(type) ""
367#define dbg_dump_node(c, node) ({}) 421#define dbg_cstate(cmt_state) ""
368#define dbg_dump_budget_req(req) ({}) 422#define dbg_get_key_dump(c, key) ({})
369#define dbg_dump_lstats(lst) ({}) 423#define dbg_dump_inode(c, inode) ({})
370#define dbg_dump_budg(c) ({}) 424#define dbg_dump_node(c, node) ({})
371#define dbg_dump_lprop(c, lp) ({}) 425#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
372#define dbg_dump_lprops(c) ({}) 426#define dbg_dump_budget_req(req) ({})
373#define dbg_dump_lpt_info(c) ({}) 427#define dbg_dump_lstats(lst) ({})
374#define dbg_dump_leb(c, lnum) ({}) 428#define dbg_dump_budg(c) ({})
375#define dbg_dump_znode(c, znode) ({}) 429#define dbg_dump_lprop(c, lp) ({})
376#define dbg_dump_heap(c, heap, cat) ({}) 430#define dbg_dump_lprops(c) ({})
377#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 431#define dbg_dump_lpt_info(c) ({})
378#define dbg_dump_tnc(c) ({}) 432#define dbg_dump_leb(c, lnum) ({})
379#define dbg_dump_index(c) ({}) 433#define dbg_dump_znode(c, znode) ({})
434#define dbg_dump_heap(c, heap, cat) ({})
435#define dbg_dump_pnode(c, pnode, parent, iip) ({})
436#define dbg_dump_tnc(c) ({})
437#define dbg_dump_index(c) ({})
438#define dbg_dump_lpt_lebs(c) ({})
380 439
381#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
382#define dbg_old_index_check_init(c, zroot) 0 441#define dbg_old_index_check_init(c, zroot) 0
@@ -396,9 +455,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
396#define dbg_force_in_the_gaps_enabled 0 455#define dbg_force_in_the_gaps_enabled 0
397#define dbg_force_in_the_gaps() 0 456#define dbg_force_in_the_gaps() 0
398#define dbg_failure_mode 0 457#define dbg_failure_mode 0
399#define dbg_failure_mode_registration(c) ({})
400#define dbg_failure_mode_deregistration(c) ({})
401 458
402#endif /* !CONFIG_UBIFS_FS_DEBUG */ 459#define dbg_debugfs_init() 0
460#define dbg_debugfs_exit()
461#define dbg_debugfs_init_fs(c) 0
462#define dbg_debugfs_exit_fs(c) 0
403 463
464#endif /* !CONFIG_UBIFS_FS_DEBUG */
404#endif /* !__UBIFS_DEBUG_H__ */ 465#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 0422c98e179..f448ab1f9c3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,13 +104,13 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current->fsuid; 107 inode->i_uid = current_fsuid();
108 if (dir->i_mode & S_ISGID) { 108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid; 109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode)) 110 if (S_ISDIR(mode))
111 mode |= S_ISGID; 111 mode |= S_ISGID;
112 } else 112 } else
113 inode->i_gid = current->fsgid; 113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode; 114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 115 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 116 ubifs_current_time(inode);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d975..bf37374567f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
72 return err; 72 return err;
73 } 73 }
74 74
75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum); 75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
76 76 ubifs_inode(inode)->creat_sqnum);
77 len = le32_to_cpu(dn->size); 77 len = le32_to_cpu(dn->size);
78 if (len <= 0 || len > UBIFS_BLOCK_SIZE) 78 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
79 goto dump; 79 goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
219} 219}
220 220
221static int write_begin_slow(struct address_space *mapping, 221static int write_begin_slow(struct address_space *mapping,
222 loff_t pos, unsigned len, struct page **pagep) 222 loff_t pos, unsigned len, struct page **pagep,
223 unsigned flags)
223{ 224{
224 struct inode *inode = mapping->host; 225 struct inode *inode = mapping->host;
225 struct ubifs_info *c = inode->i_sb->s_fs_info; 226 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
247 if (unlikely(err)) 248 if (unlikely(err))
248 return err; 249 return err;
249 250
250 page = __grab_cache_page(mapping, index); 251 page = grab_cache_page_write_begin(mapping, index, flags);
251 if (unlikely(!page)) { 252 if (unlikely(!page)) {
252 ubifs_release_budget(c, &req); 253 ubifs_release_budget(c, &req);
253 return -ENOMEM; 254 return -ENOMEM;
254 } 255 }
255 256
256 if (!PageUptodate(page)) { 257 if (!PageUptodate(page)) {
257 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 258 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
258 SetPageChecked(page); 259 SetPageChecked(page);
259 else { 260 else {
260 err = do_readpage(page); 261 err = do_readpage(page);
@@ -438,13 +439,13 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
441 page = __grab_cache_page(mapping, index); 442 page = grab_cache_page_write_begin(mapping, index, flags);
442 if (unlikely(!page)) 443 if (unlikely(!page))
443 return -ENOMEM; 444 return -ENOMEM;
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
483 unlock_page(page); 484 unlock_page(page);
484 page_cache_release(page); 485 page_cache_release(page);
485 486
486 return write_begin_slow(mapping, pos, len, pagep); 487 return write_begin_slow(mapping, pos, len, pagep, flags);
487 } 488 }
488 489
489 /* 490 /*
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe969..6db7a6be6c9 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
154 case FS_IOC_GETFLAGS: 154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags); 155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156 156
157 dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
157 return put_user(flags, (int __user *) arg); 158 return put_user(flags, (int __user *) arg);
158 159
159 case FS_IOC_SETFLAGS: { 160 case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
176 err = mnt_want_write(file->f_path.mnt); 177 err = mnt_want_write(file->f_path.mnt);
177 if (err) 178 if (err)
178 return err; 179 return err;
180 dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
179 err = setflags(inode, flags); 181 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt); 182 mnt_drop_write(file->f_path.mnt);
181 return err; 183 return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908e..10ae25b7d1d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
704 data->size = cpu_to_le32(len); 704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data); 705 zero_data_node_unused(data);
706 706
707 if (!(ui->flags && UBIFS_COMPR_FL)) 707 if (!(ui->flags & UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */ 708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE; 709 compr_type = UBIFS_COMPR_NONE;
710 else 710 else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1220 data_key_init(c, &key, inum, blk); 1220 data_key_init(c, &key, inum, blk);
1221 1221
1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1); 1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); 1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
1224 data_key_init(c, &to_key, inum, blk); 1224 data_key_init(c, &to_key, inum, blk);
1225 1225
1226 err = ubifs_tnc_remove_range(c, &key, &to_key); 1226 err = ubifs_tnc_remove_range(c, &key, &to_key);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c..efb3430a258 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
38#define __UBIFS_KEY_H__ 38#define __UBIFS_KEY_H__
39 39
40/** 40/**
41 * key_mask_hash - mask a valid hash value.
42 * @val: value to be masked
43 *
44 * We use hash values as offset in directories, so values %0 and %1 are
45 * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
46 * function makes sure the reserved values are not used.
47 */
48static inline uint32_t key_mask_hash(uint32_t hash)
49{
50 hash &= UBIFS_S_KEY_HASH_MASK;
51 if (unlikely(hash <= 2))
52 hash += 3;
53 return hash;
54}
55
56/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs). 57 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name 58 * @s: direntry name
43 * @len: name length 59 * @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
54 str++; 70 str++;
55 } 71 }
56 72
57 a &= UBIFS_S_KEY_HASH_MASK; 73 return key_mask_hash(a);
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67} 74}
68 75
69/** 76/**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
77 84
78 len = min_t(uint32_t, len, 4); 85 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len); 86 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK; 87 return key_mask_hash(a);
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84} 88}
85 89
86/** 90/**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70..dfd2bcece27 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
520 * @flags: new flags 520 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 521 * @idx_gc_cnt: change to the count of idx_gc list
522 * 522 *
523 * This function changes LEB properties. This function does not change a LEB 523 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. 524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
525 * the updated LEB properties on success and a negative error code on failure.
525 * 526 *
526 * This function returns a pointer to the updated LEB properties on success 527 * Note, the LEB properties may have had to be copied (due to COW) and
527 * and a negative error code on failure. N.B. the LEB properties may have had to 528 * consequently the pointer returned may not be the same as the pointer
528 * be copied (due to COW) and consequently the pointer returned may not be the 529 * passed.
529 * same as the pointer passed.
530 */ 530 */
531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, 531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
532 const struct ubifs_lprops *lp, 532 const struct ubifs_lprops *lp,
@@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c,
1088 } 1088 }
1089 } 1089 }
1090 1090
1091 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 1091 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
1092 if (IS_ERR(sleb)) { 1092 if (IS_ERR(sleb)) {
1093 /* 1093 /*
1094 * After an unclean unmount, empty and freeable LEBs 1094 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b..b2792e84d24 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
36 * can be written into a single eraseblock. In that case, garbage collection 36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other 37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are 38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists are marking the nodes in 39 * selected for garbage collection, which consists of marking the clean nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in 40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire 41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first 42 * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted. 43 * mounted.
44 */ 44 */
45 45
46#include <linux/crc16.h>
47#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h>
48#include <linux/math64.h>
48 49
49/** 50/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area. 51 * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
135int ubifs_calc_lpt_geom(struct ubifs_info *c) 136int ubifs_calc_lpt_geom(struct ubifs_info *c)
136{ 137{
137 int lebs_needed; 138 int lebs_needed;
138 uint64_t sz; 139 long long sz;
139 140
140 do_calc_lpt_geom(c); 141 do_calc_lpt_geom(c);
141 142
142 /* Verify that lpt_lebs is big enough */ 143 /* Verify that lpt_lebs is big enough */
143 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ 144 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
144 sz += c->leb_size - 1; 145 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
145 do_div(sz, c->leb_size);
146 lebs_needed = sz;
147 if (lebs_needed > c->lpt_lebs) { 146 if (lebs_needed > c->lpt_lebs) {
148 ubifs_err("too few LPT LEBs"); 147 ubifs_err("too few LPT LEBs");
149 return -EINVAL; 148 return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
156 } 155 }
157 156
158 c->check_lpt_free = c->big_lpt; 157 c->check_lpt_free = c->big_lpt;
159
160 return 0; 158 return 0;
161} 159}
162 160
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
176 int *big_lpt) 174 int *big_lpt)
177{ 175{
178 int i, lebs_needed; 176 int i, lebs_needed;
179 uint64_t sz; 177 long long sz;
180 178
181 /* Start by assuming the minimum number of LPT LEBs */ 179 /* Start by assuming the minimum number of LPT LEBs */
182 c->lpt_lebs = UBIFS_MIN_LPT_LEBS; 180 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
203 /* Now check there are enough LPT LEBs */ 201 /* Now check there are enough LPT LEBs */
204 for (i = 0; i < 64 ; i++) { 202 for (i = 0; i < 64 ; i++) {
205 sz = c->lpt_sz * 4; /* Allow 4 times the size */ 203 sz = c->lpt_sz * 4; /* Allow 4 times the size */
206 sz += c->leb_size - 1; 204 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
207 do_div(sz, c->leb_size);
208 lebs_needed = sz;
209 if (lebs_needed > c->lpt_lebs) { 205 if (lebs_needed > c->lpt_lebs) {
210 /* Not enough LPT LEBs so try again with more */ 206 /* Not enough LPT LEBs so try again with more */
211 c->lpt_lebs = lebs_needed; 207 c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
558 * This function calculates and returns the nnode number based on the parent's 554 * This function calculates and returns the nnode number based on the parent's
559 * nnode number and the index in parent. 555 * nnode number and the index in parent.
560 */ 556 */
561static int calc_nnode_num_from_parent(struct ubifs_info *c, 557static int calc_nnode_num_from_parent(const struct ubifs_info *c,
562 struct ubifs_nnode *parent, int iip) 558 struct ubifs_nnode *parent, int iip)
563{ 559{
564 int num, shft; 560 int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
583 * This function calculates and returns the pnode number based on the parent's 579 * This function calculates and returns the pnode number based on the parent's
584 * nnode number and the index in parent. 580 * nnode number and the index in parent.
585 */ 581 */
586static int calc_pnode_num_from_parent(struct ubifs_info *c, 582static int calc_pnode_num_from_parent(const struct ubifs_info *c,
587 struct ubifs_nnode *parent, int iip) 583 struct ubifs_nnode *parent, int iip)
588{ 584{
589 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; 585 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
966 * 962 *
967 * This function returns %0 on success and a negative error code on failure. 963 * This function returns %0 on success and a negative error code on failure.
968 */ 964 */
969static int unpack_pnode(struct ubifs_info *c, void *buf, 965static int unpack_pnode(const struct ubifs_info *c, void *buf,
970 struct ubifs_pnode *pnode) 966 struct ubifs_pnode *pnode)
971{ 967{
972 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 968 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
996} 992}
997 993
998/** 994/**
999 * unpack_nnode - unpack a nnode. 995 * ubifs_unpack_nnode - unpack a nnode.
1000 * @c: UBIFS file-system description object 996 * @c: UBIFS file-system description object
1001 * @buf: buffer containing packed nnode to unpack 997 * @buf: buffer containing packed nnode to unpack
1002 * @nnode: nnode structure to fill 998 * @nnode: nnode structure to fill
1003 * 999 *
1004 * This function returns %0 on success and a negative error code on failure. 1000 * This function returns %0 on success and a negative error code on failure.
1005 */ 1001 */
1006static int unpack_nnode(struct ubifs_info *c, void *buf, 1002int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1007 struct ubifs_nnode *nnode) 1003 struct ubifs_nnode *nnode)
1008{ 1004{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1005 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err; 1006 int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
1036 * 1032 *
1037 * This function returns %0 on success and a negative error code on failure. 1033 * This function returns %0 on success and a negative error code on failure.
1038 */ 1034 */
1039static int unpack_ltab(struct ubifs_info *c, void *buf) 1035static int unpack_ltab(const struct ubifs_info *c, void *buf)
1040{ 1036{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1037 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err; 1038 int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
1068 * 1064 *
1069 * This function returns %0 on success and a negative error code on failure. 1065 * This function returns %0 on success and a negative error code on failure.
1070 */ 1066 */
1071static int unpack_lsave(struct ubifs_info *c, void *buf) 1067static int unpack_lsave(const struct ubifs_info *c, void *buf)
1072{ 1068{
1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1069 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1074 int i, pos = 0, err; 1070 int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
1096 * 1092 *
1097 * This function returns %0 on success and a negative error code on failure. 1093 * This function returns %0 on success and a negative error code on failure.
1098 */ 1094 */
1099static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, 1095static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
1100 struct ubifs_nnode *parent, int iip) 1096 struct ubifs_nnode *parent, int iip)
1101{ 1097{
1102 int i, lvl, max_offs; 1098 int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1140 * 1136 *
1141 * This function returns %0 on success and a negative error code on failure. 1137 * This function returns %0 on success and a negative error code on failure.
1142 */ 1138 */
1143static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 1139static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
1144 struct ubifs_nnode *parent, int iip) 1140 struct ubifs_nnode *parent, int iip)
1145{ 1141{
1146 int i; 1142 int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1174 * This function calculates the LEB numbers for the LEB properties it contains 1170 * This function calculates the LEB numbers for the LEB properties it contains
1175 * based on the pnode number. 1171 * based on the pnode number.
1176 */ 1172 */
1177static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) 1173static void set_pnode_lnum(const struct ubifs_info *c,
1174 struct ubifs_pnode *pnode)
1178{ 1175{
1179 int i, lnum; 1176 int i, lnum;
1180 1177
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1227 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); 1224 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1228 if (err) 1225 if (err)
1229 goto out; 1226 goto out;
1230 err = unpack_nnode(c, buf, nnode); 1227 err = ubifs_unpack_nnode(c, buf, nnode);
1231 if (err) 1228 if (err)
1232 goto out; 1229 goto out;
1233 } 1230 }
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1816 c->nnode_sz); 1813 c->nnode_sz);
1817 if (err) 1814 if (err)
1818 return ERR_PTR(err); 1815 return ERR_PTR(err);
1819 err = unpack_nnode(c, buf, nnode); 1816 err = ubifs_unpack_nnode(c, buf, nnode);
1820 if (err) 1817 if (err)
1821 return ERR_PTR(err); 1818 return ERR_PTR(err);
1822 } 1819 }
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b4278..96ca9570717 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " 320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
322 dbg_dump_lpt_info(c); 322 dbg_dump_lpt_info(c);
323 dbg_dump_lpt_lebs(c);
324 dump_stack();
323 return err; 325 return err;
324} 326}
325 327
@@ -546,8 +548,10 @@ static int write_cnodes(struct ubifs_info *c)
546no_space: 548no_space:
547 ubifs_err("LPT out of space mismatch"); 549 ubifs_err("LPT out of space mismatch");
548 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " 550 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
549 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 551 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
550 dbg_dump_lpt_info(c); 552 dbg_dump_lpt_info(c);
553 dbg_dump_lpt_lebs(c);
554 dump_stack();
551 return err; 555 return err;
552} 556}
553 557
@@ -749,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
749 * LPT trivial garbage collection is where a LPT LEB contains only dirty and 753 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
750 * free space and so may be reused as soon as the next commit is completed. 754 * free space and so may be reused as soon as the next commit is completed.
751 * This function is called after the commit is completed (master node has been 755 * This function is called after the commit is completed (master node has been
752 * written) and unmaps LPT LEBs that were marked for trivial GC. 756 * written) and un-maps LPT LEBs that were marked for trivial GC.
753 */ 757 */
754static int lpt_tgc_end(struct ubifs_info *c) 758static int lpt_tgc_end(struct ubifs_info *c)
755{ 759{
@@ -1025,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
1025 * @c: UBIFS file-system description object 1029 * @c: UBIFS file-system description object
1026 * @node_type: LPT node type 1030 * @node_type: LPT node type
1027 */ 1031 */
1028static int get_lpt_node_len(struct ubifs_info *c, int node_type) 1032static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
1029{ 1033{
1030 switch (node_type) { 1034 switch (node_type) {
1031 case UBIFS_LPT_NNODE: 1035 case UBIFS_LPT_NNODE:
@@ -1046,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
1046 * @buf: buffer 1050 * @buf: buffer
1047 * @len: length of buffer 1051 * @len: length of buffer
1048 */ 1052 */
1049static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) 1053static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
1050{ 1054{
1051 int offs, pad_len; 1055 int offs, pad_len;
1052 1056
@@ -1063,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1063 * @buf: buffer 1067 * @buf: buffer
1064 * @node_num: node number is returned here 1068 * @node_num: node number is returned here
1065 */ 1069 */
1066static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) 1070static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
1071 int *node_num)
1067{ 1072{
1068 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1069 int pos = 0, node_type; 1074 int pos = 0, node_type;
@@ -1081,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1081 * 1086 *
1082 * This function returns %1 if the buffer contains a node or %0 if it does not. 1087 * This function returns %1 if the buffer contains a node or %0 if it does not.
1083 */ 1088 */
1084static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) 1089static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
1085{ 1090{
1086 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1091 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1087 int pos = 0, node_type, node_len; 1092 int pos = 0, node_type, node_len;
@@ -1105,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1105 return 1; 1110 return 1;
1106} 1111}
1107 1112
1108
1109/** 1113/**
1110 * lpt_gc_lnum - garbage collect a LPT LEB. 1114 * lpt_gc_lnum - garbage collect a LPT LEB.
1111 * @c: UBIFS file-system description object 1115 * @c: UBIFS file-system description object
@@ -1463,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1463#ifdef CONFIG_UBIFS_FS_DEBUG 1467#ifdef CONFIG_UBIFS_FS_DEBUG
1464 1468
1465/** 1469/**
1466 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. 1470 * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
1467 * @buf: buffer 1471 * @buf: buffer
1468 * @len: buffer length 1472 * @len: buffer length
1469 */ 1473 */
@@ -1488,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1488 struct ubifs_nnode *nnode; 1492 struct ubifs_nnode *nnode;
1489 int hght; 1493 int hght;
1490 1494
1491 /* Entire tree is in memory so first_nnode / next_nnode are ok */ 1495 /* Entire tree is in memory so first_nnode / next_nnode are OK */
1492 nnode = first_nnode(c, &hght); 1496 nnode = first_nnode(c, &hght);
1493 for (; nnode; nnode = next_nnode(c, nnode, &hght)) { 1497 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1494 struct ubifs_nbranch *branch; 1498 struct ubifs_nbranch *branch;
@@ -1602,7 +1606,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1602{ 1606{
1603 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1607 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1604 int ret; 1608 int ret;
1605 void *buf = c->dbg_buf; 1609 void *buf = c->dbg->buf;
1610
1611 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1612 return 0;
1606 1613
1607 dbg_lp("LEB %d", lnum); 1614 dbg_lp("LEB %d", lnum);
1608 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1615 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1711,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1704 long long free = 0; 1711 long long free = 0;
1705 int i; 1712 int i;
1706 1713
1714 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1715 return 0;
1716
1707 for (i = 0; i < c->lpt_lebs; i++) { 1717 for (i = 0; i < c->lpt_lebs; i++) {
1708 if (c->ltab[i].tgc || c->ltab[i].cmt) 1718 if (c->ltab[i].tgc || c->ltab[i].cmt)
1709 continue; 1719 continue;
@@ -1716,6 +1726,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1716 dbg_err("LPT space error: free %lld lpt_sz %lld", 1726 dbg_err("LPT space error: free %lld lpt_sz %lld",
1717 free, c->lpt_sz); 1727 free, c->lpt_sz);
1718 dbg_dump_lpt_info(c); 1728 dbg_dump_lpt_info(c);
1729 dbg_dump_lpt_lebs(c);
1730 dump_stack();
1719 return -EINVAL; 1731 return -EINVAL;
1720 } 1732 }
1721 return 0; 1733 return 0;
@@ -1731,15 +1743,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1731 */ 1743 */
1732int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1744int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1733{ 1745{
1746 struct ubifs_debug_info *d = c->dbg;
1734 long long chk_lpt_sz, lpt_sz; 1747 long long chk_lpt_sz, lpt_sz;
1735 int err = 0; 1748 int err = 0;
1736 1749
1750 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1751 return 0;
1752
1737 switch (action) { 1753 switch (action) {
1738 case 0: 1754 case 0:
1739 c->chk_lpt_sz = 0; 1755 d->chk_lpt_sz = 0;
1740 c->chk_lpt_sz2 = 0; 1756 d->chk_lpt_sz2 = 0;
1741 c->chk_lpt_lebs = 0; 1757 d->chk_lpt_lebs = 0;
1742 c->chk_lpt_wastage = 0; 1758 d->chk_lpt_wastage = 0;
1743 if (c->dirty_pn_cnt > c->pnode_cnt) { 1759 if (c->dirty_pn_cnt > c->pnode_cnt) {
1744 dbg_err("dirty pnodes %d exceed max %d", 1760 dbg_err("dirty pnodes %d exceed max %d",
1745 c->dirty_pn_cnt, c->pnode_cnt); 1761 c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1768,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1752 } 1768 }
1753 return err; 1769 return err;
1754 case 1: 1770 case 1:
1755 c->chk_lpt_sz += len; 1771 d->chk_lpt_sz += len;
1756 return 0; 1772 return 0;
1757 case 2: 1773 case 2:
1758 c->chk_lpt_sz += len; 1774 d->chk_lpt_sz += len;
1759 c->chk_lpt_wastage += len; 1775 d->chk_lpt_wastage += len;
1760 c->chk_lpt_lebs += 1; 1776 d->chk_lpt_lebs += 1;
1761 return 0; 1777 return 0;
1762 case 3: 1778 case 3:
1763 chk_lpt_sz = c->leb_size; 1779 chk_lpt_sz = c->leb_size;
1764 chk_lpt_sz *= c->chk_lpt_lebs; 1780 chk_lpt_sz *= d->chk_lpt_lebs;
1765 chk_lpt_sz += len - c->nhead_offs; 1781 chk_lpt_sz += len - c->nhead_offs;
1766 if (c->chk_lpt_sz != chk_lpt_sz) { 1782 if (d->chk_lpt_sz != chk_lpt_sz) {
1767 dbg_err("LPT wrote %lld but space used was %lld", 1783 dbg_err("LPT wrote %lld but space used was %lld",
1768 c->chk_lpt_sz, chk_lpt_sz); 1784 d->chk_lpt_sz, chk_lpt_sz);
1769 err = -EINVAL; 1785 err = -EINVAL;
1770 } 1786 }
1771 if (c->chk_lpt_sz > c->lpt_sz) { 1787 if (d->chk_lpt_sz > c->lpt_sz) {
1772 dbg_err("LPT wrote %lld but lpt_sz is %lld", 1788 dbg_err("LPT wrote %lld but lpt_sz is %lld",
1773 c->chk_lpt_sz, c->lpt_sz); 1789 d->chk_lpt_sz, c->lpt_sz);
1774 err = -EINVAL; 1790 err = -EINVAL;
1775 } 1791 }
1776 if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { 1792 if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
1777 dbg_err("LPT layout size %lld but wrote %lld", 1793 dbg_err("LPT layout size %lld but wrote %lld",
1778 c->chk_lpt_sz, c->chk_lpt_sz2); 1794 d->chk_lpt_sz, d->chk_lpt_sz2);
1779 err = -EINVAL; 1795 err = -EINVAL;
1780 } 1796 }
1781 if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { 1797 if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
1782 dbg_err("LPT new nhead offs: expected %d was %d", 1798 dbg_err("LPT new nhead offs: expected %d was %d",
1783 c->new_nhead_offs, len); 1799 d->new_nhead_offs, len);
1784 err = -EINVAL; 1800 err = -EINVAL;
1785 } 1801 }
1786 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; 1802 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1804,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1788 lpt_sz += c->ltab_sz; 1804 lpt_sz += c->ltab_sz;
1789 if (c->big_lpt) 1805 if (c->big_lpt)
1790 lpt_sz += c->lsave_sz; 1806 lpt_sz += c->lsave_sz;
1791 if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { 1807 if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
1792 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", 1808 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
1793 c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); 1809 d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
1794 err = -EINVAL; 1810 err = -EINVAL;
1795 } 1811 }
1796 if (err) 1812 if (err) {
1797 dbg_dump_lpt_info(c); 1813 dbg_dump_lpt_info(c);
1798 c->chk_lpt_sz2 = c->chk_lpt_sz; 1814 dbg_dump_lpt_lebs(c);
1799 c->chk_lpt_sz = 0; 1815 dump_stack();
1800 c->chk_lpt_wastage = 0; 1816 }
1801 c->chk_lpt_lebs = 0; 1817 d->chk_lpt_sz2 = d->chk_lpt_sz;
1802 c->new_nhead_offs = len; 1818 d->chk_lpt_sz = 0;
1819 d->chk_lpt_wastage = 0;
1820 d->chk_lpt_lebs = 0;
1821 d->new_nhead_offs = len;
1803 return err; 1822 return err;
1804 case 4: 1823 case 4:
1805 c->chk_lpt_sz += len; 1824 d->chk_lpt_sz += len;
1806 c->chk_lpt_wastage += len; 1825 d->chk_lpt_wastage += len;
1807 return 0; 1826 return 0;
1808 default: 1827 default:
1809 return -EINVAL; 1828 return -EINVAL;
1810 } 1829 }
1811} 1830}
1812 1831
1832/**
1833 * dbg_dump_lpt_leb - dump an LPT LEB.
1834 * @c: UBIFS file-system description object
1835 * @lnum: LEB number to dump
1836 *
1837 * This function dumps an LEB from LPT area. Nodes in this area are very
1838 * different to nodes in the main area (e.g., they do not have common headers,
1839 * they do not have 8-byte alignments, etc), so we have a separate function to
1840 * dump LPT area LEBs. Note, LPT has to be locked by the caller.
1841 */
1842static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1843{
1844 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1845 void *buf = c->dbg->buf;
1846
1847 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1848 current->pid, lnum);
1849 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1850 if (err) {
1851 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1852 return;
1853 }
1854 while (1) {
1855 offs = c->leb_size - len;
1856 if (!is_a_node(c, buf, len)) {
1857 int pad_len;
1858
1859 pad_len = get_pad_len(c, buf, len);
1860 if (pad_len) {
1861 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1862 lnum, offs, pad_len);
1863 buf += pad_len;
1864 len -= pad_len;
1865 continue;
1866 }
1867 if (len)
1868 printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
1869 lnum, offs, len);
1870 break;
1871 }
1872
1873 node_type = get_lpt_node_type(c, buf, &node_num);
1874 switch (node_type) {
1875 case UBIFS_LPT_PNODE:
1876 {
1877 node_len = c->pnode_sz;
1878 if (c->big_lpt)
1879 printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
1880 lnum, offs, node_num);
1881 else
1882 printk(KERN_DEBUG "LEB %d:%d, pnode\n",
1883 lnum, offs);
1884 break;
1885 }
1886 case UBIFS_LPT_NNODE:
1887 {
1888 int i;
1889 struct ubifs_nnode nnode;
1890
1891 node_len = c->nnode_sz;
1892 if (c->big_lpt)
1893 printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
1894 lnum, offs, node_num);
1895 else
1896 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1897 lnum, offs);
1898 err = ubifs_unpack_nnode(c, buf, &nnode);
1899 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1900 printk("%d:%d", nnode.nbranch[i].lnum,
1901 nnode.nbranch[i].offs);
1902 if (i != UBIFS_LPT_FANOUT - 1)
1903 printk(", ");
1904 }
1905 printk("\n");
1906 break;
1907 }
1908 case UBIFS_LPT_LTAB:
1909 node_len = c->ltab_sz;
1910 printk(KERN_DEBUG "LEB %d:%d, ltab\n",
1911 lnum, offs);
1912 break;
1913 case UBIFS_LPT_LSAVE:
1914 node_len = c->lsave_sz;
1915 printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
1916 break;
1917 default:
1918 ubifs_err("LPT node type %d not recognized", node_type);
1919 return;
1920 }
1921
1922 buf += node_len;
1923 len -= node_len;
1924 }
1925
1926 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1927 current->pid, lnum);
1928}
1929
1930/**
1931 * dbg_dump_lpt_lebs - dump LPT lebs.
1932 * @c: UBIFS file-system description object
1933 *
1934 * This function dumps all LPT LEBs. The caller has to make sure the LPT is
1935 * locked.
1936 */
1937void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1938{
1939 int i;
1940
1941 printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
1942 current->pid);
1943 for (i = 0; i < c->lpt_lebs; i++)
1944 dump_lpt_leb(c, i + c->lpt_first);
1945 printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
1946 current->pid);
1947}
1948
1813#endif /* CONFIG_UBIFS_FS_DEBUG */ 1949#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d452..9e6f403f170 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 900 struct ubifs_scan_leb *sleb;
901 901
902 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
903 if (IS_ERR(sleb)) { 903 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 904 err = PTR_ERR(sleb);
905 break; 905 break;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c30..ce42a7b0ca5 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the the journal heads
147 * race with eachother. This is not a problem but is does mean 147 * race with each other. This is not a problem but is does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
150 */ 150 */
@@ -656,7 +656,7 @@ out_dump:
656 * @dirty: amount of dirty space from padding and deletion nodes 656 * @dirty: amount of dirty space from padding and deletion nodes
657 * 657 *
658 * This function inserts a reference node to the replay tree and returns zero 658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure. 659 * in case of success or a negative error code in case of failure.
660 */ 660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, 661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty) 662 unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
883 * This means that we reached end of log and now 883 * This means that we reached end of log and now
884 * look to the older log data, which was already 884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS 885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to 886 * only un-maps it). So this basically means we have to
887 * exit with "end of log" code. 887 * exit with "end of log" code.
888 */ 888 */
889 err = 1; 889 err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
1062 if (err) 1062 if (err)
1063 goto out; 1063 goto out;
1064 1064
1065 /*
1066 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
1067 * to roughly estimate index growth. Things like @c->min_idx_lebs
1068 * depend on it. This means we have to initialize it to make sure
1069 * budgeting works properly.
1070 */
1071 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1072 c->budg_uncommitted_idx *= c->max_idx_node_sz;
1073
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1074 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1075 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1076 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5..e070c643d1b 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/random.h> 30#include <linux/random.h>
31#include <linux/math64.h>
31 32
32/* 33/*
33 * Default journal size in logical eraseblocks as a percent of total 34 * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; 81 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; 82 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT; 83 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes; 84 long long tmp64, main_bytes;
84 __le64 tmp_le64; 85 __le64 tmp_le64;
85 86
86 /* Some functions called from here depend on the @c->key_len filed */ 87 /* Some functions called from here depend on the @c->key_len filed */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
160 if (!sup) 161 if (!sup)
161 return -ENOMEM; 162 return -ENOMEM;
162 163
163 tmp64 = (uint64_t)max_buds * c->leb_size; 164 tmp64 = (long long)max_buds * c->leb_size;
164 if (big_lpt) 165 if (big_lpt)
165 sup_flags |= UBIFS_FLG_BIGLPT; 166 sup_flags |= UBIFS_FLG_BIGLPT;
166 167
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
179 sup->fanout = cpu_to_le32(DEFAULT_FANOUT); 180 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
180 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); 181 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
181 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); 182 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
182 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); 183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
184 if (c->mount_opts.override_compr)
185 sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
186 else
187 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
184 188
185 generate_random_uuid(sup->uuid); 189 generate_random_uuid(sup->uuid);
186 190
187 main_bytes = (uint64_t)main_lebs * c->leb_size; 191 main_bytes = (long long)main_lebs * c->leb_size;
188 tmp64 = main_bytes * DEFAULT_RP_PERCENT; 192 tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
189 do_div(tmp64, 100);
190 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
191 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
192 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
582 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; 585 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
583 c->fanout = le32_to_cpu(sup->fanout); 586 c->fanout = le32_to_cpu(sup->fanout);
584 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); 587 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
585 c->default_compr = le16_to_cpu(sup->default_compr);
586 c->rp_size = le64_to_cpu(sup->rp_size); 588 c->rp_size = le64_to_cpu(sup->rp_size);
587 c->rp_uid = le32_to_cpu(sup->rp_uid); 589 c->rp_uid = le32_to_cpu(sup->rp_uid);
588 c->rp_gid = le32_to_cpu(sup->rp_gid); 590 c->rp_gid = le32_to_cpu(sup->rp_gid);
589 sup_flags = le32_to_cpu(sup->flags); 591 sup_flags = le32_to_cpu(sup->flags);
592 if (!c->mount_opts.override_compr)
593 c->default_compr = le16_to_cpu(sup->default_compr);
590 594
591 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 595 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
592
593 memcpy(&c->uuid, &sup->uuid, 16); 596 memcpy(&c->uuid, &sup->uuid, 16);
594
595 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 597 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
596 598
597 /* Automatically increase file system size to the maximum size */ 599 /* Automatically increase file system size to the maximum size */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b..0d7564b95f8 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
34#include <linux/parser.h> 34#include <linux/parser.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h>
38#include <linux/writeback.h>
37#include "ubifs.h" 39#include "ubifs.h"
38 40
39/* 41/*
@@ -417,39 +419,54 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
417 else if (c->mount_opts.chk_data_crc == 1) 419 else if (c->mount_opts.chk_data_crc == 1)
418 seq_printf(s, ",no_chk_data_crc"); 420 seq_printf(s, ",no_chk_data_crc");
419 421
422 if (c->mount_opts.override_compr) {
423 seq_printf(s, ",compr=");
424 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
425 }
426
420 return 0; 427 return 0;
421} 428}
422 429
423static int ubifs_sync_fs(struct super_block *sb, int wait) 430static int ubifs_sync_fs(struct super_block *sb, int wait)
424{ 431{
432 int i, err;
425 struct ubifs_info *c = sb->s_fs_info; 433 struct ubifs_info *c = sb->s_fs_info;
426 int i, ret = 0, err; 434 struct writeback_control wbc = {
427 long long bud_bytes; 435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
428 436 .range_start = 0,
429 if (c->jheads) { 437 .range_end = LLONG_MAX,
430 for (i = 0; i < c->jhead_cnt; i++) { 438 .nr_to_write = LONG_MAX,
431 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 439 };
432 if (err && !ret) 440
433 ret = err; 441 if (sb->s_flags & MS_RDONLY)
434 } 442 return 0;
435 443
436 /* Commit the journal unless it has too little data */ 444 /*
437 spin_lock(&c->buds_lock); 445 * Synchronize write buffers, because 'ubifs_run_commit()' does not
438 bud_bytes = c->bud_bytes; 446 * do this if it waits for an already running commit.
439 spin_unlock(&c->buds_lock); 447 */
440 if (bud_bytes > c->leb_size) { 448 for (i = 0; i < c->jhead_cnt; i++) {
441 err = ubifs_run_commit(c); 449 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
442 if (err) 450 if (err)
443 return err; 451 return err;
444 }
445 } 452 }
446 453
447 /* 454 /*
448 * We ought to call sync for c->ubi but it does not have one. If it had 455 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
449 * it would in turn call mtd->sync, however mtd operations are 456 * pages, so synchronize them first, then commit the journal. Strictly
450 * synchronous anyway, so we don't lose any sleep here. 457 * speaking, it is not necessary to commit the journal here,
458 * synchronizing write-buffers would be enough. But committing makes
459 * UBIFS free space predictions much more accurate, so we want to let
460 * the user be able to get more accurate results of 'statfs()' after
461 * they synchronize the file system.
451 */ 462 */
452 return ret; 463 generic_sync_sb_inodes(sb, &wbc);
464
465 err = ubifs_run_commit(c);
466 if (err)
467 return err;
468
469 return ubi_sync(c->vi.ubi_num);
453} 470}
454 471
455/** 472/**
@@ -596,7 +613,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
596} 613}
597 614
598/* 615/*
599 * init_constants_late - initialize UBIFS constants. 616 * init_constants_sb - initialize UBIFS constants.
600 * @c: UBIFS file-system description object 617 * @c: UBIFS file-system description object
601 * 618 *
602 * This is a helper function which initializes various UBIFS constants after 619 * This is a helper function which initializes various UBIFS constants after
@@ -604,10 +621,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
604 * makes sure they are all right. Returns zero in case of success and a 621 * makes sure they are all right. Returns zero in case of success and a
605 * negative error code in case of failure. 622 * negative error code in case of failure.
606 */ 623 */
607static int init_constants_late(struct ubifs_info *c) 624static int init_constants_sb(struct ubifs_info *c)
608{ 625{
609 int tmp, err; 626 int tmp, err;
610 uint64_t tmp64; 627 long long tmp64;
611 628
612 c->main_bytes = (long long)c->main_lebs * c->leb_size; 629 c->main_bytes = (long long)c->main_lebs * c->leb_size;
613 c->max_znode_sz = sizeof(struct ubifs_znode) + 630 c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +651,8 @@ static int init_constants_late(struct ubifs_info *c)
634 * Make sure that the log is large enough to fit reference nodes for 651 * Make sure that the log is large enough to fit reference nodes for
635 * all buds plus one reserved LEB. 652 * all buds plus one reserved LEB.
636 */ 653 */
637 tmp64 = c->max_bud_bytes; 654 tmp64 = c->max_bud_bytes + c->leb_size - 1;
638 tmp = do_div(tmp64, c->leb_size); 655 c->max_bud_cnt = div_u64(tmp64, c->leb_size);
639 c->max_bud_cnt = tmp64 + !!tmp;
640 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); 656 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
641 tmp /= c->leb_size; 657 tmp /= c->leb_size;
642 tmp += 1; 658 tmp += 1;
@@ -672,7 +688,7 @@ static int init_constants_late(struct ubifs_info *c)
672 * Consequently, if the journal is too small, UBIFS will treat it as 688 * Consequently, if the journal is too small, UBIFS will treat it as
673 * always full. 689 * always full.
674 */ 690 */
675 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; 691 tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
676 if (c->bg_bud_bytes < tmp64) 692 if (c->bg_bud_bytes < tmp64)
677 c->bg_bud_bytes = tmp64; 693 c->bg_bud_bytes = tmp64;
678 if (c->max_bud_bytes < tmp64 + c->leb_size) 694 if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +698,21 @@ static int init_constants_late(struct ubifs_info *c)
682 if (err) 698 if (err)
683 return err; 699 return err;
684 700
701 return 0;
702}
703
704/*
705 * init_constants_master - initialize UBIFS constants.
706 * @c: UBIFS file-system description object
707 *
708 * This is a helper function which initializes various UBIFS constants after
709 * the master node has been read. It also checks various UBIFS parameters and
710 * makes sure they are all right.
711 */
712static void init_constants_master(struct ubifs_info *c)
713{
714 long long tmp64;
715
685 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 716 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
686 717
687 /* 718 /*
@@ -690,14 +721,13 @@ static int init_constants_late(struct ubifs_info *c)
690 * necessary to report something for the 'statfs()' call. 721 * necessary to report something for the 'statfs()' call.
691 * 722 *
692 * Subtract the LEB reserved for GC, the LEB which is reserved for 723 * Subtract the LEB reserved for GC, the LEB which is reserved for
693 * deletions, and assume only one journal head is available. 724 * deletions, minimum LEBs for the index, and assume only one journal
725 * head is available.
694 */ 726 */
695 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; 727 tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
696 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; 728 tmp64 *= (long long)c->leb_size - c->leb_overhead;
697 tmp64 = ubifs_reported_space(c, tmp64); 729 tmp64 = ubifs_reported_space(c, tmp64);
698 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 730 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
699
700 return 0;
701} 731}
702 732
703/** 733/**
@@ -878,6 +908,7 @@ static int check_volume_empty(struct ubifs_info *c)
878 * Opt_no_bulk_read: disable bulk-reads 908 * Opt_no_bulk_read: disable bulk-reads
879 * Opt_chk_data_crc: check CRCs when reading data nodes 909 * Opt_chk_data_crc: check CRCs when reading data nodes
880 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes 910 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
911 * Opt_override_compr: override default compressor
881 * Opt_err: just end of array marker 912 * Opt_err: just end of array marker
882 */ 913 */
883enum { 914enum {
@@ -887,6 +918,7 @@ enum {
887 Opt_no_bulk_read, 918 Opt_no_bulk_read,
888 Opt_chk_data_crc, 919 Opt_chk_data_crc,
889 Opt_no_chk_data_crc, 920 Opt_no_chk_data_crc,
921 Opt_override_compr,
890 Opt_err, 922 Opt_err,
891}; 923};
892 924
@@ -897,6 +929,7 @@ static const match_table_t tokens = {
897 {Opt_no_bulk_read, "no_bulk_read"}, 929 {Opt_no_bulk_read, "no_bulk_read"},
898 {Opt_chk_data_crc, "chk_data_crc"}, 930 {Opt_chk_data_crc, "chk_data_crc"},
899 {Opt_no_chk_data_crc, "no_chk_data_crc"}, 931 {Opt_no_chk_data_crc, "no_chk_data_crc"},
932 {Opt_override_compr, "compr=%s"},
900 {Opt_err, NULL}, 933 {Opt_err, NULL},
901}; 934};
902 935
@@ -950,6 +983,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
950 c->mount_opts.chk_data_crc = 1; 983 c->mount_opts.chk_data_crc = 1;
951 c->no_chk_data_crc = 1; 984 c->no_chk_data_crc = 1;
952 break; 985 break;
986 case Opt_override_compr:
987 {
988 char *name = match_strdup(&args[0]);
989
990 if (!name)
991 return -ENOMEM;
992 if (!strcmp(name, "none"))
993 c->mount_opts.compr_type = UBIFS_COMPR_NONE;
994 else if (!strcmp(name, "lzo"))
995 c->mount_opts.compr_type = UBIFS_COMPR_LZO;
996 else if (!strcmp(name, "zlib"))
997 c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
998 else {
999 ubifs_err("unknown compressor \"%s\"", name);
1000 kfree(name);
1001 return -EINVAL;
1002 }
1003 kfree(name);
1004 c->mount_opts.override_compr = 1;
1005 c->default_compr = c->mount_opts.compr_type;
1006 break;
1007 }
953 default: 1008 default:
954 ubifs_err("unrecognized mount option \"%s\" " 1009 ubifs_err("unrecognized mount option \"%s\" "
955 "or missing value", p); 1010 "or missing value", p);
@@ -1019,6 +1074,30 @@ again:
1019} 1074}
1020 1075
1021/** 1076/**
1077 * check_free_space - check if there is enough free space to mount.
1078 * @c: UBIFS file-system description object
1079 *
1080 * This function makes sure UBIFS has enough free space to be mounted in
1081 * read/write mode. UBIFS must always have some free space to allow deletions.
1082 */
1083static int check_free_space(struct ubifs_info *c)
1084{
1085 ubifs_assert(c->dark_wm > 0);
1086 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1087 ubifs_err("insufficient free space to mount in read/write mode");
1088 dbg_dump_budg(c);
1089 dbg_dump_lprops(c);
1090 /*
1091 * We return %-EINVAL instead of %-ENOSPC because it seems to
1092 * be the closest error code mentioned in the mount function
1093 * documentation.
1094 */
1095 return -EINVAL;
1096 }
1097 return 0;
1098}
1099
1100/**
1022 * mount_ubifs - mount UBIFS file-system. 1101 * mount_ubifs - mount UBIFS file-system.
1023 * @c: UBIFS file-system description object 1102 * @c: UBIFS file-system description object
1024 * 1103 *
@@ -1039,11 +1118,9 @@ static int mount_ubifs(struct ubifs_info *c)
1039 if (err) 1118 if (err)
1040 return err; 1119 return err;
1041 1120
1042#ifdef CONFIG_UBIFS_FS_DEBUG 1121 err = ubifs_debugging_init(c);
1043 c->dbg_buf = vmalloc(c->leb_size); 1122 if (err)
1044 if (!c->dbg_buf) 1123 return err;
1045 return -ENOMEM;
1046#endif
1047 1124
1048 err = check_volume_empty(c); 1125 err = check_volume_empty(c);
1049 if (err) 1126 if (err)
@@ -1100,27 +1177,25 @@ static int mount_ubifs(struct ubifs_info *c)
1100 goto out_free; 1177 goto out_free;
1101 1178
1102 /* 1179 /*
1103 * Make sure the compressor which is set as the default on in the 1180 * Make sure the compressor which is set as default in the superblock
1104 * superblock was actually compiled in. 1181 * or overridden by mount options is actually compiled in.
1105 */ 1182 */
1106 if (!ubifs_compr_present(c->default_compr)) { 1183 if (!ubifs_compr_present(c->default_compr)) {
1107 ubifs_warn("'%s' compressor is set by superblock, but not " 1184 ubifs_err("'compressor \"%s\" is not compiled in",
1108 "compiled in", ubifs_compr_name(c->default_compr)); 1185 ubifs_compr_name(c->default_compr));
1109 c->default_compr = UBIFS_COMPR_NONE; 1186 goto out_free;
1110 } 1187 }
1111 1188
1112 dbg_failure_mode_registration(c); 1189 err = init_constants_sb(c);
1113
1114 err = init_constants_late(c);
1115 if (err) 1190 if (err)
1116 goto out_dereg; 1191 goto out_free;
1117 1192
1118 sz = ALIGN(c->max_idx_node_sz, c->min_io_size); 1193 sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
1119 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); 1194 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
1120 c->cbuf = kmalloc(sz, GFP_NOFS); 1195 c->cbuf = kmalloc(sz, GFP_NOFS);
1121 if (!c->cbuf) { 1196 if (!c->cbuf) {
1122 err = -ENOMEM; 1197 err = -ENOMEM;
1123 goto out_dereg; 1198 goto out_free;
1124 } 1199 }
1125 1200
1126 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1201 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1220,8 @@ static int mount_ubifs(struct ubifs_info *c)
1145 if (err) 1220 if (err)
1146 goto out_master; 1221 goto out_master;
1147 1222
1223 init_constants_master(c);
1224
1148 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1225 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1149 ubifs_msg("recovery needed"); 1226 ubifs_msg("recovery needed");
1150 c->need_recovery = 1; 1227 c->need_recovery = 1;
@@ -1183,12 +1260,9 @@ static int mount_ubifs(struct ubifs_info *c)
1183 if (!mounted_read_only) { 1260 if (!mounted_read_only) {
1184 int lnum; 1261 int lnum;
1185 1262
1186 /* Check for enough free space */ 1263 err = check_free_space(c);
1187 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1264 if (err)
1188 ubifs_err("insufficient available space");
1189 err = -EINVAL;
1190 goto out_orphans; 1265 goto out_orphans;
1191 }
1192 1266
1193 /* Check for enough log space */ 1267 /* Check for enough log space */
1194 lnum = c->lhead_lnum + 1; 1268 lnum = c->lhead_lnum + 1;
@@ -1232,6 +1306,10 @@ static int mount_ubifs(struct ubifs_info *c)
1232 } 1306 }
1233 } 1307 }
1234 1308
1309 err = dbg_debugfs_init_fs(c);
1310 if (err)
1311 goto out_infos;
1312
1235 err = dbg_check_filesystem(c); 1313 err = dbg_check_filesystem(c);
1236 if (err) 1314 if (err)
1237 goto out_infos; 1315 goto out_infos;
@@ -1283,8 +1361,20 @@ static int mount_ubifs(struct ubifs_info *c)
1283 dbg_msg("tree fanout: %d", c->fanout); 1361 dbg_msg("tree fanout: %d", c->fanout);
1284 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1362 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
1285 dbg_msg("first main LEB: %d", c->main_first); 1363 dbg_msg("first main LEB: %d", c->main_first);
1364 dbg_msg("max. znode size %d", c->max_znode_sz);
1365 dbg_msg("max. index node size %d", c->max_idx_node_sz);
1366 dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
1367 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
1368 dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
1369 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1370 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1371 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1372 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
1373 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1374 UBIFS_MAX_DENT_NODE_SZ);
1286 dbg_msg("dead watermark: %d", c->dead_wm); 1375 dbg_msg("dead watermark: %d", c->dead_wm);
1287 dbg_msg("dark watermark: %d", c->dark_wm); 1376 dbg_msg("dark watermark: %d", c->dark_wm);
1377 dbg_msg("LEB overhead: %d", c->leb_overhead);
1288 x = (long long)c->main_lebs * c->dark_wm; 1378 x = (long long)c->main_lebs * c->dark_wm;
1289 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", 1379 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
1290 x, x >> 10, x >> 20); 1380 x, x >> 10, x >> 20);
@@ -1320,14 +1410,12 @@ out_wbufs:
1320 free_wbufs(c); 1410 free_wbufs(c);
1321out_cbuf: 1411out_cbuf:
1322 kfree(c->cbuf); 1412 kfree(c->cbuf);
1323out_dereg:
1324 dbg_failure_mode_deregistration(c);
1325out_free: 1413out_free:
1326 kfree(c->bu.buf); 1414 kfree(c->bu.buf);
1327 vfree(c->ileb_buf); 1415 vfree(c->ileb_buf);
1328 vfree(c->sbuf); 1416 vfree(c->sbuf);
1329 kfree(c->bottom_up_buf); 1417 kfree(c->bottom_up_buf);
1330 UBIFS_DBG(vfree(c->dbg_buf)); 1418 ubifs_debugging_exit(c);
1331 return err; 1419 return err;
1332} 1420}
1333 1421
@@ -1345,6 +1433,7 @@ static void ubifs_umount(struct ubifs_info *c)
1345 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, 1433 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1346 c->vi.vol_id); 1434 c->vi.vol_id);
1347 1435
1436 dbg_debugfs_exit_fs(c);
1348 spin_lock(&ubifs_infos_lock); 1437 spin_lock(&ubifs_infos_lock);
1349 list_del(&c->infos_list); 1438 list_del(&c->infos_list);
1350 spin_unlock(&ubifs_infos_lock); 1439 spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1453,7 @@ static void ubifs_umount(struct ubifs_info *c)
1364 vfree(c->ileb_buf); 1453 vfree(c->ileb_buf);
1365 vfree(c->sbuf); 1454 vfree(c->sbuf);
1366 kfree(c->bottom_up_buf); 1455 kfree(c->bottom_up_buf);
1367 UBIFS_DBG(vfree(c->dbg_buf)); 1456 ubifs_debugging_exit(c);
1368 dbg_failure_mode_deregistration(c);
1369} 1457}
1370 1458
1371/** 1459/**
@@ -1387,12 +1475,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1387 c->remounting_rw = 1; 1475 c->remounting_rw = 1;
1388 c->always_chk_crc = 1; 1476 c->always_chk_crc = 1;
1389 1477
1390 /* Check for enough free space */ 1478 err = check_free_space(c);
1391 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1479 if (err)
1392 ubifs_err("insufficient available space");
1393 err = -EINVAL;
1394 goto out; 1480 goto out;
1395 }
1396 1481
1397 if (c->old_leb_cnt != c->leb_cnt) { 1482 if (c->old_leb_cnt != c->leb_cnt) {
1398 struct ubifs_sb_node *sup; 1483 struct ubifs_sb_node *sup;
@@ -1515,20 +1600,24 @@ out:
1515 * @c: UBIFS file-system description object 1600 * @c: UBIFS file-system description object
1516 * 1601 *
1517 * This function is called during un-mounting and re-mounting, and it commits 1602 * This function is called during un-mounting and re-mounting, and it commits
1518 * the journal unless the "fast unmount" mode is enabled. It also avoids 1603 * the journal unless the "fast unmount" mode is enabled.
1519 * committing the journal if it contains too few data.
1520 */ 1604 */
1521static void commit_on_unmount(struct ubifs_info *c) 1605static void commit_on_unmount(struct ubifs_info *c)
1522{ 1606{
1523 if (!c->fast_unmount) { 1607 struct super_block *sb = c->vfs_sb;
1524 long long bud_bytes; 1608 long long bud_bytes;
1525 1609
1526 spin_lock(&c->buds_lock); 1610 /*
1527 bud_bytes = c->bud_bytes; 1611 * This function is called before the background thread is stopped, so
1528 spin_unlock(&c->buds_lock); 1612 * we may race with ongoing commit, which means we have to take
1529 if (bud_bytes > c->leb_size) 1613 * @c->bud_lock to access @c->bud_bytes.
1530 ubifs_run_commit(c); 1614 */
1531 } 1615 spin_lock(&c->buds_lock);
1616 bud_bytes = c->bud_bytes;
1617 spin_unlock(&c->buds_lock);
1618
1619 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1620 ubifs_run_commit(c);
1532} 1621}
1533 1622
1534/** 1623/**
@@ -1849,7 +1938,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1849 goto out_iput; 1938 goto out_iput;
1850 1939
1851 mutex_unlock(&c->umount_mutex); 1940 mutex_unlock(&c->umount_mutex);
1852
1853 return 0; 1941 return 0;
1854 1942
1855out_iput: 1943out_iput:
@@ -1955,7 +2043,7 @@ static void ubifs_kill_sb(struct super_block *sb)
1955 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' 2043 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1956 * in order to be outside BKL. 2044 * in order to be outside BKL.
1957 */ 2045 */
1958 if (sb->s_root && !(sb->s_flags & MS_RDONLY)) 2046 if (sb->s_root)
1959 commit_on_unmount(c); 2047 commit_on_unmount(c);
1960 /* The un-mount routine is actually done in put_super() */ 2048 /* The un-mount routine is actually done in put_super() */
1961 generic_shutdown_super(sb); 2049 generic_shutdown_super(sb);
@@ -2021,6 +2109,14 @@ static int __init ubifs_init(void)
2021 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); 2109 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
2022 2110
2023 /* 2111 /*
2112 * We use 2 bit wide bit-fields to store compression type, which should
2113 * be amended if more compressors are added. The bit-fields are:
2114 * @compr_type in 'struct ubifs_inode', @default_compr in
2115 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
2116 */
2117 BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
2118
2119 /*
2024 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to 2120 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
2025 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 2121 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
2026 */ 2122 */
@@ -2049,11 +2145,17 @@ static int __init ubifs_init(void)
2049 2145
2050 err = ubifs_compressors_init(); 2146 err = ubifs_compressors_init();
2051 if (err) 2147 if (err)
2148 goto out_shrinker;
2149
2150 err = dbg_debugfs_init();
2151 if (err)
2052 goto out_compr; 2152 goto out_compr;
2053 2153
2054 return 0; 2154 return 0;
2055 2155
2056out_compr: 2156out_compr:
2157 ubifs_compressors_exit();
2158out_shrinker:
2057 unregister_shrinker(&ubifs_shrinker_info); 2159 unregister_shrinker(&ubifs_shrinker_info);
2058 kmem_cache_destroy(ubifs_inode_slab); 2160 kmem_cache_destroy(ubifs_inode_slab);
2059out_reg: 2161out_reg:
@@ -2068,6 +2170,7 @@ static void __exit ubifs_exit(void)
2068 ubifs_assert(list_empty(&ubifs_infos)); 2170 ubifs_assert(list_empty(&ubifs_infos));
2069 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); 2171 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
2070 2172
2173 dbg_debugfs_exit();
2071 ubifs_compressors_exit(); 2174 ubifs_compressors_exit();
2072 unregister_shrinker(&ubifs_shrinker_info); 2175 unregister_shrinker(&ubifs_shrinker_info);
2073 kmem_cache_destroy(ubifs_inode_slab); 2176 kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a14..f7e36f54552 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2245 if (found) { 2245 if (found) {
2246 /* Ensure the znode is dirtied */ 2246 /* Ensure the znode is dirtied */
2247 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2247 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2248 znode = dirty_cow_bottom_up(c, 2248 znode = dirty_cow_bottom_up(c, znode);
2249 znode); 2249 if (IS_ERR(znode)) {
2250 if (IS_ERR(znode)) { 2250 err = PTR_ERR(znode);
2251 err = PTR_ERR(znode); 2251 goto out_unlock;
2252 goto out_unlock; 2252 }
2253 }
2254 } 2253 }
2255 zbr = &znode->zbranch[n]; 2254 zbr = &znode->zbranch[n];
2256 lnc_free(zbr); 2255 lnc_free(zbr);
@@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2317 2316
2318 /* Ensure the znode is dirtied */ 2317 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2318 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode); 2319 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) { 2320 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode); 2321 err = PTR_ERR(znode);
2323 goto out_unlock; 2322 goto out_unlock;
2324 } 2323 }
2325 } 2324 }
2326 2325
2327 if (found == 1) { 2326 if (found == 1) {
@@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2627 2626
2628 /* Ensure the znode is dirtied */ 2627 /* Ensure the znode is dirtied */
2629 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2628 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2630 znode = dirty_cow_bottom_up(c, znode); 2629 znode = dirty_cow_bottom_up(c, znode);
2631 if (IS_ERR(znode)) { 2630 if (IS_ERR(znode)) {
2632 err = PTR_ERR(znode); 2631 err = PTR_ERR(znode);
2633 goto out_unlock; 2632 goto out_unlock;
2634 } 2633 }
2635 } 2634 }
2636 2635
2637 /* Remove all keys in range except the first */ 2636 /* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d5..fde8d127c76 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
553 } 553 }
554 554
555#ifdef CONFIG_UBIFS_FS_DEBUG 555#ifdef CONFIG_UBIFS_FS_DEBUG
556 c->new_ihead_lnum = lnum; 556 c->dbg->new_ihead_lnum = lnum;
557 c->new_ihead_offs = buf_offs; 557 c->dbg->new_ihead_offs = buf_offs;
558#endif 558#endif
559 559
560 return 0; 560 return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
802 * budgeting subsystem to assume the index is already committed, 802 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 803 * even though it is not.
804 */ 804 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
805 c->old_idx_sz = c->calc_idx_sz; 806 c->old_idx_sz = c->calc_idx_sz;
806 c->budg_uncommitted_idx = 0; 807 c->budg_uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
807 spin_unlock(&c->space_lock); 809 spin_unlock(&c->space_lock);
808 mutex_unlock(&c->tnc_mutex); 810 mutex_unlock(&c->tnc_mutex);
809 811
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
1002 } 1004 }
1003 1005
1004#ifdef CONFIG_UBIFS_FS_DEBUG 1006#ifdef CONFIG_UBIFS_FS_DEBUG
1005 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { 1007 if (lnum != c->dbg->new_ihead_lnum ||
1008 buf_offs != c->dbg->new_ihead_offs) {
1006 ubifs_err("inconsistent ihead"); 1009 ubifs_err("inconsistent ihead");
1007 return -EINVAL; 1010 return -EINVAL;
1008 } 1011 }
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a..b25fc36cf72 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
51 */ 51 */
52#define UBIFS_MIN_COMPR_LEN 128 52#define UBIFS_MIN_COMPR_LEN 128
53 53
54/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data
57 * node uncompress, because it'll be read faster.
58 */
59#define UBIFS_MIN_COMPRESS_DIFF 64
60
54/* Root inode number */ 61/* Root inode number */
55#define UBIFS_ROOT_INO 1 62#define UBIFS_ROOT_INO 1
56 63
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a0..fc2a4cc66d0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL 63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL 64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
65 65
66/*
67 * Minimum amount of LEBs reserved for the index. At present the index needs at
68 * least 2 LEBs: one for the index head and one for in-the-gaps method (which
69 * currently does not cater for the index head and so excludes it from
70 * consideration).
71 */
72#define MIN_INDEX_LEBS 2
73
66/* Minimum amount of data UBIFS writes to the flash */ 74/* Minimum amount of data UBIFS writes to the flash */
67#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) 75#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
68 76
@@ -386,12 +394,12 @@ struct ubifs_inode {
386 unsigned int dirty:1; 394 unsigned int dirty:1;
387 unsigned int xattr:1; 395 unsigned int xattr:1;
388 unsigned int bulk_read:1; 396 unsigned int bulk_read:1;
397 unsigned int compr_type:2;
389 struct mutex ui_mutex; 398 struct mutex ui_mutex;
390 spinlock_t ui_lock; 399 spinlock_t ui_lock;
391 loff_t synced_i_size; 400 loff_t synced_i_size;
392 loff_t ui_size; 401 loff_t ui_size;
393 int flags; 402 int flags;
394 int compr_type;
395 pgoff_t last_page_read; 403 pgoff_t last_page_read;
396 pgoff_t read_in_a_row; 404 pgoff_t read_in_a_row;
397 int data_len; 405 int data_len;
@@ -419,7 +427,7 @@ struct ubifs_unclean_leb {
419 * 427 *
420 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
421 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > 0, not index
422 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
423 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, not empty, not index
424 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
425 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
473struct ubifs_lpt_lprops { 481struct ubifs_lpt_lprops {
474 int free; 482 int free;
475 int dirty; 483 int dirty;
476 unsigned tgc : 1; 484 unsigned tgc:1;
477 unsigned cmt : 1; 485 unsigned cmt:1;
478}; 486};
479 487
480/** 488/**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
482 * @empty_lebs: number of empty LEBs 490 * @empty_lebs: number of empty LEBs
483 * @taken_empty_lebs: number of taken LEBs 491 * @taken_empty_lebs: number of taken LEBs
484 * @idx_lebs: number of indexing LEBs 492 * @idx_lebs: number of indexing LEBs
485 * @total_free: total free space in bytes 493 * @total_free: total free space in bytes (includes all LEBs)
486 * @total_dirty: total dirty space in bytes 494 * @total_dirty: total dirty space in bytes (includes all LEBs)
487 * @total_used: total used space in bytes (includes only data LEBs) 495 * @total_used: total used space in bytes (does not include index LEBs)
488 * @total_dead: total dead space in bytes (includes only data LEBs) 496 * @total_dead: total dead space in bytes (does not include index LEBs)
489 * @total_dark: total dark space in bytes (includes only data LEBs) 497 * @total_dark: total dark space in bytes (does not include index LEBs)
498 *
499 * The @taken_empty_lebs field counts the LEBs that are in the transient state
500 * of having been "taken" for use but not yet written to. @taken_empty_lebs is
501 * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
502 * used by itself (in which case 'unused_lebs' would be a better name). In the
503 * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
504 * by GC, but unlike other empty LEBs that are "taken", it may not be written
505 * straight away (i.e. before the next commit start or unmount), so either
506 * @gc_lnum must be specially accounted for, or the current approach followed
507 * i.e. count it under @taken_empty_lebs.
490 * 508 *
491 * N.B. total_dirty and total_used are different to other total_* fields, 509 * @empty_lebs includes @taken_empty_lebs.
492 * because they account _all_ LEBs, not just data LEBs.
493 * 510 *
494 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having 511 * @total_used, @total_dead and @total_dark fields do not account indexing
495 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed 512 * LEBs.
496 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
497 * by itself (in which case 'unused_lebs' would be a better name). In the case
498 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
499 * but unlike other empty LEBs that are 'taken', it may not be written straight
500 * away (i.e. before the next commit start or unmount), so either gc_lnum must
501 * be specially accounted for, or the current approach followed i.e. count it
502 * under 'taken_empty_lebs'.
503 */ 513 */
504struct ubifs_lp_stats { 514struct ubifs_lp_stats {
505 int empty_lebs; 515 int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
893/** 903/**
894 * struct ubifs_mount_opts - UBIFS-specific mount options information. 904 * struct ubifs_mount_opts - UBIFS-specific mount options information.
895 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) 905 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
896 * @bulk_read: enable bulk-reads 906 * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable)
897 * @chk_data_crc: check CRCs when reading data nodes 907 * @chk_data_crc: enable/disable CRC data checking when reading data nodes
908 * (%0 default, %1 disabe, %2 enable)
909 * @override_compr: override default compressor (%0 - do not override and use
910 * superblock compressor, %1 - override and use compressor
911 * specified in @compr_type)
912 * @compr_type: compressor type to override the superblock compressor with
913 * (%UBIFS_COMPR_NONE, etc)
898 */ 914 */
899struct ubifs_mount_opts { 915struct ubifs_mount_opts {
900 unsigned int unmount_mode:2; 916 unsigned int unmount_mode:2;
901 unsigned int bulk_read:2; 917 unsigned int bulk_read:2;
902 unsigned int chk_data_crc:2; 918 unsigned int chk_data_crc:2;
919 unsigned int override_compr:1;
920 unsigned int compr_type:2;
903}; 921};
904 922
923struct ubifs_debug_info;
924
905/** 925/**
906 * struct ubifs_info - UBIFS file-system description data structure 926 * struct ubifs_info - UBIFS file-system description data structure
907 * (per-superblock). 927 * (per-superblock).
@@ -946,6 +966,7 @@ struct ubifs_mount_opts {
946 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
947 * recovery) 967 * recovery)
948 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
949 * 970 *
950 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 971 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
951 * @calc_idx_sz 972 * @calc_idx_sz
@@ -963,8 +984,6 @@ struct ubifs_mount_opts {
963 * @ileb_nxt: next pre-allocated index LEBs 984 * @ileb_nxt: next pre-allocated index LEBs
964 * @old_idx: tree of index nodes obsoleted since the last commit start 985 * @old_idx: tree of index nodes obsoleted since the last commit start
965 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c 986 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
966 * @new_ihead_lnum: used by debugging to check ihead_lnum
967 * @new_ihead_offs: used by debugging to check ihead_offs
968 * 987 *
969 * @mst_node: master node 988 * @mst_node: master node
970 * @mst_offs: offset of valid master node 989 * @mst_offs: offset of valid master node
@@ -986,7 +1005,6 @@ struct ubifs_mount_opts {
986 * @main_lebs: count of LEBs in the main area 1005 * @main_lebs: count of LEBs in the main area
987 * @main_first: first LEB of the main area 1006 * @main_first: first LEB of the main area
988 * @main_bytes: main area size in bytes 1007 * @main_bytes: main area size in bytes
989 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
990 * 1008 *
991 * @key_hash_type: type of the key hash 1009 * @key_hash_type: type of the key hash
992 * @key_hash: direntry key hash function 1010 * @key_hash: direntry key hash function
@@ -1149,15 +1167,7 @@ struct ubifs_mount_opts {
1149 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1167 * @always_chk_crc: always check CRCs (while mounting and remounting rw)
1150 * @mount_opts: UBIFS-specific mount options 1168 * @mount_opts: UBIFS-specific mount options
1151 * 1169 *
1152 * @dbg_buf: a buffer of LEB size used for debugging purposes 1170 * @dbg: debugging-related information
1153 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1154 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1155 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1156 * @failure_mode: failure mode for recovery testing
1157 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1158 * @fail_timeout: time in jiffies when delay of failure mode expires
1159 * @fail_cnt: current number of calls to failure mode I/O functions
1160 * @fail_cnt_max: number of calls by which to delay failure mode
1161 */ 1171 */
1162struct ubifs_info { 1172struct ubifs_info {
1163 struct super_block *vfs_sb; 1173 struct super_block *vfs_sb;
@@ -1196,6 +1206,7 @@ struct ubifs_info {
1196 unsigned int big_lpt:1; 1206 unsigned int big_lpt:1;
1197 unsigned int no_chk_data_crc:1; 1207 unsigned int no_chk_data_crc:1;
1198 unsigned int bulk_read:1; 1208 unsigned int bulk_read:1;
1209 unsigned int default_compr:2;
1199 1210
1200 struct mutex tnc_mutex; 1211 struct mutex tnc_mutex;
1201 struct ubifs_zbranch zroot; 1212 struct ubifs_zbranch zroot;
@@ -1212,10 +1223,6 @@ struct ubifs_info {
1212 int ileb_nxt; 1223 int ileb_nxt;
1213 struct rb_root old_idx; 1224 struct rb_root old_idx;
1214 int *bottom_up_buf; 1225 int *bottom_up_buf;
1215#ifdef CONFIG_UBIFS_FS_DEBUG
1216 int new_ihead_lnum;
1217 int new_ihead_offs;
1218#endif
1219 1226
1220 struct ubifs_mst_node *mst_node; 1227 struct ubifs_mst_node *mst_node;
1221 int mst_offs; 1228 int mst_offs;
@@ -1237,7 +1244,6 @@ struct ubifs_info {
1237 int main_lebs; 1244 int main_lebs;
1238 int main_first; 1245 int main_first;
1239 long long main_bytes; 1246 long long main_bytes;
1240 int default_compr;
1241 1247
1242 uint8_t key_hash_type; 1248 uint8_t key_hash_type;
1243 uint32_t (*key_hash)(const char *str, int len); 1249 uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1321,8 @@ struct ubifs_info {
1315 void *sbuf; 1321 void *sbuf;
1316 struct list_head idx_gc; 1322 struct list_head idx_gc;
1317 int idx_gc_cnt; 1323 int idx_gc_cnt;
1318 volatile int gc_seq; 1324 int gc_seq;
1319 volatile int gced_lnum; 1325 int gced_lnum;
1320 1326
1321 struct list_head infos_list; 1327 struct list_head infos_list;
1322 struct mutex umount_mutex; 1328 struct mutex umount_mutex;
@@ -1391,21 +1397,7 @@ struct ubifs_info {
1391 struct ubifs_mount_opts mount_opts; 1397 struct ubifs_mount_opts mount_opts;
1392 1398
1393#ifdef CONFIG_UBIFS_FS_DEBUG 1399#ifdef CONFIG_UBIFS_FS_DEBUG
1394 void *dbg_buf; 1400 struct ubifs_debug_info *dbg;
1395 struct ubifs_zbranch old_zroot;
1396 int old_zroot_level;
1397 unsigned long long old_zroot_sqnum;
1398 int failure_mode;
1399 int fail_delay;
1400 unsigned long fail_timeout;
1401 unsigned int fail_cnt;
1402 unsigned int fail_cnt_max;
1403 long long chk_lpt_sz;
1404 long long chk_lpt_sz2;
1405 long long chk_lpt_wastage;
1406 int chk_lpt_lebs;
1407 int new_nhead_lnum;
1408 int new_nhead_offs;
1409#endif 1401#endif
1410}; 1402};
1411 1403
@@ -1505,7 +1497,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1505long long ubifs_get_free_space(struct ubifs_info *c); 1497long long ubifs_get_free_space(struct ubifs_info *c);
1506int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1507void ubifs_convert_page_budget(struct ubifs_info *c); 1499void ubifs_convert_page_budget(struct ubifs_info *c);
1508long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); 1500long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1509long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1501long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1510 1502
1511/* find.c */ 1503/* find.c */
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1639void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); 1631void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1640uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); 1632uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
1641struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); 1633struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1634/* Needed only in debugging code in lpt_commit.c */
1635int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1636 struct ubifs_nnode *nnode);
1642 1637
1643/* lpt_commit.c */ 1638/* lpt_commit.c */
1644int ubifs_lpt_start_commit(struct ubifs_info *c); 1639int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1714 1709
1715/* compressor.c */ 1710/* compressor.c */
1716int __init ubifs_compressors_init(void); 1711int __init ubifs_compressors_init(void);
1717void __exit ubifs_compressors_exit(void); 1712void ubifs_compressors_exit(void);
1718void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, 1713void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1719 int *compr_type); 1714 int *compr_type);
1720int ubifs_decompress(const void *buf, int len, void *out, int *out_len, 1715int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index a4f2b3ce45b..31fc84297dd 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -126,13 +126,13 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
126 } 126 }
127 mutex_unlock(&sbi->s_alloc_mutex); 127 mutex_unlock(&sbi->s_alloc_mutex);
128 inode->i_mode = mode; 128 inode->i_mode = mode;
129 inode->i_uid = current->fsuid; 129 inode->i_uid = current_fsuid();
130 if (dir->i_mode & S_ISGID) { 130 if (dir->i_mode & S_ISGID) {
131 inode->i_gid = dir->i_gid; 131 inode->i_gid = dir->i_gid;
132 if (S_ISDIR(mode)) 132 if (S_ISDIR(mode))
133 mode |= S_ISGID; 133 mode |= S_ISGID;
134 } else { 134 } else {
135 inode->i_gid = current->fsgid; 135 inode->i_gid = current_fsgid();
136 } 136 }
137 137
138 iinfo->i_location.logicalBlockNum = block; 138 iinfo->i_location.logicalBlockNum = block;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 082409cd4b8..f84bfaa8d94 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -604,7 +604,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
604 goto out; 604 goto out;
605 605
606 iinfo = UDF_I(inode); 606 iinfo = UDF_I(inode);
607 inode->i_uid = current->fsuid; 607 inode->i_uid = current_fsuid();
608 init_special_inode(inode, mode, rdev); 608 init_special_inode(inode, mode, rdev);
609 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 609 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
610 if (!fi) { 610 if (!fi) {
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index ac181f6806a..6f5dcf00609 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -304,13 +304,13 @@ cg_found:
304 304
305 inode->i_ino = cg * uspi->s_ipg + bit; 305 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 306 inode->i_mode = mode;
307 inode->i_uid = current->fsuid; 307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) { 308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid; 309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode)) 310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID; 311 inode->i_mode |= S_ISGID;
312 } else 312 } else
313 inode->i_gid = current->fsgid; 313 inode->i_gid = current_fsgid();
314 314
315 inode->i_blocks = 0; 315 inode->i_blocks = 0;
316 inode->i_generation = 0; 316 inode->i_generation = 0;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 737c9a42536..c3dc491fff8 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \
85 xfs_trans_inode.o \ 85 xfs_trans_inode.o \
86 xfs_trans_item.o \ 86 xfs_trans_item.o \
87 xfs_utils.o \ 87 xfs_utils.o \
88 xfs_vfsops.o \
89 xfs_vnodeops.o \ 88 xfs_vnodeops.o \
90 xfs_rw.o \ 89 xfs_rw.o \
91 xfs_dmops.o \ 90 xfs_dmops.o \
92 xfs_qmops.o 91 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
94 xfs_dir2_trace.o
95 95
96# Objects in linux/ 96# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 97xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
106 xfs_iops.o \ 106 xfs_iops.o \
107 xfs_lrw.o \ 107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_vnode.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
111 111
112# Objects in support/ 112# Objects in support/
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 351a8f454bd..4dfc7c37081 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -32,23 +32,15 @@ typedef struct sv_s {
32 wait_queue_head_t waiters; 32 wait_queue_head_t waiters;
33} sv_t; 33} sv_t;
34 34
35#define SV_FIFO 0x0 /* sv_t is FIFO type */ 35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36#define SV_LIFO 0x2 /* sv_t is LIFO type */
37#define SV_PRIO 0x4 /* sv_t is PRIO type */
38#define SV_KEYED 0x6 /* sv_t is KEYED type */
39#define SV_DEFAULT SV_FIFO
40
41
42static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
43 unsigned long timeout)
44{ 36{
45 DECLARE_WAITQUEUE(wait, current); 37 DECLARE_WAITQUEUE(wait, current);
46 38
47 add_wait_queue_exclusive(&sv->waiters, &wait); 39 add_wait_queue_exclusive(&sv->waiters, &wait);
48 __set_current_state(state); 40 __set_current_state(TASK_UNINTERRUPTIBLE);
49 spin_unlock(lock); 41 spin_unlock(lock);
50 42
51 schedule_timeout(timeout); 43 schedule();
52 44
53 remove_wait_queue(&sv->waiters, &wait); 45 remove_wait_queue(&sv->waiters, &wait);
54} 46}
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
58#define sv_destroy(sv) \ 50#define sv_destroy(sv) \
59 /*NOTHING*/ 51 /*NOTHING*/
60#define sv_wait(sv, pri, lock, s) \ 52#define sv_wait(sv, pri, lock, s) \
61 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) 53 _sv_wait(sv, lock)
62#define sv_wait_sig(sv, pri, lock, s) \
63 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
64#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
65 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
66#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
67 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
68#define sv_signal(sv) \ 54#define sv_signal(sv) \
69 wake_up(&(sv)->waiters) 55 wake_up(&(sv)->waiters)
70#define sv_broadcast(sv) \ 56#define sv_broadcast(sv) \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a44d68eb50b..de3a198f771 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -42,6 +42,40 @@
42#include <linux/pagevec.h> 42#include <linux/pagevec.h>
43#include <linux/writeback.h> 43#include <linux/writeback.h>
44 44
45
46/*
47 * Prime number of hash buckets since address is used as the key.
48 */
49#define NVSYNC 37
50#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
51static wait_queue_head_t xfs_ioend_wq[NVSYNC];
52
53void __init
54xfs_ioend_init(void)
55{
56 int i;
57
58 for (i = 0; i < NVSYNC; i++)
59 init_waitqueue_head(&xfs_ioend_wq[i]);
60}
61
62void
63xfs_ioend_wait(
64 xfs_inode_t *ip)
65{
66 wait_queue_head_t *wq = to_ioend_wq(ip);
67
68 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
69}
70
71STATIC void
72xfs_ioend_wake(
73 xfs_inode_t *ip)
74{
75 if (atomic_dec_and_test(&ip->i_iocount))
76 wake_up(to_ioend_wq(ip));
77}
78
45STATIC void 79STATIC void
46xfs_count_page_state( 80xfs_count_page_state(
47 struct page *page, 81 struct page *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
146 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
147{ 181{
148 struct buffer_head *bh, *next; 182 struct buffer_head *bh, *next;
183 struct xfs_inode *ip = XFS_I(ioend->io_inode);
149 184
150 for (bh = ioend->io_buffer_head; bh; bh = next) { 185 for (bh = ioend->io_buffer_head; bh; bh = next) {
151 next = bh->b_private; 186 next = bh->b_private;
152 bh->b_end_io(bh, !ioend->io_error); 187 bh->b_end_io(bh, !ioend->io_error);
153 } 188 }
154 if (unlikely(ioend->io_error)) { 189
155 vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, 190 /*
156 __FILE__,__LINE__); 191 * Volume managers supporting multiple paths can send back ENODEV
192 * when the final path disappears. In this case continuing to fill
193 * the page cache with dirty data which cannot be written out is
194 * evil, so prevent that.
195 */
196 if (unlikely(ioend->io_error == -ENODEV)) {
197 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
198 __FILE__, __LINE__);
157 } 199 }
158 vn_iowake(XFS_I(ioend->io_inode)); 200
201 xfs_ioend_wake(ip);
159 mempool_free(ioend, xfs_ioend_pool); 202 mempool_free(ioend, xfs_ioend_pool);
160} 203}
161 204
@@ -191,7 +234,7 @@ xfs_setfilesize(
191 ip->i_d.di_size = isize; 234 ip->i_d.di_size = isize;
192 ip->i_update_core = 1; 235 ip->i_update_core = 1;
193 ip->i_update_size = 1; 236 ip->i_update_size = 1;
194 mark_inode_dirty_sync(ioend->io_inode); 237 xfs_mark_inode_dirty_sync(ip);
195 } 238 }
196 239
197 xfs_iunlock(ip, XFS_ILOCK_EXCL); 240 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
317 xfs_iomap_t *mapp, 360 xfs_iomap_t *mapp,
318 int flags) 361 int flags)
319{ 362{
320 xfs_inode_t *ip = XFS_I(inode); 363 int nmaps = 1;
321 int error, nmaps = 1; 364
322 365 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
323 error = xfs_iomap(ip, offset, count,
324 flags, mapp, &nmaps);
325 if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
326 xfs_iflags_set(ip, XFS_IMODIFIED);
327 return -error;
328} 366}
329 367
330STATIC_INLINE int 368STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
512 unlock_buffer(bh); 550 unlock_buffer(bh);
513 } while ((bh = next_bh) != NULL); 551 } while ((bh = next_bh) != NULL);
514 552
515 vn_iowake(XFS_I(ioend->io_inode)); 553 xfs_ioend_wake(XFS_I(ioend->io_inode));
516 mempool_free(ioend, xfs_ioend_pool); 554 mempool_free(ioend, xfs_ioend_pool);
517 } while ((ioend = next) != NULL); 555 } while ((ioend = next) != NULL);
518} 556}
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 3ba0631a381..7b26f5ff969 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
43extern const struct address_space_operations xfs_address_space_operations; 43extern const struct address_space_operations xfs_address_space_operations;
44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45 45
46extern void xfs_ioend_init(void);
47extern void xfs_ioend_wait(struct xfs_inode *);
48
46#endif /* __XFS_AOPS_H__ */ 49#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 36d5fcd3f59..cb329edc925 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
630 return NULL; 630 return NULL;
631} 631}
632 632
633STATIC int
634_xfs_buf_read(
635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags)
637{
638 int status;
639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649
650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp);
653 return status;
654}
655
633xfs_buf_t * 656xfs_buf_t *
634xfs_buf_read_flags( 657xfs_buf_read_flags(
635 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
646 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
647 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
648 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
649 xfs_buf_iostart(bp, flags); 672 _xfs_buf_read(bp, flags);
650 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
651 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
652 /* 675 /*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
1048 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1049} 1072}
1050 1073
1051/*
1052 * Initiate I/O on a buffer, based on the flags supplied.
1053 * The b_iodone routine in the buffer supplied will only be called
1054 * when all of the subsidiary I/O requests, if any, have been completed.
1055 */
1056int 1074int
1057xfs_buf_iostart( 1075xfs_bawrite(
1058 xfs_buf_t *bp, 1076 void *mp,
1059 xfs_buf_flags_t flags) 1077 struct xfs_buf *bp)
1060{ 1078{
1061 int status = 0; 1079 XB_TRACE(bp, "bawrite", 0);
1062 1080
1063 XB_TRACE(bp, "iostart", (unsigned long)flags); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1064 1082
1065 if (flags & XBF_DELWRI) { 1083 xfs_buf_delwri_dequeue(bp);
1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1067 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1068 xfs_buf_delwri_queue(bp, 1);
1069 return 0;
1070 }
1071 1084
1072 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1073 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1074 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1087
1075 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp);
1091}
1076 1092
1077 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1093void
1094xfs_bdwrite(
1095 void *mp,
1096 struct xfs_buf *bp)
1097{
1098 XB_TRACE(bp, "bdwrite", 0);
1078 1099
1079 /* For writes allow an alternate strategy routine to precede 1100 bp->b_strat = xfs_bdstrat_cb;
1080 * the actual I/O request (which may not be issued at all in 1101 bp->b_mount = mp;
1081 * a shutdown situation, for example).
1082 */
1083 status = (flags & XBF_WRITE) ?
1084 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
1085 1102
1086 /* Wait for I/O if we are not an async request. 1103 bp->b_flags &= ~XBF_READ;
1087 * Note: async I/O request completion will release the buffer, 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1088 * and that can already be done by this point. So using the
1089 * buffer pointer from here on, after async I/O, is invalid.
1090 */
1091 if (!status && !(flags & XBF_ASYNC))
1092 status = xfs_buf_iowait(bp);
1093 1105
1094 return status; 1106 xfs_buf_delwri_queue(bp, 1);
1095} 1107}
1096 1108
1097STATIC_INLINE void 1109STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
1114 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1115 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1116 1128
1117 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 xfs_buf_ioerror(bp, -error);
1118 bp->b_error = EIO;
1119 1130
1120 do { 1131 do {
1121 struct page *page = bvec->bv_page; 1132 struct page *page = bvec->bv_page;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 456519a088c..288ae7c4c80 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 214extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216/* Buffer Read and Write Routines */ 216/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
217extern void xfs_buf_ioend(xfs_buf_t *, int); 219extern void xfs_buf_ioend(xfs_buf_t *, int);
218extern void xfs_buf_ioerror(xfs_buf_t *, int); 220extern void xfs_buf_ioerror(xfs_buf_t *, int);
219extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
220extern int xfs_buf_iorequest(xfs_buf_t *); 221extern int xfs_buf_iorequest(xfs_buf_t *);
221extern int xfs_buf_iowait(xfs_buf_t *); 222extern int xfs_buf_iowait(xfs_buf_t *);
222extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
311#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
312#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
313 314
314#define XFS_BUF_SHUT(bp) do { } while (0)
315#define XFS_BUF_UNSHUT(bp) do { } while (0)
316#define XFS_BUF_ISSHUT(bp) (0)
317
318#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 315#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
319#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 316#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
320#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 317#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
334#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 331#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
335#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 332#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
336#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 333#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
337#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
338#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
339#define XFS_BUF_SET_START(bp) do { } while (0) 334#define XFS_BUF_SET_START(bp) do { } while (0)
340#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 335#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
341 336
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
366#define XFS_BUF_TARGET(bp) ((bp)->b_target) 361#define XFS_BUF_TARGET(bp) ((bp)->b_target)
367#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 362#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
368 363
369static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
370{
371 bp->b_fspriv3 = mp;
372 bp->b_strat = xfs_bdstrat_cb;
373 xfs_buf_delwri_dequeue(bp);
374 return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
375}
376
377static inline void xfs_buf_relse(xfs_buf_t *bp) 364static inline void xfs_buf_relse(xfs_buf_t *bp)
378{ 365{
379 if (!bp->b_relse) 366 if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
414 return error; 401 return error;
415} 402}
416 403
417/*
418 * No error can be returned from xfs_buf_iostart for delwri
419 * buffers as they are queued and no I/O is issued.
420 */
421static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
422{
423 bp->b_strat = xfs_bdstrat_cb;
424 bp->b_fspriv3 = mp;
425 (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
426}
427
428#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
429 405
430#define xfs_iowait(bp) xfs_buf_iowait(bp) 406#define xfs_iowait(bp) xfs_buf_iowait(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 652721ce0ea..55bddf3b609 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -23,16 +23,6 @@
23/* 23/*
24 * Credentials 24 * Credentials
25 */ 25 */
26typedef struct cred { 26typedef const struct cred cred_t;
27 /* EMPTY */
28} cred_t;
29
30extern struct cred *sys_cred;
31
32/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
33static inline int capable_cred(cred_t *cr, int cid)
34{
35 return (cr == sys_cred) ? 1 : capable(cid);
36}
37 27
38#endif /* __XFS_CRED_H__ */ 28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 7f7abec25e1..595751f7835 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,7 +29,6 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_vfsops.h"
33 32
34/* 33/*
35 * Note that we only accept fileids which are long enough rather than allow 34 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3fee790f138..e14c4e3aea0 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -36,89 +36,54 @@
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_ioctl32.h"
40#include "xfs_vnodeops.h" 39#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h"
41#include "xfs_ioctl.h"
41 42
42#include <linux/dcache.h> 43#include <linux/dcache.h>
43#include <linux/smp_lock.h> 44#include <linux/smp_lock.h>
44 45
45static struct vm_operations_struct xfs_file_vm_ops; 46static struct vm_operations_struct xfs_file_vm_ops;
46 47
47STATIC_INLINE ssize_t 48STATIC ssize_t
48__xfs_file_read( 49xfs_file_aio_read(
49 struct kiocb *iocb, 50 struct kiocb *iocb,
50 const struct iovec *iov, 51 const struct iovec *iov,
51 unsigned long nr_segs, 52 unsigned long nr_segs,
52 int ioflags,
53 loff_t pos) 53 loff_t pos)
54{ 54{
55 struct file *file = iocb->ki_filp; 55 struct file *file = iocb->ki_filp;
56 int ioflags = IO_ISAIO;
56 57
57 BUG_ON(iocb->ki_pos != pos); 58 BUG_ON(iocb->ki_pos != pos);
58 if (unlikely(file->f_flags & O_DIRECT)) 59 if (unlikely(file->f_flags & O_DIRECT))
59 ioflags |= IO_ISDIRECT; 60 ioflags |= IO_ISDIRECT;
61 if (file->f_mode & FMODE_NOCMTIME)
62 ioflags |= IO_INVIS;
60 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 63 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
61 nr_segs, &iocb->ki_pos, ioflags); 64 nr_segs, &iocb->ki_pos, ioflags);
62} 65}
63 66
64STATIC ssize_t 67STATIC ssize_t
65xfs_file_aio_read( 68xfs_file_aio_write(
66 struct kiocb *iocb,
67 const struct iovec *iov,
68 unsigned long nr_segs,
69 loff_t pos)
70{
71 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
72}
73
74STATIC ssize_t
75xfs_file_aio_read_invis(
76 struct kiocb *iocb,
77 const struct iovec *iov,
78 unsigned long nr_segs,
79 loff_t pos)
80{
81 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
82}
83
84STATIC_INLINE ssize_t
85__xfs_file_write(
86 struct kiocb *iocb, 69 struct kiocb *iocb,
87 const struct iovec *iov, 70 const struct iovec *iov,
88 unsigned long nr_segs, 71 unsigned long nr_segs,
89 int ioflags,
90 loff_t pos) 72 loff_t pos)
91{ 73{
92 struct file *file = iocb->ki_filp; 74 struct file *file = iocb->ki_filp;
75 int ioflags = IO_ISAIO;
93 76
94 BUG_ON(iocb->ki_pos != pos); 77 BUG_ON(iocb->ki_pos != pos);
95 if (unlikely(file->f_flags & O_DIRECT)) 78 if (unlikely(file->f_flags & O_DIRECT))
96 ioflags |= IO_ISDIRECT; 79 ioflags |= IO_ISDIRECT;
80 if (file->f_mode & FMODE_NOCMTIME)
81 ioflags |= IO_INVIS;
97 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 82 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
98 &iocb->ki_pos, ioflags); 83 &iocb->ki_pos, ioflags);
99} 84}
100 85
101STATIC ssize_t 86STATIC ssize_t
102xfs_file_aio_write(
103 struct kiocb *iocb,
104 const struct iovec *iov,
105 unsigned long nr_segs,
106 loff_t pos)
107{
108 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
109}
110
111STATIC ssize_t
112xfs_file_aio_write_invis(
113 struct kiocb *iocb,
114 const struct iovec *iov,
115 unsigned long nr_segs,
116 loff_t pos)
117{
118 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
119}
120
121STATIC ssize_t
122xfs_file_splice_read( 87xfs_file_splice_read(
123 struct file *infilp, 88 struct file *infilp,
124 loff_t *ppos, 89 loff_t *ppos,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
126 size_t len, 91 size_t len,
127 unsigned int flags) 92 unsigned int flags)
128{ 93{
129 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 94 int ioflags = 0;
130 infilp, ppos, pipe, len, flags, 0); 95
131} 96 if (infilp->f_mode & FMODE_NOCMTIME)
97 ioflags |= IO_INVIS;
132 98
133STATIC ssize_t
134xfs_file_splice_read_invis(
135 struct file *infilp,
136 loff_t *ppos,
137 struct pipe_inode_info *pipe,
138 size_t len,
139 unsigned int flags)
140{
141 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 99 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
142 infilp, ppos, pipe, len, flags, IO_INVIS); 100 infilp, ppos, pipe, len, flags, ioflags);
143} 101}
144 102
145STATIC ssize_t 103STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
150 size_t len, 108 size_t len,
151 unsigned int flags) 109 unsigned int flags)
152{ 110{
153 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 111 int ioflags = 0;
154 pipe, outfilp, ppos, len, flags, 0); 112
155} 113 if (outfilp->f_mode & FMODE_NOCMTIME)
114 ioflags |= IO_INVIS;
156 115
157STATIC ssize_t
158xfs_file_splice_write_invis(
159 struct pipe_inode_info *pipe,
160 struct file *outfilp,
161 loff_t *ppos,
162 size_t len,
163 unsigned int flags)
164{
165 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 116 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
166 pipe, outfilp, ppos, len, flags, IO_INVIS); 117 pipe, outfilp, ppos, len, flags, ioflags);
167} 118}
168 119
169STATIC int 120STATIC int
170xfs_file_open( 121xfs_file_open(
171 struct inode *inode, 122 struct inode *inode,
172 struct file *filp) 123 struct file *file)
173{ 124{
174 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 125 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
175 return -EFBIG; 126 return -EFBIG;
176 return -xfs_open(XFS_I(inode)); 127 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
128 return -EIO;
129 return 0;
130}
131
132STATIC int
133xfs_dir_open(
134 struct inode *inode,
135 struct file *file)
136{
137 struct xfs_inode *ip = XFS_I(inode);
138 int mode;
139 int error;
140
141 error = xfs_file_open(inode, file);
142 if (error)
143 return error;
144
145 /*
146 * If there are any blocks, read-ahead block 0 as we're almost
147 * certain to have the next operation be a read there.
148 */
149 mode = xfs_ilock_map_shared(ip);
150 if (ip->i_d.di_nextents > 0)
151 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
152 xfs_iunlock(ip, mode);
153 return 0;
177} 154}
178 155
179STATIC int 156STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
227 * point we can change the ->readdir prototype to include the 204 * point we can change the ->readdir prototype to include the
228 * buffer size. 205 * buffer size.
229 */ 206 */
230 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); 207 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
231 208
232 error = xfs_readdir(ip, dirent, bufsize, 209 error = xfs_readdir(ip, dirent, bufsize,
233 (xfs_off_t *)&filp->f_pos, filldir); 210 (xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
248 return 0; 225 return 0;
249} 226}
250 227
251STATIC long
252xfs_file_ioctl(
253 struct file *filp,
254 unsigned int cmd,
255 unsigned long p)
256{
257 int error;
258 struct inode *inode = filp->f_path.dentry->d_inode;
259
260 error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
261 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
262
263 /* NOTE: some of the ioctl's return positive #'s as a
264 * byte count indicating success, such as
265 * readlink_by_handle. So we don't "sign flip"
266 * like most other routines. This means true
267 * errors need to be returned as a negative value.
268 */
269 return error;
270}
271
272STATIC long
273xfs_file_ioctl_invis(
274 struct file *filp,
275 unsigned int cmd,
276 unsigned long p)
277{
278 int error;
279 struct inode *inode = filp->f_path.dentry->d_inode;
280
281 error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
282 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
283
284 /* NOTE: some of the ioctl's return positive #'s as a
285 * byte count indicating success, such as
286 * readlink_by_handle. So we don't "sign flip"
287 * like most other routines. This means true
288 * errors need to be returned as a negative value.
289 */
290 return error;
291}
292
293/* 228/*
294 * mmap()d file has taken write protection fault and is being made 229 * mmap()d file has taken write protection fault and is being made
295 * writable. We can set the page state up correctly for a writable 230 * writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
325#endif 260#endif
326}; 261};
327 262
328const struct file_operations xfs_invis_file_operations = {
329 .llseek = generic_file_llseek,
330 .read = do_sync_read,
331 .write = do_sync_write,
332 .aio_read = xfs_file_aio_read_invis,
333 .aio_write = xfs_file_aio_write_invis,
334 .splice_read = xfs_file_splice_read_invis,
335 .splice_write = xfs_file_splice_write_invis,
336 .unlocked_ioctl = xfs_file_ioctl_invis,
337#ifdef CONFIG_COMPAT
338 .compat_ioctl = xfs_file_compat_invis_ioctl,
339#endif
340 .mmap = xfs_file_mmap,
341 .open = xfs_file_open,
342 .release = xfs_file_release,
343 .fsync = xfs_file_fsync,
344};
345
346
347const struct file_operations xfs_dir_file_operations = { 263const struct file_operations xfs_dir_file_operations = {
264 .open = xfs_dir_open,
348 .read = generic_read_dir, 265 .read = generic_read_dir,
349 .readdir = xfs_file_readdir, 266 .readdir = xfs_file_readdir,
350 .llseek = generic_file_llseek, 267 .llseek = generic_file_llseek,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 36caa6d957d..5aeb7777696 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 24int fs_nosys(void) { return ENOSYS; }
25void fs_noval(void) { return; } 25void fs_noval(void) { return; }
26 26
27/*
28 * note: all filemap functions return negative error codes. These
29 * need to be inverted before returning to the xfs core functions.
30 */
27void 31void
28xfs_tosspages( 32xfs_tosspages(
29 xfs_inode_t *ip, 33 xfs_inode_t *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
53 if (!ret) 57 if (!ret)
54 truncate_inode_pages(mapping, first); 58 truncate_inode_pages(mapping, first);
55 } 59 }
56 return ret; 60 return -ret;
57} 61}
58 62
59int 63int
@@ -72,10 +76,23 @@ xfs_flush_pages(
72 xfs_iflags_clear(ip, XFS_ITRUNCATED); 76 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = filemap_fdatawrite(mapping); 77 ret = filemap_fdatawrite(mapping);
74 if (flags & XFS_B_ASYNC) 78 if (flags & XFS_B_ASYNC)
75 return ret; 79 return -ret;
76 ret2 = filemap_fdatawait(mapping); 80 ret2 = filemap_fdatawait(mapping);
77 if (!ret) 81 if (!ret)
78 ret = ret2; 82 ret = ret2;
79 } 83 }
80 return ret; 84 return -ret;
85}
86
87int
88xfs_wait_on_pages(
89 xfs_inode_t *ip,
90 xfs_off_t first,
91 xfs_off_t last)
92{
93 struct address_space *mapping = VFS_I(ip)->i_mapping;
94
95 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
96 return -filemap_fdatawait(mapping);
97 return 0;
81} 98}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ef90e64641e..2ae8b1ccb02 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -26,7 +26,6 @@
26 */ 26 */
27xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
28 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
29 .restrict_chown = { 0, 1, 1 },
30 .sgid_inherit = { 0, 0, 1 }, 29 .sgid_inherit = { 0, 0, 1 },
31 .symlink_mode = { 0, 0, 1 }, 30 .symlink_mode = { 0, 0, 1 },
32 .panic_mask = { 0, 0, 255 }, 31 .panic_mask = { 0, 0, 255 },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
43 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
44 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
45}; 44};
46
47/*
48 * Global system credential structure.
49 */
50static cred_t sys_cred_val;
51cred_t *sys_cred = &sys_cred_val;
52
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 2770b0085ee..69f71caf061 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,5 @@
19#define __XFS_GLOBALS_H__ 19#define __XFS_GLOBALS_H__
20 20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */ 21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22extern struct cred *sys_cred;
23 22
24#endif /* __XFS_GLOBALS_H__ */ 23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index d3438c72dca..67205f6198b 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -68,26 +68,22 @@
68 * XFS_IOC_PATH_TO_HANDLE 68 * XFS_IOC_PATH_TO_HANDLE
69 * returns full handle for a path 69 * returns full handle for a path
70 */ 70 */
71STATIC int 71int
72xfs_find_handle( 72xfs_find_handle(
73 unsigned int cmd, 73 unsigned int cmd,
74 void __user *arg) 74 xfs_fsop_handlereq_t *hreq)
75{ 75{
76 int hsize; 76 int hsize;
77 xfs_handle_t handle; 77 xfs_handle_t handle;
78 xfs_fsop_handlereq_t hreq;
79 struct inode *inode; 78 struct inode *inode;
80 79
81 if (copy_from_user(&hreq, arg, sizeof(hreq)))
82 return -XFS_ERROR(EFAULT);
83
84 memset((char *)&handle, 0, sizeof(handle)); 80 memset((char *)&handle, 0, sizeof(handle));
85 81
86 switch (cmd) { 82 switch (cmd) {
87 case XFS_IOC_PATH_TO_FSHANDLE: 83 case XFS_IOC_PATH_TO_FSHANDLE:
88 case XFS_IOC_PATH_TO_HANDLE: { 84 case XFS_IOC_PATH_TO_HANDLE: {
89 struct path path; 85 struct path path;
90 int error = user_lpath((const char __user *)hreq.path, &path); 86 int error = user_lpath((const char __user *)hreq->path, &path);
91 if (error) 87 if (error)
92 return error; 88 return error;
93 89
@@ -101,7 +97,7 @@ xfs_find_handle(
101 case XFS_IOC_FD_TO_HANDLE: { 97 case XFS_IOC_FD_TO_HANDLE: {
102 struct file *file; 98 struct file *file;
103 99
104 file = fget(hreq.fd); 100 file = fget(hreq->fd);
105 if (!file) 101 if (!file)
106 return -EBADF; 102 return -EBADF;
107 103
@@ -158,8 +154,8 @@ xfs_find_handle(
158 } 154 }
159 155
160 /* now copy our handle into the user buffer & write out the size */ 156 /* now copy our handle into the user buffer & write out the size */
161 if (copy_to_user(hreq.ohandle, &handle, hsize) || 157 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
162 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { 158 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
163 iput(inode); 159 iput(inode);
164 return -XFS_ERROR(EFAULT); 160 return -XFS_ERROR(EFAULT);
165 } 161 }
@@ -249,27 +245,25 @@ xfs_vget_fsop_handlereq(
249 return 0; 245 return 0;
250} 246}
251 247
252STATIC int 248int
253xfs_open_by_handle( 249xfs_open_by_handle(
254 xfs_mount_t *mp, 250 xfs_mount_t *mp,
255 void __user *arg, 251 xfs_fsop_handlereq_t *hreq,
256 struct file *parfilp, 252 struct file *parfilp,
257 struct inode *parinode) 253 struct inode *parinode)
258{ 254{
255 const struct cred *cred = current_cred();
259 int error; 256 int error;
260 int new_fd; 257 int new_fd;
261 int permflag; 258 int permflag;
262 struct file *filp; 259 struct file *filp;
263 struct inode *inode; 260 struct inode *inode;
264 struct dentry *dentry; 261 struct dentry *dentry;
265 xfs_fsop_handlereq_t hreq;
266 262
267 if (!capable(CAP_SYS_ADMIN)) 263 if (!capable(CAP_SYS_ADMIN))
268 return -XFS_ERROR(EPERM); 264 return -XFS_ERROR(EPERM);
269 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
270 return -XFS_ERROR(EFAULT);
271 265
272 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
273 if (error) 267 if (error)
274 return -error; 268 return -error;
275 269
@@ -280,10 +274,10 @@ xfs_open_by_handle(
280 } 274 }
281 275
282#if BITS_PER_LONG != 32 276#if BITS_PER_LONG != 32
283 hreq.oflags |= O_LARGEFILE; 277 hreq->oflags |= O_LARGEFILE;
284#endif 278#endif
285 /* Put open permission in namei format. */ 279 /* Put open permission in namei format. */
286 permflag = hreq.oflags; 280 permflag = hreq->oflags;
287 if ((permflag+1) & O_ACCMODE) 281 if ((permflag+1) & O_ACCMODE)
288 permflag++; 282 permflag++;
289 if (permflag & O_TRUNC) 283 if (permflag & O_TRUNC)
@@ -321,15 +315,16 @@ xfs_open_by_handle(
321 mntget(parfilp->f_path.mnt); 315 mntget(parfilp->f_path.mnt);
322 316
323 /* Create file pointer. */ 317 /* Create file pointer. */
324 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags); 318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
325 if (IS_ERR(filp)) { 319 if (IS_ERR(filp)) {
326 put_unused_fd(new_fd); 320 put_unused_fd(new_fd);
327 return -XFS_ERROR(-PTR_ERR(filp)); 321 return -XFS_ERROR(-PTR_ERR(filp));
328 } 322 }
323
329 if (inode->i_mode & S_IFREG) { 324 if (inode->i_mode & S_IFREG) {
330 /* invisible operation should not change atime */ 325 /* invisible operation should not change atime */
331 filp->f_flags |= O_NOATIME; 326 filp->f_flags |= O_NOATIME;
332 filp->f_op = &xfs_invis_file_operations; 327 filp->f_mode |= FMODE_NOCMTIME;
333 } 328 }
334 329
335 fd_install(new_fd, filp); 330 fd_install(new_fd, filp);
@@ -362,24 +357,21 @@ do_readlink(
362} 357}
363 358
364 359
365STATIC int 360int
366xfs_readlink_by_handle( 361xfs_readlink_by_handle(
367 xfs_mount_t *mp, 362 xfs_mount_t *mp,
368 void __user *arg, 363 xfs_fsop_handlereq_t *hreq,
369 struct inode *parinode) 364 struct inode *parinode)
370{ 365{
371 struct inode *inode; 366 struct inode *inode;
372 xfs_fsop_handlereq_t hreq;
373 __u32 olen; 367 __u32 olen;
374 void *link; 368 void *link;
375 int error; 369 int error;
376 370
377 if (!capable(CAP_SYS_ADMIN)) 371 if (!capable(CAP_SYS_ADMIN))
378 return -XFS_ERROR(EPERM); 372 return -XFS_ERROR(EPERM);
379 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
380 return -XFS_ERROR(EFAULT);
381 373
382 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
383 if (error) 375 if (error)
384 return -error; 376 return -error;
385 377
@@ -389,7 +381,7 @@ xfs_readlink_by_handle(
389 goto out_iput; 381 goto out_iput;
390 } 382 }
391 383
392 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { 384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
393 error = -XFS_ERROR(EFAULT); 385 error = -XFS_ERROR(EFAULT);
394 goto out_iput; 386 goto out_iput;
395 } 387 }
@@ -401,7 +393,7 @@ xfs_readlink_by_handle(
401 error = -xfs_readlink(XFS_I(inode), link); 393 error = -xfs_readlink(XFS_I(inode), link);
402 if (error) 394 if (error)
403 goto out_kfree; 395 goto out_kfree;
404 error = do_readlink(hreq.ohandle, olen, link); 396 error = do_readlink(hreq->ohandle, olen, link);
405 if (error) 397 if (error)
406 goto out_kfree; 398 goto out_kfree;
407 399
@@ -500,7 +492,7 @@ xfs_attrlist_by_handle(
500 return -error; 492 return -error;
501} 493}
502 494
503STATIC int 495int
504xfs_attrmulti_attr_get( 496xfs_attrmulti_attr_get(
505 struct inode *inode, 497 struct inode *inode,
506 char *name, 498 char *name,
@@ -529,7 +521,7 @@ xfs_attrmulti_attr_get(
529 return error; 521 return error;
530} 522}
531 523
532STATIC int 524int
533xfs_attrmulti_attr_set( 525xfs_attrmulti_attr_set(
534 struct inode *inode, 526 struct inode *inode,
535 char *name, 527 char *name,
@@ -559,7 +551,7 @@ xfs_attrmulti_attr_set(
559 return error; 551 return error;
560} 552}
561 553
562STATIC int 554int
563xfs_attrmulti_attr_remove( 555xfs_attrmulti_attr_remove(
564 struct inode *inode, 556 struct inode *inode,
565 char *name, 557 char *name,
@@ -661,19 +653,26 @@ xfs_attrmulti_by_handle(
661 return -error; 653 return -error;
662} 654}
663 655
664STATIC int 656int
665xfs_ioc_space( 657xfs_ioc_space(
666 struct xfs_inode *ip, 658 struct xfs_inode *ip,
667 struct inode *inode, 659 struct inode *inode,
668 struct file *filp, 660 struct file *filp,
669 int ioflags, 661 int ioflags,
670 unsigned int cmd, 662 unsigned int cmd,
671 void __user *arg) 663 xfs_flock64_t *bf)
672{ 664{
673 xfs_flock64_t bf;
674 int attr_flags = 0; 665 int attr_flags = 0;
675 int error; 666 int error;
676 667
668 /*
669 * Only allow the sys admin to reserve space unless
670 * unwritten extents are enabled.
671 */
672 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
673 !capable(CAP_SYS_ADMIN))
674 return -XFS_ERROR(EPERM);
675
677 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 676 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
678 return -XFS_ERROR(EPERM); 677 return -XFS_ERROR(EPERM);
679 678
@@ -683,16 +682,12 @@ xfs_ioc_space(
683 if (!S_ISREG(inode->i_mode)) 682 if (!S_ISREG(inode->i_mode))
684 return -XFS_ERROR(EINVAL); 683 return -XFS_ERROR(EINVAL);
685 684
686 if (copy_from_user(&bf, arg, sizeof(bf)))
687 return -XFS_ERROR(EFAULT);
688
689 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 685 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
690 attr_flags |= XFS_ATTR_NONBLOCK; 686 attr_flags |= XFS_ATTR_NONBLOCK;
691 if (ioflags & IO_INVIS) 687 if (ioflags & IO_INVIS)
692 attr_flags |= XFS_ATTR_DMI; 688 attr_flags |= XFS_ATTR_DMI;
693 689
694 error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, 690 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
695 NULL, attr_flags);
696 return -error; 691 return -error;
697} 692}
698 693
@@ -1007,7 +1002,7 @@ xfs_ioctl_setattr(
1007 * to the file owner ID, except in cases where the 1002 * to the file owner ID, except in cases where the
1008 * CAP_FSETID capability is applicable. 1003 * CAP_FSETID capability is applicable.
1009 */ 1004 */
1010 if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) { 1005 if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
1011 code = XFS_ERROR(EPERM); 1006 code = XFS_ERROR(EPERM);
1012 goto error_return; 1007 goto error_return;
1013 } 1008 }
@@ -1104,10 +1099,6 @@ xfs_ioctl_setattr(
1104 1099
1105 /* 1100 /*
1106 * Change file ownership. Must be the owner or privileged. 1101 * Change file ownership. Must be the owner or privileged.
1107 * If the system was configured with the "restricted_chown"
1108 * option, the owner is not permitted to give away the file,
1109 * and can change the group id only to a group of which he
1110 * or she is a member.
1111 */ 1102 */
1112 if (mask & FSX_PROJID) { 1103 if (mask & FSX_PROJID) {
1113 /* 1104 /*
@@ -1136,7 +1127,7 @@ xfs_ioctl_setattr(
1136 * the superblock version number since projids didn't 1127 * the superblock version number since projids didn't
1137 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. 1128 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
1138 */ 1129 */
1139 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) 1130 if (ip->i_d.di_version == 1)
1140 xfs_bump_ino_vers2(tp, ip); 1131 xfs_bump_ino_vers2(tp, ip);
1141 } 1132 }
1142 1133
@@ -1255,43 +1246,67 @@ xfs_ioc_setxflags(
1255} 1246}
1256 1247
1257STATIC int 1248STATIC int
1249xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1250{
1251 struct getbmap __user *base = *ap;
1252
1253 /* copy only getbmap portion (not getbmapx) */
1254 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1255 return XFS_ERROR(EFAULT);
1256
1257 *ap += sizeof(struct getbmap);
1258 return 0;
1259}
1260
1261STATIC int
1258xfs_ioc_getbmap( 1262xfs_ioc_getbmap(
1259 struct xfs_inode *ip, 1263 struct xfs_inode *ip,
1260 int ioflags, 1264 int ioflags,
1261 unsigned int cmd, 1265 unsigned int cmd,
1262 void __user *arg) 1266 void __user *arg)
1263{ 1267{
1264 struct getbmap bm; 1268 struct getbmapx bmx;
1265 int iflags;
1266 int error; 1269 int error;
1267 1270
1268 if (copy_from_user(&bm, arg, sizeof(bm))) 1271 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1269 return -XFS_ERROR(EFAULT); 1272 return -XFS_ERROR(EFAULT);
1270 1273
1271 if (bm.bmv_count < 2) 1274 if (bmx.bmv_count < 2)
1272 return -XFS_ERROR(EINVAL); 1275 return -XFS_ERROR(EINVAL);
1273 1276
1274 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1277 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1275 if (ioflags & IO_INVIS) 1278 if (ioflags & IO_INVIS)
1276 iflags |= BMV_IF_NO_DMAPI_READ; 1279 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1277 1280
1278 error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); 1281 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1282 (struct getbmap *)arg+1);
1279 if (error) 1283 if (error)
1280 return -error; 1284 return -error;
1281 1285
1282 if (copy_to_user(arg, &bm, sizeof(bm))) 1286 /* copy back header - only size of getbmap */
1287 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1283 return -XFS_ERROR(EFAULT); 1288 return -XFS_ERROR(EFAULT);
1284 return 0; 1289 return 0;
1285} 1290}
1286 1291
1287STATIC int 1292STATIC int
1293xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1294{
1295 struct getbmapx __user *base = *ap;
1296
1297 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1298 return XFS_ERROR(EFAULT);
1299
1300 *ap += sizeof(struct getbmapx);
1301 return 0;
1302}
1303
1304STATIC int
1288xfs_ioc_getbmapx( 1305xfs_ioc_getbmapx(
1289 struct xfs_inode *ip, 1306 struct xfs_inode *ip,
1290 void __user *arg) 1307 void __user *arg)
1291{ 1308{
1292 struct getbmapx bmx; 1309 struct getbmapx bmx;
1293 struct getbmap bm;
1294 int iflags;
1295 int error; 1310 int error;
1296 1311
1297 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1312 if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1300,46 +1315,46 @@ xfs_ioc_getbmapx(
1300 if (bmx.bmv_count < 2) 1315 if (bmx.bmv_count < 2)
1301 return -XFS_ERROR(EINVAL); 1316 return -XFS_ERROR(EINVAL);
1302 1317
1303 /* 1318 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1304 * Map input getbmapx structure to a getbmap
1305 * structure for xfs_getbmap.
1306 */
1307 GETBMAP_CONVERT(bmx, bm);
1308
1309 iflags = bmx.bmv_iflags;
1310
1311 if (iflags & (~BMV_IF_VALID))
1312 return -XFS_ERROR(EINVAL); 1319 return -XFS_ERROR(EINVAL);
1313 1320
1314 iflags |= BMV_IF_EXTENDED; 1321 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1315 1322 (struct getbmapx *)arg+1);
1316 error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
1317 if (error) 1323 if (error)
1318 return -error; 1324 return -error;
1319 1325
1320 GETBMAP_CONVERT(bm, bmx); 1326 /* copy back header */
1321 1327 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1322 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1323 return -XFS_ERROR(EFAULT); 1328 return -XFS_ERROR(EFAULT);
1324 1329
1325 return 0; 1330 return 0;
1326} 1331}
1327 1332
1328int 1333/*
1329xfs_ioctl( 1334 * Note: some of the ioctl's return positive numbers as a
1330 xfs_inode_t *ip, 1335 * byte count indicating success, such as readlink_by_handle.
1336 * So we don't "sign flip" like most other routines. This means
1337 * true errors need to be returned as a negative value.
1338 */
1339long
1340xfs_file_ioctl(
1331 struct file *filp, 1341 struct file *filp,
1332 int ioflags,
1333 unsigned int cmd, 1342 unsigned int cmd,
1334 void __user *arg) 1343 unsigned long p)
1335{ 1344{
1336 struct inode *inode = filp->f_path.dentry->d_inode; 1345 struct inode *inode = filp->f_path.dentry->d_inode;
1337 xfs_mount_t *mp = ip->i_mount; 1346 struct xfs_inode *ip = XFS_I(inode);
1347 struct xfs_mount *mp = ip->i_mount;
1348 void __user *arg = (void __user *)p;
1349 int ioflags = 0;
1338 int error; 1350 int error;
1339 1351
1340 xfs_itrace_entry(XFS_I(inode)); 1352 if (filp->f_mode & FMODE_NOCMTIME)
1341 switch (cmd) { 1353 ioflags |= IO_INVIS;
1342 1354
1355 xfs_itrace_entry(ip);
1356
1357 switch (cmd) {
1343 case XFS_IOC_ALLOCSP: 1358 case XFS_IOC_ALLOCSP:
1344 case XFS_IOC_FREESP: 1359 case XFS_IOC_FREESP:
1345 case XFS_IOC_RESVSP: 1360 case XFS_IOC_RESVSP:
@@ -1347,17 +1362,13 @@ xfs_ioctl(
1347 case XFS_IOC_ALLOCSP64: 1362 case XFS_IOC_ALLOCSP64:
1348 case XFS_IOC_FREESP64: 1363 case XFS_IOC_FREESP64:
1349 case XFS_IOC_RESVSP64: 1364 case XFS_IOC_RESVSP64:
1350 case XFS_IOC_UNRESVSP64: 1365 case XFS_IOC_UNRESVSP64: {
1351 /* 1366 xfs_flock64_t bf;
1352 * Only allow the sys admin to reserve space unless
1353 * unwritten extents are enabled.
1354 */
1355 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
1356 !capable(CAP_SYS_ADMIN))
1357 return -EPERM;
1358
1359 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
1360 1367
1368 if (copy_from_user(&bf, arg, sizeof(bf)))
1369 return -XFS_ERROR(EFAULT);
1370 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
1371 }
1361 case XFS_IOC_DIOINFO: { 1372 case XFS_IOC_DIOINFO: {
1362 struct dioattr da; 1373 struct dioattr da;
1363 xfs_buftarg_t *target = 1374 xfs_buftarg_t *target =
@@ -1417,18 +1428,30 @@ xfs_ioctl(
1417 1428
1418 case XFS_IOC_FD_TO_HANDLE: 1429 case XFS_IOC_FD_TO_HANDLE:
1419 case XFS_IOC_PATH_TO_HANDLE: 1430 case XFS_IOC_PATH_TO_HANDLE:
1420 case XFS_IOC_PATH_TO_FSHANDLE: 1431 case XFS_IOC_PATH_TO_FSHANDLE: {
1421 return xfs_find_handle(cmd, arg); 1432 xfs_fsop_handlereq_t hreq;
1422 1433
1423 case XFS_IOC_OPEN_BY_HANDLE: 1434 if (copy_from_user(&hreq, arg, sizeof(hreq)))
1424 return xfs_open_by_handle(mp, arg, filp, inode); 1435 return -XFS_ERROR(EFAULT);
1436 return xfs_find_handle(cmd, &hreq);
1437 }
1438 case XFS_IOC_OPEN_BY_HANDLE: {
1439 xfs_fsop_handlereq_t hreq;
1425 1440
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode);
1444 }
1426 case XFS_IOC_FSSETDM_BY_HANDLE: 1445 case XFS_IOC_FSSETDM_BY_HANDLE:
1427 return xfs_fssetdm_by_handle(mp, arg, inode); 1446 return xfs_fssetdm_by_handle(mp, arg, inode);
1428 1447
1429 case XFS_IOC_READLINK_BY_HANDLE: 1448 case XFS_IOC_READLINK_BY_HANDLE: {
1430 return xfs_readlink_by_handle(mp, arg, inode); 1449 xfs_fsop_handlereq_t hreq;
1431 1450
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode);
1454 }
1432 case XFS_IOC_ATTRLIST_BY_HANDLE: 1455 case XFS_IOC_ATTRLIST_BY_HANDLE:
1433 return xfs_attrlist_by_handle(mp, arg, inode); 1456 return xfs_attrlist_by_handle(mp, arg, inode);
1434 1457
@@ -1436,7 +1459,11 @@ xfs_ioctl(
1436 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
1437 1460
1438 case XFS_IOC_SWAPEXT: { 1461 case XFS_IOC_SWAPEXT: {
1439 error = xfs_swapext((struct xfs_swapext __user *)arg); 1462 struct xfs_swapext sxp;
1463
1464 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1465 return -XFS_ERROR(EFAULT);
1466 error = xfs_swapext(&sxp);
1440 return -error; 1467 return -error;
1441 } 1468 }
1442 1469
@@ -1492,9 +1519,6 @@ xfs_ioctl(
1492 case XFS_IOC_FSGROWFSDATA: { 1519 case XFS_IOC_FSGROWFSDATA: {
1493 xfs_growfs_data_t in; 1520 xfs_growfs_data_t in;
1494 1521
1495 if (!capable(CAP_SYS_ADMIN))
1496 return -EPERM;
1497
1498 if (copy_from_user(&in, arg, sizeof(in))) 1522 if (copy_from_user(&in, arg, sizeof(in)))
1499 return -XFS_ERROR(EFAULT); 1523 return -XFS_ERROR(EFAULT);
1500 1524
@@ -1505,9 +1529,6 @@ xfs_ioctl(
1505 case XFS_IOC_FSGROWFSLOG: { 1529 case XFS_IOC_FSGROWFSLOG: {
1506 xfs_growfs_log_t in; 1530 xfs_growfs_log_t in;
1507 1531
1508 if (!capable(CAP_SYS_ADMIN))
1509 return -EPERM;
1510
1511 if (copy_from_user(&in, arg, sizeof(in))) 1532 if (copy_from_user(&in, arg, sizeof(in)))
1512 return -XFS_ERROR(EFAULT); 1533 return -XFS_ERROR(EFAULT);
1513 1534
@@ -1518,9 +1539,6 @@ xfs_ioctl(
1518 case XFS_IOC_FSGROWFSRT: { 1539 case XFS_IOC_FSGROWFSRT: {
1519 xfs_growfs_rt_t in; 1540 xfs_growfs_rt_t in;
1520 1541
1521 if (!capable(CAP_SYS_ADMIN))
1522 return -EPERM;
1523
1524 if (copy_from_user(&in, arg, sizeof(in))) 1542 if (copy_from_user(&in, arg, sizeof(in)))
1525 return -XFS_ERROR(EFAULT); 1543 return -XFS_ERROR(EFAULT);
1526 1544
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 00000000000..8c16bf2d7e0
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,82 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IOCTL_H__
19#define __XFS_IOCTL_H__
20
21extern int
22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp,
26 int ioflags,
27 unsigned int cmd,
28 xfs_flock64_t *bf);
29
30extern int
31xfs_find_handle(
32 unsigned int cmd,
33 xfs_fsop_handlereq_t *hreq);
34
35extern int
36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp,
40 struct inode *parinode);
41
42extern int
43xfs_readlink_by_handle(
44 xfs_mount_t *mp,
45 xfs_fsop_handlereq_t *hreq,
46 struct inode *parinode);
47
48extern int
49xfs_attrmulti_attr_get(
50 struct inode *inode,
51 char *name,
52 char __user *ubuf,
53 __uint32_t *len,
54 __uint32_t flags);
55
56extern int
57 xfs_attrmulti_attr_set(
58 struct inode *inode,
59 char *name,
60 const char __user *ubuf,
61 __uint32_t len,
62 __uint32_t flags);
63
64extern int
65xfs_attrmulti_attr_remove(
66 struct inode *inode,
67 char *name,
68 __uint32_t flags);
69
70extern long
71xfs_file_ioctl(
72 struct file *filp,
73 unsigned int cmd,
74 unsigned long p);
75
76extern long
77xfs_file_compat_ioctl(
78 struct file *file,
79 unsigned int cmd,
80 unsigned long arg);
81
82#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index a4b254eb43b..0504cece9f6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -16,11 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/init.h>
20#include <linux/ioctl.h> 19#include <linux/ioctl.h>
21#include <linux/syscalls.h>
22#include <linux/types.h>
23#include <linux/fs.h>
24#include <asm/uaccess.h> 20#include <asm/uaccess.h>
25#include "xfs.h" 21#include "xfs.h"
26#include "xfs_fs.h" 22#include "xfs_fs.h"
@@ -36,7 +32,6 @@
36#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
37#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
38#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
39#include "xfs_vfs.h"
40#include "xfs_vnode.h" 35#include "xfs_vnode.h"
41#include "xfs_dinode.h" 36#include "xfs_dinode.h"
42#include "xfs_inode.h" 37#include "xfs_inode.h"
@@ -44,221 +39,219 @@
44#include "xfs_error.h" 39#include "xfs_error.h"
45#include "xfs_dfrag.h" 40#include "xfs_dfrag.h"
46#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_fsops.h"
43#include "xfs_alloc.h"
44#include "xfs_rtalloc.h"
45#include "xfs_attr.h"
46#include "xfs_ioctl.h"
47#include "xfs_ioctl32.h" 47#include "xfs_ioctl32.h"
48 48
49#define _NATIVE_IOC(cmd, type) \ 49#define _NATIVE_IOC(cmd, type) \
50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
51 51
52#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 52#ifdef BROKEN_X86_ALIGNMENT
53#define BROKEN_X86_ALIGNMENT 53STATIC int
54#define _PACKED __attribute__((packed)) 54xfs_compat_flock64_copyin(
55/* on ia32 l_start is on a 32-bit boundary */ 55 xfs_flock64_t *bf,
56typedef struct xfs_flock64_32 { 56 compat_xfs_flock64_t __user *arg32)
57 __s16 l_type;
58 __s16 l_whence;
59 __s64 l_start __attribute__((packed));
60 /* len == 0 means until end of file */
61 __s64 l_len __attribute__((packed));
62 __s32 l_sysid;
63 __u32 l_pid;
64 __s32 l_pad[4]; /* reserve area */
65} xfs_flock64_32_t;
66
67#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
68#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
69#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
70#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
71#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
72#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
73#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
74#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
75
76/* just account for different alignment */
77STATIC unsigned long
78xfs_ioctl32_flock(
79 unsigned long arg)
80{ 57{
81 xfs_flock64_32_t __user *p32 = (void __user *)arg; 58 if (get_user(bf->l_type, &arg32->l_type) ||
82 xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); 59 get_user(bf->l_whence, &arg32->l_whence) ||
83 60 get_user(bf->l_start, &arg32->l_start) ||
84 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 61 get_user(bf->l_len, &arg32->l_len) ||
85 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || 62 get_user(bf->l_sysid, &arg32->l_sysid) ||
86 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || 63 get_user(bf->l_pid, &arg32->l_pid) ||
87 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || 64 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
88 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || 65 return -XFS_ERROR(EFAULT);
89 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || 66 return 0;
90 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
91 return -EFAULT;
92
93 return (unsigned long)p;
94} 67}
95 68
96typedef struct compat_xfs_fsop_geom_v1 { 69STATIC int
97 __u32 blocksize; /* filesystem (data) block size */ 70xfs_compat_ioc_fsgeometry_v1(
98 __u32 rtextsize; /* realtime extent size */ 71 struct xfs_mount *mp,
99 __u32 agblocks; /* fsblocks in an AG */ 72 compat_xfs_fsop_geom_v1_t __user *arg32)
100 __u32 agcount; /* number of allocation groups */
101 __u32 logblocks; /* fsblocks in the log */
102 __u32 sectsize; /* (data) sector size, bytes */
103 __u32 inodesize; /* inode size in bytes */
104 __u32 imaxpct; /* max allowed inode space(%) */
105 __u64 datablocks; /* fsblocks in data subvolume */
106 __u64 rtblocks; /* fsblocks in realtime subvol */
107 __u64 rtextents; /* rt extents in realtime subvol*/
108 __u64 logstart; /* starting fsblock of the log */
109 unsigned char uuid[16]; /* unique id of the filesystem */
110 __u32 sunit; /* stripe unit, fsblocks */
111 __u32 swidth; /* stripe width, fsblocks */
112 __s32 version; /* structure version */
113 __u32 flags; /* superblock version flags */
114 __u32 logsectsize; /* log sector size, bytes */
115 __u32 rtsectsize; /* realtime sector size, bytes */
116 __u32 dirblocksize; /* directory block size, bytes */
117} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
118
119#define XFS_IOC_FSGEOMETRY_V1_32 \
120 _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
121
122STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
123{ 73{
124 compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; 74 xfs_fsop_geom_t fsgeo;
125 xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); 75 int error;
126 76
127 if (copy_in_user(p, p32, sizeof(*p32))) 77 error = xfs_fs_geometry(mp, &fsgeo, 3);
128 return -EFAULT; 78 if (error)
129 return (unsigned long)p; 79 return -error;
80 /* The 32-bit variant simply has some padding at the end */
81 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
82 return -XFS_ERROR(EFAULT);
83 return 0;
130} 84}
131 85
132typedef struct compat_xfs_inogrp { 86STATIC int
133 __u64 xi_startino; /* starting inode number */ 87xfs_compat_growfs_data_copyin(
134 __s32 xi_alloccount; /* # bits set in allocmask */ 88 struct xfs_growfs_data *in,
135 __u64 xi_allocmask; /* mask of allocated inodes */ 89 compat_xfs_growfs_data_t __user *arg32)
136} __attribute__((packed)) compat_xfs_inogrp_t;
137
138STATIC int xfs_inumbers_fmt_compat(
139 void __user *ubuffer,
140 const xfs_inogrp_t *buffer,
141 long count,
142 long *written)
143{ 90{
144 compat_xfs_inogrp_t __user *p32 = ubuffer; 91 if (get_user(in->newblocks, &arg32->newblocks) ||
145 long i; 92 get_user(in->imaxpct, &arg32->imaxpct))
93 return -XFS_ERROR(EFAULT);
94 return 0;
95}
96
97STATIC int
98xfs_compat_growfs_rt_copyin(
99 struct xfs_growfs_rt *in,
100 compat_xfs_growfs_rt_t __user *arg32)
101{
102 if (get_user(in->newblocks, &arg32->newblocks) ||
103 get_user(in->extsize, &arg32->extsize))
104 return -XFS_ERROR(EFAULT);
105 return 0;
106}
107
108STATIC int
109xfs_inumbers_fmt_compat(
110 void __user *ubuffer,
111 const xfs_inogrp_t *buffer,
112 long count,
113 long *written)
114{
115 compat_xfs_inogrp_t __user *p32 = ubuffer;
116 long i;
146 117
147 for (i = 0; i < count; i++) { 118 for (i = 0; i < count; i++) {
148 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || 119 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
149 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || 120 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
150 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) 121 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
151 return -EFAULT; 122 return -XFS_ERROR(EFAULT);
152 } 123 }
153 *written = count * sizeof(*p32); 124 *written = count * sizeof(*p32);
154 return 0; 125 return 0;
155} 126}
156 127
157#else 128#else
158
159#define xfs_inumbers_fmt_compat xfs_inumbers_fmt 129#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
160#define _PACKED 130#endif /* BROKEN_X86_ALIGNMENT */
161 131
162#endif 132STATIC int
133xfs_ioctl32_bstime_copyin(
134 xfs_bstime_t *bstime,
135 compat_xfs_bstime_t __user *bstime32)
136{
137 compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
163 138
164/* XFS_IOC_FSBULKSTAT and friends */ 139 if (get_user(sec32, &bstime32->tv_sec) ||
140 get_user(bstime->tv_nsec, &bstime32->tv_nsec))
141 return -XFS_ERROR(EFAULT);
142 bstime->tv_sec = sec32;
143 return 0;
144}
145
146/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
147STATIC int
148xfs_ioctl32_bstat_copyin(
149 xfs_bstat_t *bstat,
150 compat_xfs_bstat_t __user *bstat32)
151{
152 if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
153 get_user(bstat->bs_mode, &bstat32->bs_mode) ||
154 get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
155 get_user(bstat->bs_uid, &bstat32->bs_uid) ||
156 get_user(bstat->bs_gid, &bstat32->bs_gid) ||
157 get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
158 get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
159 get_user(bstat->bs_size, &bstat32->bs_size) ||
160 xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
161 xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
162 xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
163 get_user(bstat->bs_blocks, &bstat32->bs_size) ||
164 get_user(bstat->bs_xflags, &bstat32->bs_size) ||
165 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
166 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
167 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
168 get_user(bstat->bs_projid, &bstat32->bs_projid) ||
169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
172 return -XFS_ERROR(EFAULT);
173 return 0;
174}
165 175
166typedef struct compat_xfs_bstime { 176/* XFS_IOC_FSBULKSTAT and friends */
167 __s32 tv_sec; /* seconds */
168 __s32 tv_nsec; /* and nanoseconds */
169} compat_xfs_bstime_t;
170 177
171STATIC int xfs_bstime_store_compat( 178STATIC int
172 compat_xfs_bstime_t __user *p32, 179xfs_bstime_store_compat(
173 const xfs_bstime_t *p) 180 compat_xfs_bstime_t __user *p32,
181 const xfs_bstime_t *p)
174{ 182{
175 __s32 sec32; 183 __s32 sec32;
176 184
177 sec32 = p->tv_sec; 185 sec32 = p->tv_sec;
178 if (put_user(sec32, &p32->tv_sec) || 186 if (put_user(sec32, &p32->tv_sec) ||
179 put_user(p->tv_nsec, &p32->tv_nsec)) 187 put_user(p->tv_nsec, &p32->tv_nsec))
180 return -EFAULT; 188 return -XFS_ERROR(EFAULT);
181 return 0; 189 return 0;
182} 190}
183 191
184typedef struct compat_xfs_bstat { 192/* Return 0 on success or positive error (to xfs_bulkstat()) */
185 __u64 bs_ino; /* inode number */ 193STATIC int
186 __u16 bs_mode; /* type and mode */ 194xfs_bulkstat_one_fmt_compat(
187 __u16 bs_nlink; /* number of links */
188 __u32 bs_uid; /* user id */
189 __u32 bs_gid; /* group id */
190 __u32 bs_rdev; /* device value */
191 __s32 bs_blksize; /* block size */
192 __s64 bs_size; /* file size */
193 compat_xfs_bstime_t bs_atime; /* access time */
194 compat_xfs_bstime_t bs_mtime; /* modify time */
195 compat_xfs_bstime_t bs_ctime; /* inode change time */
196 int64_t bs_blocks; /* number of blocks */
197 __u32 bs_xflags; /* extended flags */
198 __s32 bs_extsize; /* extent size */
199 __s32 bs_extents; /* number of extents */
200 __u32 bs_gen; /* generation count */
201 __u16 bs_projid; /* project id */
202 unsigned char bs_pad[14]; /* pad space, unused */
203 __u32 bs_dmevmask; /* DMIG event mask */
204 __u16 bs_dmstate; /* DMIG state info */
205 __u16 bs_aextents; /* attribute number of extents */
206} _PACKED compat_xfs_bstat_t;
207
208STATIC int xfs_bulkstat_one_fmt_compat(
209 void __user *ubuffer, 195 void __user *ubuffer,
196 int ubsize,
197 int *ubused,
210 const xfs_bstat_t *buffer) 198 const xfs_bstat_t *buffer)
211{ 199{
212 compat_xfs_bstat_t __user *p32 = ubuffer; 200 compat_xfs_bstat_t __user *p32 = ubuffer;
213 201
214 if (put_user(buffer->bs_ino, &p32->bs_ino) || 202 if (ubsize < sizeof(*p32))
215 put_user(buffer->bs_mode, &p32->bs_mode) || 203 return XFS_ERROR(ENOMEM);
216 put_user(buffer->bs_nlink, &p32->bs_nlink) || 204
217 put_user(buffer->bs_uid, &p32->bs_uid) || 205 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
218 put_user(buffer->bs_gid, &p32->bs_gid) || 206 put_user(buffer->bs_mode, &p32->bs_mode) ||
219 put_user(buffer->bs_rdev, &p32->bs_rdev) || 207 put_user(buffer->bs_nlink, &p32->bs_nlink) ||
220 put_user(buffer->bs_blksize, &p32->bs_blksize) || 208 put_user(buffer->bs_uid, &p32->bs_uid) ||
221 put_user(buffer->bs_size, &p32->bs_size) || 209 put_user(buffer->bs_gid, &p32->bs_gid) ||
210 put_user(buffer->bs_rdev, &p32->bs_rdev) ||
211 put_user(buffer->bs_blksize, &p32->bs_blksize) ||
212 put_user(buffer->bs_size, &p32->bs_size) ||
222 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || 213 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
223 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || 214 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
224 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || 215 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
225 put_user(buffer->bs_blocks, &p32->bs_blocks) || 216 put_user(buffer->bs_blocks, &p32->bs_blocks) ||
226 put_user(buffer->bs_xflags, &p32->bs_xflags) || 217 put_user(buffer->bs_xflags, &p32->bs_xflags) ||
227 put_user(buffer->bs_extsize, &p32->bs_extsize) || 218 put_user(buffer->bs_extsize, &p32->bs_extsize) ||
228 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
229 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
230 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
231 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 222 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
232 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 223 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
233 put_user(buffer->bs_aextents, &p32->bs_aextents)) 224 put_user(buffer->bs_aextents, &p32->bs_aextents))
234 return -EFAULT; 225 return XFS_ERROR(EFAULT);
235 return sizeof(*p32); 226 if (ubused)
227 *ubused = sizeof(*p32);
228 return 0;
236} 229}
237 230
238 231STATIC int
239 232xfs_bulkstat_one_compat(
240typedef struct compat_xfs_fsop_bulkreq { 233 xfs_mount_t *mp, /* mount point for filesystem */
241 compat_uptr_t lastip; /* last inode # pointer */ 234 xfs_ino_t ino, /* inode number to get data for */
242 __s32 icount; /* count of entries in buffer */ 235 void __user *buffer, /* buffer to place output in */
243 compat_uptr_t ubuffer; /* user buffer for inode desc. */ 236 int ubsize, /* size of buffer */
244 compat_uptr_t ocount; /* output count pointer */ 237 void *private_data, /* my private data */
245} compat_xfs_fsop_bulkreq_t; 238 xfs_daddr_t bno, /* starting bno of inode cluster */
246 239 int *ubused, /* bytes used by me */
247#define XFS_IOC_FSBULKSTAT_32 \ 240 void *dibuff, /* on-disk inode buffer */
248 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) 241 int *stat) /* BULKSTAT_RV_... */
249#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ 242{
250 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) 243 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
251#define XFS_IOC_FSINUMBERS_32 \ 244 xfs_bulkstat_one_fmt_compat, bno,
252 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) 245 ubused, dibuff, stat);
246}
253 247
254/* copied from xfs_ioctl.c */ 248/* copied from xfs_ioctl.c */
255STATIC int 249STATIC int
256xfs_ioc_bulkstat_compat( 250xfs_compat_ioc_bulkstat(
257 xfs_mount_t *mp, 251 xfs_mount_t *mp,
258 unsigned int cmd, 252 unsigned int cmd,
259 void __user *arg) 253 compat_xfs_fsop_bulkreq_t __user *p32)
260{ 254{
261 compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
262 u32 addr; 255 u32 addr;
263 xfs_fsop_bulkreq_t bulkreq; 256 xfs_fsop_bulkreq_t bulkreq;
264 int count; /* # of records returned */ 257 int count; /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
270 /* should be called again (unused here, but used in dmapi) */ 263 /* should be called again (unused here, but used in dmapi) */
271 264
272 if (!capable(CAP_SYS_ADMIN)) 265 if (!capable(CAP_SYS_ADMIN))
273 return -EPERM; 266 return -XFS_ERROR(EPERM);
274 267
275 if (XFS_FORCED_SHUTDOWN(mp)) 268 if (XFS_FORCED_SHUTDOWN(mp))
276 return -XFS_ERROR(EIO); 269 return -XFS_ERROR(EIO);
277 270
278 if (get_user(addr, &p32->lastip)) 271 if (get_user(addr, &p32->lastip))
279 return -EFAULT; 272 return -XFS_ERROR(EFAULT);
280 bulkreq.lastip = compat_ptr(addr); 273 bulkreq.lastip = compat_ptr(addr);
281 if (get_user(bulkreq.icount, &p32->icount) || 274 if (get_user(bulkreq.icount, &p32->icount) ||
282 get_user(addr, &p32->ubuffer)) 275 get_user(addr, &p32->ubuffer))
283 return -EFAULT; 276 return -XFS_ERROR(EFAULT);
284 bulkreq.ubuffer = compat_ptr(addr); 277 bulkreq.ubuffer = compat_ptr(addr);
285 if (get_user(addr, &p32->ocount)) 278 if (get_user(addr, &p32->ocount))
286 return -EFAULT; 279 return -XFS_ERROR(EFAULT);
287 bulkreq.ocount = compat_ptr(addr); 280 bulkreq.ocount = compat_ptr(addr);
288 281
289 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 282 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
295 if (bulkreq.ubuffer == NULL) 288 if (bulkreq.ubuffer == NULL)
296 return -XFS_ERROR(EINVAL); 289 return -XFS_ERROR(EINVAL);
297 290
298 if (cmd == XFS_IOC_FSINUMBERS) 291 if (cmd == XFS_IOC_FSINUMBERS_32) {
299 error = xfs_inumbers(mp, &inlast, &count, 292 error = xfs_inumbers(mp, &inlast, &count,
300 bulkreq.ubuffer, xfs_inumbers_fmt_compat); 293 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
301 else { 294 } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
302 /* declare a var to get a warning in case the type changes */ 295 int res;
303 bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; 296
297 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
298 sizeof(compat_xfs_bstat_t),
299 NULL, 0, NULL, NULL, &res);
300 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 301 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one, formatter, 302 xfs_bulkstat_one_compat, NULL,
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 303 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
307 BULKSTAT_FG_QUICK, &done); 304 BULKSTAT_FG_QUICK, &done);
308 } 305 } else
306 error = XFS_ERROR(EINVAL);
309 if (error) 307 if (error)
310 return -error; 308 return -error;
311 309
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
321 return 0; 319 return 0;
322} 320}
323 321
322STATIC int
323xfs_compat_handlereq_copyin(
324 xfs_fsop_handlereq_t *hreq,
325 compat_xfs_fsop_handlereq_t __user *arg32)
326{
327 compat_xfs_fsop_handlereq_t hreq32;
328
329 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
330 return -XFS_ERROR(EFAULT);
331
332 hreq->fd = hreq32.fd;
333 hreq->path = compat_ptr(hreq32.path);
334 hreq->oflags = hreq32.oflags;
335 hreq->ihandle = compat_ptr(hreq32.ihandle);
336 hreq->ihandlen = hreq32.ihandlen;
337 hreq->ohandle = compat_ptr(hreq32.ohandle);
338 hreq->ohandlen = compat_ptr(hreq32.ohandlen);
324 339
340 return 0;
341}
325 342
326typedef struct compat_xfs_fsop_handlereq { 343/*
327 __u32 fd; /* fd for FD_TO_HANDLE */ 344 * Convert userspace handle data into inode.
328 compat_uptr_t path; /* user pathname */ 345 *
329 __u32 oflags; /* open flags */ 346 * We use the fact that all the fsop_handlereq ioctl calls have a data
330 compat_uptr_t ihandle; /* user supplied handle */ 347 * structure argument whose first component is always a xfs_fsop_handlereq_t,
331 __u32 ihandlen; /* user supplied length */ 348 * so we can pass that sub structure into this handy, shared routine.
332 compat_uptr_t ohandle; /* user buffer for handle */ 349 *
333 compat_uptr_t ohandlen; /* user buffer length */ 350 * If no error, caller must always iput the returned inode.
334} compat_xfs_fsop_handlereq_t; 351 */
335 352STATIC int
336#define XFS_IOC_PATH_TO_FSHANDLE_32 \ 353xfs_vget_fsop_handlereq_compat(
337 _IOWR('X', 104, struct compat_xfs_fsop_handlereq) 354 xfs_mount_t *mp,
338#define XFS_IOC_PATH_TO_HANDLE_32 \ 355 struct inode *parinode, /* parent inode pointer */
339 _IOWR('X', 105, struct compat_xfs_fsop_handlereq) 356 compat_xfs_fsop_handlereq_t *hreq,
340#define XFS_IOC_FD_TO_HANDLE_32 \ 357 struct inode **inode)
341 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
342#define XFS_IOC_OPEN_BY_HANDLE_32 \
343 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
344#define XFS_IOC_READLINK_BY_HANDLE_32 \
345 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
346
347STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
348{ 358{
349 compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; 359 void __user *hanp;
350 xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); 360 size_t hlen;
351 u32 addr; 361 xfs_fid_t *xfid;
352 362 xfs_handle_t *handlep;
353 if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || 363 xfs_handle_t handle;
354 get_user(addr, &p32->path) || 364 xfs_inode_t *ip;
355 put_user(compat_ptr(addr), &p->path) || 365 xfs_ino_t ino;
356 copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || 366 __u32 igen;
357 get_user(addr, &p32->ihandle) || 367 int error;
358 put_user(compat_ptr(addr), &p->ihandle) || 368
359 copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || 369 /*
360 get_user(addr, &p32->ohandle) || 370 * Only allow handle opens under a directory.
361 put_user(compat_ptr(addr), &p->ohandle) || 371 */
362 get_user(addr, &p32->ohandlen) || 372 if (!S_ISDIR(parinode->i_mode))
363 put_user(compat_ptr(addr), &p->ohandlen)) 373 return XFS_ERROR(ENOTDIR);
364 return -EFAULT; 374
365 375 hanp = compat_ptr(hreq->ihandle);
366 return (unsigned long)p; 376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
367} 421}
368 422
423STATIC int
424xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp,
426 void __user *arg,
427 struct inode *parinode)
428{
429 int error;
430 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode;
433 char *kbuf;
434
435 if (!capable(CAP_SYS_ADMIN))
436 return -XFS_ERROR(EPERM);
437 if (copy_from_user(&al_hreq, arg,
438 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
439 return -XFS_ERROR(EFAULT);
440 if (al_hreq.buflen > XATTR_LIST_MAX)
441 return -XFS_ERROR(EINVAL);
442
443 /*
444 * Reject flags, only allow namespaces.
445 */
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL);
448
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
450 &inode);
451 if (error)
452 goto out;
453
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf)
456 goto out_vn_rele;
457
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor);
461 if (error)
462 goto out_kfree;
463
464 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
465 error = -EFAULT;
466
467 out_kfree:
468 kfree(kbuf);
469 out_vn_rele:
470 iput(inode);
471 out:
472 return -error;
473}
369 474
370STATIC long 475STATIC int
371xfs_compat_ioctl( 476xfs_compat_attrmulti_by_handle(
372 int mode, 477 xfs_mount_t *mp,
373 struct file *file, 478 void __user *arg,
374 unsigned cmd, 479 struct inode *parinode)
375 unsigned long arg) 480{
481 int error;
482 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode;
485 unsigned int i, size;
486 char *attr_name;
487
488 if (!capable(CAP_SYS_ADMIN))
489 return -XFS_ERROR(EPERM);
490 if (copy_from_user(&am_hreq, arg,
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT);
493
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
495 &inode);
496 if (error)
497 goto out;
498
499 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele;
503
504 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops)
507 goto out_vn_rele;
508
509 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
511 goto out_kfree_ops;
512
513 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
514 if (!attr_name)
515 goto out_kfree_ops;
516
517
518 error = 0;
519 for (i = 0; i < am_hreq.opcount; i++) {
520 ops[i].am_error = strncpy_from_user(attr_name,
521 compat_ptr(ops[i].am_attrname),
522 MAXNAMELEN);
523 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
524 error = -ERANGE;
525 if (ops[i].am_error < 0)
526 break;
527
528 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode,
531 attr_name,
532 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags);
534 break;
535 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode,
537 attr_name,
538 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags);
540 break;
541 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode,
543 attr_name, ops[i].am_flags);
544 break;
545 default:
546 ops[i].am_error = EINVAL;
547 }
548 }
549
550 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
551 error = XFS_ERROR(EFAULT);
552
553 kfree(attr_name);
554 out_kfree_ops:
555 kfree(ops);
556 out_vn_rele:
557 iput(inode);
558 out:
559 return -error;
560}
561
562STATIC int
563xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp,
565 void __user *arg,
566 struct inode *parinode)
567{
568 int error;
569 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode;
572
573 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM);
575 if (copy_from_user(&dmhreq, arg,
576 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT);
578
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
580 &inode);
581 if (error)
582 return -error;
583
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
585 error = -XFS_ERROR(EPERM);
586 goto out;
587 }
588
589 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
590 error = -XFS_ERROR(EFAULT);
591 goto out;
592 }
593
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate);
596
597out:
598 iput(inode);
599 return error;
600}
601
602long
603xfs_file_compat_ioctl(
604 struct file *filp,
605 unsigned cmd,
606 unsigned long p)
376{ 607{
377 struct inode *inode = file->f_path.dentry->d_inode; 608 struct inode *inode = filp->f_path.dentry->d_inode;
378 int error; 609 struct xfs_inode *ip = XFS_I(inode);
610 struct xfs_mount *mp = ip->i_mount;
611 void __user *arg = (void __user *)p;
612 int ioflags = 0;
613 int error;
614
615 if (filp->f_mode & FMODE_NOCMTIME)
616 ioflags |= IO_INVIS;
617
618 xfs_itrace_entry(ip);
379 619
380 switch (cmd) { 620 switch (cmd) {
621 /* No size or alignment issues on any arch */
381 case XFS_IOC_DIOINFO: 622 case XFS_IOC_DIOINFO:
382 case XFS_IOC_FSGEOMETRY: 623 case XFS_IOC_FSGEOMETRY:
383 case XFS_IOC_FSGETXATTR: 624 case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,18 @@ xfs_compat_ioctl(
387 case XFS_IOC_GETBMAP: 628 case XFS_IOC_GETBMAP:
388 case XFS_IOC_GETBMAPA: 629 case XFS_IOC_GETBMAPA:
389 case XFS_IOC_GETBMAPX: 630 case XFS_IOC_GETBMAPX:
390/* not handled
391 case XFS_IOC_FSSETDM_BY_HANDLE:
392 case XFS_IOC_ATTRLIST_BY_HANDLE:
393 case XFS_IOC_ATTRMULTI_BY_HANDLE:
394*/
395 case XFS_IOC_FSCOUNTS: 631 case XFS_IOC_FSCOUNTS:
396 case XFS_IOC_SET_RESBLKS: 632 case XFS_IOC_SET_RESBLKS:
397 case XFS_IOC_GET_RESBLKS: 633 case XFS_IOC_GET_RESBLKS:
398 case XFS_IOC_FSGROWFSDATA:
399 case XFS_IOC_FSGROWFSLOG: 634 case XFS_IOC_FSGROWFSLOG:
400 case XFS_IOC_FSGROWFSRT:
401 case XFS_IOC_FREEZE: 635 case XFS_IOC_FREEZE:
402 case XFS_IOC_THAW: 636 case XFS_IOC_THAW:
403 case XFS_IOC_GOINGDOWN: 637 case XFS_IOC_GOINGDOWN:
404 case XFS_IOC_ERROR_INJECTION: 638 case XFS_IOC_ERROR_INJECTION:
405 case XFS_IOC_ERROR_CLEARALL: 639 case XFS_IOC_ERROR_CLEARALL:
406 break; 640 return xfs_file_ioctl(filp, cmd, p);
407 641#ifndef BROKEN_X86_ALIGNMENT
408 case XFS_IOC32_GETXFLAGS: 642 /* These are handled fine if no alignment issues */
409 case XFS_IOC32_SETXFLAGS:
410 case XFS_IOC32_GETVERSION:
411 cmd = _NATIVE_IOC(cmd, long);
412 break;
413#ifdef BROKEN_X86_ALIGNMENT
414 /* xfs_flock_t has wrong u32 vs u64 alignment */
415 case XFS_IOC_ALLOCSP_32:
416 case XFS_IOC_FREESP_32:
417 case XFS_IOC_ALLOCSP64_32:
418 case XFS_IOC_FREESP64_32:
419 case XFS_IOC_RESVSP_32:
420 case XFS_IOC_UNRESVSP_32:
421 case XFS_IOC_RESVSP64_32:
422 case XFS_IOC_UNRESVSP64_32:
423 arg = xfs_ioctl32_flock(arg);
424 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
425 break;
426 case XFS_IOC_FSGEOMETRY_V1_32:
427 arg = xfs_ioctl32_geom_v1(arg);
428 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
429 break;
430
431#else /* These are handled fine if no alignment issues */
432 case XFS_IOC_ALLOCSP: 643 case XFS_IOC_ALLOCSP:
433 case XFS_IOC_FREESP: 644 case XFS_IOC_FREESP:
434 case XFS_IOC_RESVSP: 645 case XFS_IOC_RESVSP:
@@ -438,51 +649,97 @@ xfs_compat_ioctl(
438 case XFS_IOC_RESVSP64: 649 case XFS_IOC_RESVSP64:
439 case XFS_IOC_UNRESVSP64: 650 case XFS_IOC_UNRESVSP64:
440 case XFS_IOC_FSGEOMETRY_V1: 651 case XFS_IOC_FSGEOMETRY_V1:
441 break; 652 case XFS_IOC_FSGROWFSDATA:
653 case XFS_IOC_FSGROWFSRT:
654 return xfs_file_ioctl(filp, cmd, p);
655#else
656 case XFS_IOC_ALLOCSP_32:
657 case XFS_IOC_FREESP_32:
658 case XFS_IOC_ALLOCSP64_32:
659 case XFS_IOC_FREESP64_32:
660 case XFS_IOC_RESVSP_32:
661 case XFS_IOC_UNRESVSP_32:
662 case XFS_IOC_RESVSP64_32:
663 case XFS_IOC_UNRESVSP64_32: {
664 struct xfs_flock64 bf;
442 665
443 /* xfs_bstat_t still has wrong u32 vs u64 alignment */ 666 if (xfs_compat_flock64_copyin(&bf, arg))
444 case XFS_IOC_SWAPEXT: 667 return -XFS_ERROR(EFAULT);
445 break; 668 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
669 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
670 }
671 case XFS_IOC_FSGEOMETRY_V1_32:
672 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
673 case XFS_IOC_FSGROWFSDATA_32: {
674 struct xfs_growfs_data in;
675
676 if (xfs_compat_growfs_data_copyin(&in, arg))
677 return -XFS_ERROR(EFAULT);
678 error = xfs_growfs_data(mp, &in);
679 return -error;
680 }
681 case XFS_IOC_FSGROWFSRT_32: {
682 struct xfs_growfs_rt in;
446 683
684 if (xfs_compat_growfs_rt_copyin(&in, arg))
685 return -XFS_ERROR(EFAULT);
686 error = xfs_growfs_rt(mp, &in);
687 return -error;
688 }
447#endif 689#endif
690 /* long changes size, but xfs only copiese out 32 bits */
691 case XFS_IOC_GETXFLAGS_32:
692 case XFS_IOC_SETXFLAGS_32:
693 case XFS_IOC_GETVERSION_32:
694 cmd = _NATIVE_IOC(cmd, long);
695 return xfs_file_ioctl(filp, cmd, p);
696 case XFS_IOC_SWAPEXT: {
697 struct xfs_swapext sxp;
698 struct compat_xfs_swapext __user *sxu = arg;
699
700 /* Bulk copy in up to the sx_stat field, then copy bstat */
701 if (copy_from_user(&sxp, sxu,
702 offsetof(struct xfs_swapext, sx_stat)) ||
703 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
704 return -XFS_ERROR(EFAULT);
705 error = xfs_swapext(&sxp);
706 return -error;
707 }
448 case XFS_IOC_FSBULKSTAT_32: 708 case XFS_IOC_FSBULKSTAT_32:
449 case XFS_IOC_FSBULKSTAT_SINGLE_32: 709 case XFS_IOC_FSBULKSTAT_SINGLE_32:
450 case XFS_IOC_FSINUMBERS_32: 710 case XFS_IOC_FSINUMBERS_32:
451 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); 711 return xfs_compat_ioc_bulkstat(mp, cmd, arg);
452 return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
453 cmd, (void __user*)arg);
454 case XFS_IOC_FD_TO_HANDLE_32: 712 case XFS_IOC_FD_TO_HANDLE_32:
455 case XFS_IOC_PATH_TO_HANDLE_32: 713 case XFS_IOC_PATH_TO_HANDLE_32:
456 case XFS_IOC_PATH_TO_FSHANDLE_32: 714 case XFS_IOC_PATH_TO_FSHANDLE_32: {
457 case XFS_IOC_OPEN_BY_HANDLE_32: 715 struct xfs_fsop_handlereq hreq;
458 case XFS_IOC_READLINK_BY_HANDLE_32: 716
459 arg = xfs_ioctl32_fshandle(arg); 717 if (xfs_compat_handlereq_copyin(&hreq, arg))
718 return -XFS_ERROR(EFAULT);
460 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); 719 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
461 break; 720 return xfs_find_handle(cmd, &hreq);
462 default:
463 return -ENOIOCTLCMD;
464 } 721 }
722 case XFS_IOC_OPEN_BY_HANDLE_32: {
723 struct xfs_fsop_handlereq hreq;
465 724
466 error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); 725 if (xfs_compat_handlereq_copyin(&hreq, arg))
467 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); 726 return -XFS_ERROR(EFAULT);
468 727 return xfs_open_by_handle(mp, &hreq, filp, inode);
469 return error; 728 }
470} 729 case XFS_IOC_READLINK_BY_HANDLE_32: {
471 730 struct xfs_fsop_handlereq hreq;
472long
473xfs_file_compat_ioctl(
474 struct file *file,
475 unsigned cmd,
476 unsigned long arg)
477{
478 return xfs_compat_ioctl(0, file, cmd, arg);
479}
480 731
481long 732 if (xfs_compat_handlereq_copyin(&hreq, arg))
482xfs_file_compat_invis_ioctl( 733 return -XFS_ERROR(EFAULT);
483 struct file *file, 734 return xfs_readlink_by_handle(mp, &hreq, inode);
484 unsigned cmd, 735 }
485 unsigned long arg) 736 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
486{ 737 return xfs_compat_attrlist_by_handle(mp, arg, inode);
487 return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); 738 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
739 return xfs_compat_attrmulti_by_handle(mp, arg, inode);
740 case XFS_IOC_FSSETDM_BY_HANDLE_32:
741 return xfs_compat_fssetdm_by_handle(mp, arg, inode);
742 default:
743 return -XFS_ERROR(ENOIOCTLCMD);
744 }
488} 745}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 02de6e62ee3..1024c4f8ba0 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -18,7 +18,217 @@
18#ifndef __XFS_IOCTL32_H__ 18#ifndef __XFS_IOCTL32_H__
19#define __XFS_IOCTL32_H__ 19#define __XFS_IOCTL32_H__
20 20
21extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); 21#include <linux/compat.h>
22extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); 22
23/*
24 * on 32-bit arches, ioctl argument structures may have different sizes
25 * and/or alignment. We define compat structures which match the
26 * 32-bit sizes/alignments here, and their associated ioctl numbers.
27 *
28 * xfs_ioctl32.c contains routines to copy these structures in and out.
29 */
30
31/* stock kernel-level ioctls we support */
32#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
33#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
34#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
35
36/*
37 * On intel, even if sizes match, alignment and/or padding may differ.
38 */
39#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
40#define BROKEN_X86_ALIGNMENT
41#define __compat_packed __attribute__((packed))
42#else
43#define __compat_packed
44#endif
45
46typedef struct compat_xfs_bstime {
47 compat_time_t tv_sec; /* seconds */
48 __s32 tv_nsec; /* and nanoseconds */
49} compat_xfs_bstime_t;
50
51typedef struct compat_xfs_bstat {
52 __u64 bs_ino; /* inode number */
53 __u16 bs_mode; /* type and mode */
54 __u16 bs_nlink; /* number of links */
55 __u32 bs_uid; /* user id */
56 __u32 bs_gid; /* group id */
57 __u32 bs_rdev; /* device value */
58 __s32 bs_blksize; /* block size */
59 __s64 bs_size; /* file size */
60 compat_xfs_bstime_t bs_atime; /* access time */
61 compat_xfs_bstime_t bs_mtime; /* modify time */
62 compat_xfs_bstime_t bs_ctime; /* inode change time */
63 int64_t bs_blocks; /* number of blocks */
64 __u32 bs_xflags; /* extended flags */
65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */
69 unsigned char bs_pad[14]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */
73} __compat_packed compat_xfs_bstat_t;
74
75typedef struct compat_xfs_fsop_bulkreq {
76 compat_uptr_t lastip; /* last inode # pointer */
77 __s32 icount; /* count of entries in buffer */
78 compat_uptr_t ubuffer; /* user buffer for inode desc. */
79 compat_uptr_t ocount; /* output count pointer */
80} compat_xfs_fsop_bulkreq_t;
81
82#define XFS_IOC_FSBULKSTAT_32 \
83 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
84#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
85 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
86#define XFS_IOC_FSINUMBERS_32 \
87 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
88
89typedef struct compat_xfs_fsop_handlereq {
90 __u32 fd; /* fd for FD_TO_HANDLE */
91 compat_uptr_t path; /* user pathname */
92 __u32 oflags; /* open flags */
93 compat_uptr_t ihandle; /* user supplied handle */
94 __u32 ihandlen; /* user supplied length */
95 compat_uptr_t ohandle; /* user buffer for handle */
96 compat_uptr_t ohandlen; /* user buffer length */
97} compat_xfs_fsop_handlereq_t;
98
99#define XFS_IOC_PATH_TO_FSHANDLE_32 \
100 _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
101#define XFS_IOC_PATH_TO_HANDLE_32 \
102 _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
103#define XFS_IOC_FD_TO_HANDLE_32 \
104 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
105#define XFS_IOC_OPEN_BY_HANDLE_32 \
106 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
107#define XFS_IOC_READLINK_BY_HANDLE_32 \
108 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
109
110/* The bstat field in the swapext struct needs translation */
111typedef struct compat_xfs_swapext {
112 __int64_t sx_version; /* version */
113 __int64_t sx_fdtarget; /* fd of target file */
114 __int64_t sx_fdtmp; /* fd of tmp file */
115 xfs_off_t sx_offset; /* offset into file */
116 xfs_off_t sx_length; /* leng from offset */
117 char sx_pad[16]; /* pad space, unused */
118 compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */
119} __compat_packed compat_xfs_swapext_t;
120
121#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
122
123typedef struct compat_xfs_fsop_attrlist_handlereq {
124 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
125 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
126 __u32 flags; /* which namespace to use */
127 __u32 buflen; /* length of buffer supplied */
128 compat_uptr_t buffer; /* returned names */
129} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
130
131/* Note: actually this is read/write */
132#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
133 _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
134
135/* am_opcodes defined in xfs_fs.h */
136typedef struct compat_xfs_attr_multiop {
137 __u32 am_opcode;
138 __s32 am_error;
139 compat_uptr_t am_attrname;
140 compat_uptr_t am_attrvalue;
141 __u32 am_length;
142 __u32 am_flags;
143} compat_xfs_attr_multiop_t;
144
145typedef struct compat_xfs_fsop_attrmulti_handlereq {
146 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
147 __u32 opcount;/* count of following multiop */
148 /* ptr to compat_xfs_attr_multiop */
149 compat_uptr_t ops; /* attr_multi data */
150} compat_xfs_fsop_attrmulti_handlereq_t;
151
152#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
153 _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
154
155typedef struct compat_xfs_fsop_setdm_handlereq {
156 struct compat_xfs_fsop_handlereq hreq; /* handle information */
157 /* ptr to struct fsdmidata */
158 compat_uptr_t data; /* DMAPI data */
159} compat_xfs_fsop_setdm_handlereq_t;
160
161#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
162 _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
163
164#ifdef BROKEN_X86_ALIGNMENT
165/* on ia32 l_start is on a 32-bit boundary */
166typedef struct compat_xfs_flock64 {
167 __s16 l_type;
168 __s16 l_whence;
169 __s64 l_start __attribute__((packed));
170 /* len == 0 means until end of file */
171 __s64 l_len __attribute__((packed));
172 __s32 l_sysid;
173 __u32 l_pid;
174 __s32 l_pad[4]; /* reserve area */
175} compat_xfs_flock64_t;
176
177#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
178#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
179#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
180#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
181#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
185
186typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */
188 __u32 rtextsize; /* realtime extent size */
189 __u32 agblocks; /* fsblocks in an AG */
190 __u32 agcount; /* number of allocation groups */
191 __u32 logblocks; /* fsblocks in the log */
192 __u32 sectsize; /* (data) sector size, bytes */
193 __u32 inodesize; /* inode size in bytes */
194 __u32 imaxpct; /* max allowed inode space(%) */
195 __u64 datablocks; /* fsblocks in data subvolume */
196 __u64 rtblocks; /* fsblocks in realtime subvol */
197 __u64 rtextents; /* rt extents in realtime subvol*/
198 __u64 logstart; /* starting fsblock of the log */
199 unsigned char uuid[16]; /* unique id of the filesystem */
200 __u32 sunit; /* stripe unit, fsblocks */
201 __u32 swidth; /* stripe width, fsblocks */
202 __s32 version; /* structure version */
203 __u32 flags; /* superblock version flags */
204 __u32 logsectsize; /* log sector size, bytes */
205 __u32 rtsectsize; /* realtime sector size, bytes */
206 __u32 dirblocksize; /* directory block size, bytes */
207} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
208
209#define XFS_IOC_FSGEOMETRY_V1_32 \
210 _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
211
212typedef struct compat_xfs_inogrp {
213 __u64 xi_startino; /* starting inode number */
214 __s32 xi_alloccount; /* # bits set in allocmask */
215 __u64 xi_allocmask; /* mask of allocated inodes */
216} __attribute__((packed)) compat_xfs_inogrp_t;
217
218/* These growfs input structures have padding on the end, so must translate */
219typedef struct compat_xfs_growfs_data {
220 __u64 newblocks; /* new data subvol size, fsblocks */
221 __u32 imaxpct; /* new inode space percentage limit */
222} __attribute__((packed)) compat_xfs_growfs_data_t;
223
224typedef struct compat_xfs_growfs_rt {
225 __u64 newblocks; /* new realtime size, fsblocks */
226 __u32 extsize; /* new realtime extent size, fsblocks */
227} __attribute__((packed)) compat_xfs_growfs_rt_t;
228
229#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
230#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
231
232#endif /* BROKEN_X86_ALIGNMENT */
23 233
24#endif /* __XFS_IOCTL32_H__ */ 234#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 095d271f343..7aa53fefc67 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -53,6 +53,7 @@
53#include <linux/namei.h> 53#include <linux/namei.h>
54#include <linux/security.h> 54#include <linux/security.h>
55#include <linux/falloc.h> 55#include <linux/falloc.h>
56#include <linux/fiemap.h>
56 57
57/* 58/*
58 * Bring the atime in the XFS inode uptodate. 59 * Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
64{ 65{
65 struct inode *inode = VFS_I(ip); 66 struct inode *inode = VFS_I(ip);
66 67
67 if (inode) { 68 if (!(inode->i_state & I_CLEAR)) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 69 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
69 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 70 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
70 } 71 }
71} 72}
72 73
73/* 74/*
74 * If the linux inode exists, mark it dirty. 75 * If the linux inode is valid, mark it dirty.
75 * Used when commiting a dirty inode into a transaction so that 76 * Used when commiting a dirty inode into a transaction so that
76 * the inode will get written back by the linux code 77 * the inode will get written back by the linux code
77 */ 78 */
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
81{ 82{
82 struct inode *inode = VFS_I(ip); 83 struct inode *inode = VFS_I(ip);
83 84
84 if (inode) 85 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
85 mark_inode_dirty_sync(inode); 86 mark_inode_dirty_sync(inode);
86} 87}
87 88
@@ -128,7 +129,7 @@ xfs_ichgtime(
128 if (sync_it) { 129 if (sync_it) {
129 SYNCHRONIZE(); 130 SYNCHRONIZE();
130 ip->i_update_core = 1; 131 ip->i_update_core = 1;
131 mark_inode_dirty_sync(inode); 132 xfs_mark_inode_dirty_sync(ip);
132 } 133 }
133} 134}
134 135
@@ -158,8 +159,6 @@ xfs_init_security(
158 } 159 }
159 160
160 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); 161 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
161 if (!error)
162 xfs_iflags_set(ip, XFS_IMODIFIED);
163 162
164 kfree(name); 163 kfree(name);
165 kfree(value); 164 kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
260 error = _ACL_INHERIT(inode, mode, default_acl); 259 error = _ACL_INHERIT(inode, mode, default_acl);
261 if (unlikely(error)) 260 if (unlikely(error))
262 goto out_cleanup_inode; 261 goto out_cleanup_inode;
263 xfs_iflags_set(ip, XFS_IMODIFIED);
264 _ACL_FREE(default_acl); 262 _ACL_FREE(default_acl);
265 } 263 }
266 264
@@ -366,21 +364,17 @@ xfs_vn_link(
366 struct inode *dir, 364 struct inode *dir,
367 struct dentry *dentry) 365 struct dentry *dentry)
368{ 366{
369 struct inode *inode; /* inode of guy being linked to */ 367 struct inode *inode = old_dentry->d_inode;
370 struct xfs_name name; 368 struct xfs_name name;
371 int error; 369 int error;
372 370
373 inode = old_dentry->d_inode;
374 xfs_dentry_to_name(&name, dentry); 371 xfs_dentry_to_name(&name, dentry);
375 372
376 igrab(inode);
377 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 373 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
378 if (unlikely(error)) { 374 if (unlikely(error))
379 iput(inode);
380 return -error; 375 return -error;
381 }
382 376
383 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); 377 atomic_inc(&inode->i_count);
384 d_instantiate(dentry, inode); 378 d_instantiate(dentry, inode);
385 return 0; 379 return 0;
386} 380}
@@ -601,7 +595,7 @@ xfs_vn_setattr(
601 struct dentry *dentry, 595 struct dentry *dentry,
602 struct iattr *iattr) 596 struct iattr *iattr)
603{ 597{
604 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); 598 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
605} 599}
606 600
607/* 601/*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
642 636
643 xfs_ilock(ip, XFS_IOLOCK_EXCL); 637 xfs_ilock(ip, XFS_IOLOCK_EXCL);
644 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 638 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
645 0, NULL, XFS_ATTR_NOLOCK); 639 0, XFS_ATTR_NOLOCK);
646 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 640 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
647 offset + len > i_size_read(inode)) 641 offset + len > i_size_read(inode))
648 new_size = offset + len; 642 new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
653 647
654 iattr.ia_valid = ATTR_SIZE; 648 iattr.ia_valid = ATTR_SIZE;
655 iattr.ia_size = new_size; 649 iattr.ia_size = new_size;
656 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); 650 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
657 } 651 }
658 652
659 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 653 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
661 return error; 655 return error;
662} 656}
663 657
658#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
659
660/*
661 * Call fiemap helper to fill in user data.
662 * Returns positive errors to xfs_getbmap.
663 */
664STATIC int
665xfs_fiemap_format(
666 void **arg,
667 struct getbmapx *bmv,
668 int *full)
669{
670 int error;
671 struct fiemap_extent_info *fieinfo = *arg;
672 u32 fiemap_flags = 0;
673 u64 logical, physical, length;
674
675 /* Do nothing for a hole */
676 if (bmv->bmv_block == -1LL)
677 return 0;
678
679 logical = BBTOB(bmv->bmv_offset);
680 physical = BBTOB(bmv->bmv_block);
681 length = BBTOB(bmv->bmv_length);
682
683 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
684 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
685 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
686 fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
687 physical = 0; /* no block yet */
688 }
689 if (bmv->bmv_oflags & BMV_OF_LAST)
690 fiemap_flags |= FIEMAP_EXTENT_LAST;
691
692 error = fiemap_fill_next_extent(fieinfo, logical, physical,
693 length, fiemap_flags);
694 if (error > 0) {
695 error = 0;
696 *full = 1; /* user array now full */
697 }
698
699 return -error;
700}
701
702STATIC int
703xfs_vn_fiemap(
704 struct inode *inode,
705 struct fiemap_extent_info *fieinfo,
706 u64 start,
707 u64 length)
708{
709 xfs_inode_t *ip = XFS_I(inode);
710 struct getbmapx bm;
711 int error;
712
713 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
714 if (error)
715 return error;
716
717 /* Set up bmap header for xfs internal routine */
718 bm.bmv_offset = BTOBB(start);
719 /* Special case for whole file */
720 if (length == FIEMAP_MAX_OFFSET)
721 bm.bmv_length = -1LL;
722 else
723 bm.bmv_length = BTOBB(length);
724
725 /* our formatter will tell xfs_getbmap when to stop. */
726 bm.bmv_count = MAXEXTNUM;
727 bm.bmv_iflags = BMV_IF_PREALLOC;
728 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
729 bm.bmv_iflags |= BMV_IF_ATTRFORK;
730 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
731 bm.bmv_iflags |= BMV_IF_DELALLOC;
732
733 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
734 if (error)
735 return -error;
736
737 return 0;
738}
739
664static const struct inode_operations xfs_inode_operations = { 740static const struct inode_operations xfs_inode_operations = {
665 .permission = xfs_vn_permission, 741 .permission = xfs_vn_permission,
666 .truncate = xfs_vn_truncate, 742 .truncate = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
671 .removexattr = generic_removexattr, 747 .removexattr = generic_removexattr,
672 .listxattr = xfs_vn_listxattr, 748 .listxattr = xfs_vn_listxattr,
673 .fallocate = xfs_vn_fallocate, 749 .fallocate = xfs_vn_fallocate,
750 .fiemap = xfs_vn_fiemap,
674}; 751};
675 752
676static const struct inode_operations xfs_dir_inode_operations = { 753static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
766 * When reading existing inodes from disk this is called directly 843 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from 844 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode. 845 * xfs_ialloc after setting up the inode.
846 *
847 * We are always called with an uninitialised linux inode here.
848 * We need to initialise the necessary fields and take a reference
849 * on it.
769 */ 850 */
770void 851void
771xfs_setup_inode( 852xfs_setup_inode(
772 struct xfs_inode *ip) 853 struct xfs_inode *ip)
773{ 854{
774 struct inode *inode = ip->i_vnode; 855 struct inode *inode = &ip->i_vnode;
856
857 inode->i_ino = ip->i_ino;
858 inode->i_state = I_NEW|I_LOCK;
859 inode_add_to_lists(ip->i_mount->m_super, inode);
775 860
776 inode->i_mode = ip->i_d.di_mode; 861 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink; 862 inode->i_nlink = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; 884 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; 885 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip); 886 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803 887
804 switch (inode->i_mode & S_IFMT) { 888 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG: 889 case S_IFREG:
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc2..ef41c92ce66 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
22 22
23extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
24extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
25extern const struct file_operations xfs_invis_file_operations;
26 25
27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
28 27
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index cc0f7b3a979..507492d6dcc 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -21,18 +21,12 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * Some types are conditional depending on the target system.
25 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
26 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
27 * as requiring XFS_BIG_BLKNOS to be set.
28 */ 26 */
29#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
30# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
31# if BITS_PER_LONG == 64 29# define XFS_BIG_INUMS 1
32# define XFS_BIG_INUMS 1
33# else
34# define XFS_BIG_INUMS 0
35# endif
36#else 30#else
37# define XFS_BIG_BLKNOS 0 31# define XFS_BIG_BLKNOS 0
38# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
@@ -77,6 +71,7 @@
77#include <linux/spinlock.h> 71#include <linux/spinlock.h>
78#include <linux/random.h> 72#include <linux/random.h>
79#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/writeback.h>
80 75
81#include <asm/page.h> 76#include <asm/page.h>
82#include <asm/div64.h> 77#include <asm/div64.h>
@@ -85,7 +80,6 @@
85#include <asm/byteorder.h> 80#include <asm/byteorder.h>
86#include <asm/unaligned.h> 81#include <asm/unaligned.h>
87 82
88#include <xfs_vfs.h>
89#include <xfs_cred.h> 83#include <xfs_cred.h>
90#include <xfs_vnode.h> 84#include <xfs_vnode.h>
91#include <xfs_stats.h> 85#include <xfs_stats.h>
@@ -107,7 +101,6 @@
107#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 101#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
108#endif 102#endif
109 103
110#define restricted_chown xfs_params.restrict_chown.val
111#define irix_sgid_inherit xfs_params.sgid_inherit.val 104#define irix_sgid_inherit xfs_params.sgid_inherit.val
112#define irix_symlink_mode xfs_params.symlink_mode.val 105#define irix_symlink_mode xfs_params.symlink_mode.val
113#define xfs_panic_mask xfs_params.panic_mask.val 106#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1957e5357d0..7e90daa0d1d 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,7 +51,6 @@
51#include "xfs_vnodeops.h" 51#include "xfs_vnodeops.h"
52 52
53#include <linux/capability.h> 53#include <linux/capability.h>
54#include <linux/mount.h>
55#include <linux/writeback.h> 54#include <linux/writeback.h>
56 55
57 56
@@ -243,7 +242,7 @@ xfs_read(
243 242
244 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
245 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
248 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
249 if (ret) { 248 if (ret) {
@@ -668,15 +667,8 @@ start:
668 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
669 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
670 669
671 /* 670 if (likely(!(ioflags & IO_INVIS)))
672 * We're not supposed to change timestamps in readonly-mounted
673 * filesystems. Throw it away if anyone asks us.
674 */
675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) {
677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 mnt_drop_write(file->f_path.mnt);
679 }
680 672
681 /* 673 /*
682 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
715 } 707 }
716 } 708 }
717 709
718retry:
719 /* We can write back this queue in page reclaim */ 710 /* We can write back this queue in page reclaim */
720 current->backing_dev_info = mapping->backing_dev_info; 711 current->backing_dev_info = mapping->backing_dev_info;
721 712
@@ -771,6 +762,17 @@ retry:
771 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 762 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
772 ret = wait_on_sync_kiocb(iocb); 763 ret = wait_on_sync_kiocb(iocb);
773 764
765 isize = i_size_read(inode);
766 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
767 *offset = isize;
768
769 if (*offset > xip->i_size) {
770 xfs_ilock(xip, XFS_ILOCK_EXCL);
771 if (*offset > xip->i_size)
772 xip->i_size = *offset;
773 xfs_iunlock(xip, XFS_ILOCK_EXCL);
774 }
775
774 if (ret == -ENOSPC && 776 if (ret == -ENOSPC &&
775 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 777 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
776 xfs_iunlock(xip, iolock); 778 xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
784 xfs_ilock(xip, iolock); 786 xfs_ilock(xip, iolock);
785 if (error) 787 if (error)
786 goto out_unlock_internal; 788 goto out_unlock_internal;
787 pos = xip->i_size; 789 goto start;
788 ret = 0;
789 goto retry;
790 }
791
792 isize = i_size_read(inode);
793 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
794 *offset = isize;
795
796 if (*offset > xip->i_size) {
797 xfs_ilock(xip, XFS_ILOCK_EXCL);
798 if (*offset > xip->i_size)
799 xip->i_size = *offset;
800 xfs_iunlock(xip, XFS_ILOCK_EXCL);
801 } 790 }
802 791
803 error = -ret; 792 error = -ret;
@@ -855,13 +844,7 @@ retry:
855int 844int
856xfs_bdstrat_cb(struct xfs_buf *bp) 845xfs_bdstrat_cb(struct xfs_buf *bp)
857{ 846{
858 xfs_mount_t *mp; 847 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
859
860 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
861 if (!XFS_FORCED_SHUTDOWN(mp)) {
862 xfs_buf_iorequest(bp);
863 return 0;
864 } else {
865 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 848 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
866 /* 849 /*
867 * Metadata write that didn't get logged but 850 * Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
874 else 857 else
875 return (xfs_bioerror(bp)); 858 return (xfs_bioerror(bp));
876 } 859 }
860
861 xfs_buf_iorequest(bp);
862 return 0;
877} 863}
878 864
879/* 865/*
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 3d5b67c075c..c3526d445f6 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
53 { "icluster", XFSSTAT_END_INODE_CLUSTER }, 53 { "icluster", XFSSTAT_END_INODE_CLUSTER },
54 { "vnodes", XFSSTAT_END_VNODE_OPS }, 54 { "vnodes", XFSSTAT_END_VNODE_OPS },
55 { "buf", XFSSTAT_END_BUF }, 55 { "buf", XFSSTAT_END_BUF },
56 { "abtb2", XFSSTAT_END_ABTB_V2 },
57 { "abtc2", XFSSTAT_END_ABTC_V2 },
58 { "bmbt2", XFSSTAT_END_BMBT_V2 },
59 { "ibt2", XFSSTAT_END_IBT_V2 },
56 }; 60 };
57 61
58 /* Loop over all stats groups */ 62 /* Loop over all stats groups */
59 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
60 len += sprintf(buffer + len, xstats[i].desc); 64 len += sprintf(buffer + len, "%s", xstats[i].desc);
61 /* inner loop does each group */ 65 /* inner loop does each group */
62 while (j < xstats[i].endpoint) { 66 while (j < xstats[i].endpoint) {
63 val = 0; 67 val = 0;
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index e83820febc9..736854b1ca1 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -118,6 +118,71 @@ struct xfsstats {
118 __uint32_t xb_page_retries; 118 __uint32_t xb_page_retries;
119 __uint32_t xb_page_found; 119 __uint32_t xb_page_found;
120 __uint32_t xb_get_read; 120 __uint32_t xb_get_read;
121/* Version 2 btree counters */
122#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
123 __uint32_t xs_abtb_2_lookup;
124 __uint32_t xs_abtb_2_compare;
125 __uint32_t xs_abtb_2_insrec;
126 __uint32_t xs_abtb_2_delrec;
127 __uint32_t xs_abtb_2_newroot;
128 __uint32_t xs_abtb_2_killroot;
129 __uint32_t xs_abtb_2_increment;
130 __uint32_t xs_abtb_2_decrement;
131 __uint32_t xs_abtb_2_lshift;
132 __uint32_t xs_abtb_2_rshift;
133 __uint32_t xs_abtb_2_split;
134 __uint32_t xs_abtb_2_join;
135 __uint32_t xs_abtb_2_alloc;
136 __uint32_t xs_abtb_2_free;
137 __uint32_t xs_abtb_2_moves;
138#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
139 __uint32_t xs_abtc_2_lookup;
140 __uint32_t xs_abtc_2_compare;
141 __uint32_t xs_abtc_2_insrec;
142 __uint32_t xs_abtc_2_delrec;
143 __uint32_t xs_abtc_2_newroot;
144 __uint32_t xs_abtc_2_killroot;
145 __uint32_t xs_abtc_2_increment;
146 __uint32_t xs_abtc_2_decrement;
147 __uint32_t xs_abtc_2_lshift;
148 __uint32_t xs_abtc_2_rshift;
149 __uint32_t xs_abtc_2_split;
150 __uint32_t xs_abtc_2_join;
151 __uint32_t xs_abtc_2_alloc;
152 __uint32_t xs_abtc_2_free;
153 __uint32_t xs_abtc_2_moves;
154#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
155 __uint32_t xs_bmbt_2_lookup;
156 __uint32_t xs_bmbt_2_compare;
157 __uint32_t xs_bmbt_2_insrec;
158 __uint32_t xs_bmbt_2_delrec;
159 __uint32_t xs_bmbt_2_newroot;
160 __uint32_t xs_bmbt_2_killroot;
161 __uint32_t xs_bmbt_2_increment;
162 __uint32_t xs_bmbt_2_decrement;
163 __uint32_t xs_bmbt_2_lshift;
164 __uint32_t xs_bmbt_2_rshift;
165 __uint32_t xs_bmbt_2_split;
166 __uint32_t xs_bmbt_2_join;
167 __uint32_t xs_bmbt_2_alloc;
168 __uint32_t xs_bmbt_2_free;
169 __uint32_t xs_bmbt_2_moves;
170#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
171 __uint32_t xs_ibt_2_lookup;
172 __uint32_t xs_ibt_2_compare;
173 __uint32_t xs_ibt_2_insrec;
174 __uint32_t xs_ibt_2_delrec;
175 __uint32_t xs_ibt_2_newroot;
176 __uint32_t xs_ibt_2_killroot;
177 __uint32_t xs_ibt_2_increment;
178 __uint32_t xs_ibt_2_decrement;
179 __uint32_t xs_ibt_2_lshift;
180 __uint32_t xs_ibt_2_rshift;
181 __uint32_t xs_ibt_2_split;
182 __uint32_t xs_ibt_2_join;
183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves;
121/* Extra precision counters */ 186/* Extra precision counters */
122 __uint64_t xs_xstrat_bytes; 187 __uint64_t xs_xstrat_bytes;
123 __uint64_t xs_write_bytes; 188 __uint64_t xs_write_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 37ebe36056e..36f6cc703ef 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_bit.h" 19#include "xfs_bit.h"
20#include "xfs_log.h" 20#include "xfs_log.h"
21#include "xfs_clnt.h"
22#include "xfs_inum.h" 21#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -36,6 +35,7 @@
36#include "xfs_dinode.h" 35#include "xfs_dinode.h"
37#include "xfs_inode.h" 36#include "xfs_inode.h"
38#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
39#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 40#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 41#include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
48#include "xfs_buf_item.h" 48#include "xfs_buf_item.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_vfsops.h"
52#include "xfs_version.h" 51#include "xfs_version.h"
53#include "xfs_log_priv.h" 52#include "xfs_log_priv.h"
54#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
58#include "xfs_extfree_item.h" 57#include "xfs_extfree_item.h"
59#include "xfs_mru_cache.h" 58#include "xfs_mru_cache.h"
60#include "xfs_inode_item.h" 59#include "xfs_inode_item.h"
60#include "xfs_sync.h"
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
@@ -70,36 +70,9 @@
70 70
71static struct quotactl_ops xfs_quotactl_operations; 71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 72static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_vnode_zone;
74static kmem_zone_t *xfs_ioend_zone; 73static kmem_zone_t *xfs_ioend_zone;
75mempool_t *xfs_ioend_pool; 74mempool_t *xfs_ioend_pool;
76 75
77STATIC struct xfs_mount_args *
78xfs_args_allocate(
79 struct super_block *sb,
80 int silent)
81{
82 struct xfs_mount_args *args;
83
84 args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
85 if (!args)
86 return NULL;
87
88 args->logbufs = args->logbufsize = -1;
89 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
90
91 /* Copy the already-parsed mount(2) flags we're interested in */
92 if (sb->s_flags & MS_DIRSYNC)
93 args->flags |= XFSMNT_DIRSYNC;
94 if (sb->s_flags & MS_SYNCHRONOUS)
95 args->flags |= XFSMNT_WSYNC;
96 if (silent)
97 args->flags |= XFSMNT_QUIET;
98 args->flags |= XFSMNT_32BITINODES;
99
100 return args;
101}
102
103#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 76#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
104#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 77#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
105#define MNTOPT_LOGDEV "logdev" /* log device */ 78#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
188 return simple_strtoul((const char *)s, endp, base) << shift_left_factor; 161 return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
189} 162}
190 163
164/*
165 * This function fills in xfs_mount_t fields based on mount args.
166 * Note: the superblock has _not_ yet been read in.
167 *
168 * Note that this function leaks the various device name allocations on
169 * failure. The caller takes care of them.
170 */
191STATIC int 171STATIC int
192xfs_parseargs( 172xfs_parseargs(
193 struct xfs_mount *mp, 173 struct xfs_mount *mp,
194 char *options, 174 char *options,
195 struct xfs_mount_args *args, 175 char **mtpt)
196 int update)
197{ 176{
177 struct super_block *sb = mp->m_super;
198 char *this_char, *value, *eov; 178 char *this_char, *value, *eov;
199 int dsunit, dswidth, vol_dsunit, vol_dswidth; 179 int dsunit = 0;
200 int iosize; 180 int dswidth = 0;
181 int iosize = 0;
201 int dmapi_implies_ikeep = 1; 182 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0;
184
185 /*
186 * Copy binary VFS mount flags we are interested in.
187 */
188 if (sb->s_flags & MS_RDONLY)
189 mp->m_flags |= XFS_MOUNT_RDONLY;
190 if (sb->s_flags & MS_DIRSYNC)
191 mp->m_flags |= XFS_MOUNT_DIRSYNC;
192 if (sb->s_flags & MS_SYNCHRONOUS)
193 mp->m_flags |= XFS_MOUNT_WSYNC;
194
195 /*
196 * Set some default flags that could be cleared by the mount option
197 * parsing.
198 */
199 mp->m_flags |= XFS_MOUNT_BARRIER;
200 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
201 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
202 202
203 args->flags |= XFSMNT_BARRIER; 203 /*
204 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 204 * These can be overridden by the mount option parsing.
205 */
206 mp->m_logbufs = -1;
207 mp->m_logbsize = -1;
205 208
206 if (!options) 209 if (!options)
207 goto done; 210 goto done;
208 211
209 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
210
211 while ((this_char = strsep(&options, ",")) != NULL) { 212 while ((this_char = strsep(&options, ",")) != NULL) {
212 if (!*this_char) 213 if (!*this_char)
213 continue; 214 continue;
@@ -221,7 +222,7 @@ xfs_parseargs(
221 this_char); 222 this_char);
222 return EINVAL; 223 return EINVAL;
223 } 224 }
224 args->logbufs = simple_strtoul(value, &eov, 10); 225 mp->m_logbufs = simple_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 226 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
226 if (!value || !*value) { 227 if (!value || !*value) {
227 cmn_err(CE_WARN, 228 cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
229 this_char); 230 this_char);
230 return EINVAL; 231 return EINVAL;
231 } 232 }
232 args->logbufsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
233 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
234 if (!value || !*value) { 235 if (!value || !*value) {
235 cmn_err(CE_WARN, 236 cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
237 this_char); 238 this_char);
238 return EINVAL; 239 return EINVAL;
239 } 240 }
240 strncpy(args->logname, value, MAXNAMELEN); 241 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
242 if (!mp->m_logname)
243 return ENOMEM;
241 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 244 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
242 if (!value || !*value) { 245 if (!value || !*value) {
243 cmn_err(CE_WARN, 246 cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
245 this_char); 248 this_char);
246 return EINVAL; 249 return EINVAL;
247 } 250 }
248 strncpy(args->mtpt, value, MAXNAMELEN); 251 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
252 if (!*mtpt)
253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 254 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
253 this_char); 258 this_char);
254 return EINVAL; 259 return EINVAL;
255 } 260 }
256 strncpy(args->rtname, value, MAXNAMELEN); 261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname)
263 return ENOMEM;
257 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
258 if (!value || !*value) { 265 if (!value || !*value) {
259 cmn_err(CE_WARN, 266 cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
262 return EINVAL; 269 return EINVAL;
263 } 270 }
264 iosize = simple_strtoul(value, &eov, 10); 271 iosize = simple_strtoul(value, &eov, 10);
265 args->flags |= XFSMNT_IOSIZE; 272 iosizelog = ffs(iosize) - 1;
266 args->iosizelog = (uint8_t) iosize;
267 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 273 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
268 if (!value || !*value) { 274 if (!value || !*value) {
269 cmn_err(CE_WARN, 275 cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
272 return EINVAL; 278 return EINVAL;
273 } 279 }
274 iosize = suffix_strtoul(value, &eov, 10); 280 iosize = suffix_strtoul(value, &eov, 10);
275 args->flags |= XFSMNT_IOSIZE; 281 iosizelog = ffs(iosize) - 1;
276 args->iosizelog = ffs(iosize) - 1;
277 } else if (!strcmp(this_char, MNTOPT_GRPID) || 282 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
278 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 283 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
279 mp->m_flags |= XFS_MOUNT_GRPID; 284 mp->m_flags |= XFS_MOUNT_GRPID;
@@ -281,23 +286,25 @@ xfs_parseargs(
281 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 286 !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
282 mp->m_flags &= ~XFS_MOUNT_GRPID; 287 mp->m_flags &= ~XFS_MOUNT_GRPID;
283 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 288 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
284 args->flags |= XFSMNT_WSYNC; 289 mp->m_flags |= XFS_MOUNT_WSYNC;
285 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { 290 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
286 args->flags |= XFSMNT_OSYNCISOSYNC; 291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
287 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
288 args->flags |= XFSMNT_NORECOVERY; 293 mp->m_flags |= XFS_MOUNT_NORECOVERY;
289 } else if (!strcmp(this_char, MNTOPT_INO64)) { 294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
290 args->flags |= XFSMNT_INO64; 295#if XFS_BIG_INUMS
291#if !XFS_BIG_INUMS 296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
292 cmn_err(CE_WARN, 299 cmn_err(CE_WARN,
293 "XFS: %s option not allowed on this system", 300 "XFS: %s option not allowed on this system",
294 this_char); 301 this_char);
295 return EINVAL; 302 return EINVAL;
296#endif 303#endif
297 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
298 args->flags |= XFSMNT_NOALIGN; 305 mp->m_flags |= XFS_MOUNT_NOALIGN;
299 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
300 args->flags |= XFSMNT_SWALLOC; 307 mp->m_flags |= XFS_MOUNT_SWALLOC;
301 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 308 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
302 if (!value || !*value) { 309 if (!value || !*value) {
303 cmn_err(CE_WARN, 310 cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
315 } 322 }
316 dswidth = simple_strtoul(value, &eov, 10); 323 dswidth = simple_strtoul(value, &eov, 10);
317 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 324 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
318 args->flags &= ~XFSMNT_32BITINODES; 325 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
319#if !XFS_BIG_INUMS 326#if !XFS_BIG_INUMS
320 cmn_err(CE_WARN, 327 cmn_err(CE_WARN,
321 "XFS: %s option not allowed on this system", 328 "XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
323 return EINVAL; 330 return EINVAL;
324#endif 331#endif
325 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 332 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
326 args->flags |= XFSMNT_NOUUID; 333 mp->m_flags |= XFS_MOUNT_NOUUID;
327 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 334 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
328 args->flags |= XFSMNT_BARRIER; 335 mp->m_flags |= XFS_MOUNT_BARRIER;
329 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 336 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
330 args->flags &= ~XFSMNT_BARRIER; 337 mp->m_flags &= ~XFS_MOUNT_BARRIER;
331 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 338 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
332 args->flags |= XFSMNT_IKEEP; 339 mp->m_flags |= XFS_MOUNT_IKEEP;
333 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 340 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
334 dmapi_implies_ikeep = 0; 341 dmapi_implies_ikeep = 0;
335 args->flags &= ~XFSMNT_IKEEP; 342 mp->m_flags &= ~XFS_MOUNT_IKEEP;
336 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 343 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
337 args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; 344 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
338 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 345 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
339 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 346 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
340 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 347 } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
341 args->flags |= XFSMNT_ATTR2; 348 mp->m_flags |= XFS_MOUNT_ATTR2;
342 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 349 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
343 args->flags &= ~XFSMNT_ATTR2; 350 mp->m_flags &= ~XFS_MOUNT_ATTR2;
344 args->flags |= XFSMNT_NOATTR2; 351 mp->m_flags |= XFS_MOUNT_NOATTR2;
345 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 352 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
346 args->flags2 |= XFSMNT2_FILESTREAMS; 353 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
347 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 354 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
348 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); 355 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
349 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); 356 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
357 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
358 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
350 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 359 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
351 !strcmp(this_char, MNTOPT_UQUOTA) || 360 !strcmp(this_char, MNTOPT_UQUOTA) ||
352 !strcmp(this_char, MNTOPT_USRQUOTA)) { 361 !strcmp(this_char, MNTOPT_USRQUOTA)) {
353 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; 362 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
363 XFS_UQUOTA_ENFD);
354 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 364 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
355 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 365 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
356 args->flags |= XFSMNT_UQUOTA; 366 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
357 args->flags &= ~XFSMNT_UQUOTAENF; 367 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
358 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 368 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
359 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 369 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
360 args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; 370 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
371 XFS_OQUOTA_ENFD);
361 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 372 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
362 args->flags |= XFSMNT_PQUOTA; 373 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
363 args->flags &= ~XFSMNT_PQUOTAENF; 374 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 375 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
365 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 376 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
366 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; 377 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
378 XFS_OQUOTA_ENFD);
367 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 379 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
368 args->flags |= XFSMNT_GQUOTA; 380 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
369 args->flags &= ~XFSMNT_GQUOTAENF; 381 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
370 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 382 } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
371 args->flags |= XFSMNT_DMAPI; 383 mp->m_flags |= XFS_MOUNT_DMAPI;
372 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 384 } else if (!strcmp(this_char, MNTOPT_XDSM)) {
373 args->flags |= XFSMNT_DMAPI; 385 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 386 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 args->flags |= XFSMNT_DMAPI; 387 mp->m_flags |= XFS_MOUNT_DMAPI;
376 } else if (!strcmp(this_char, "ihashsize")) { 388 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 389 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 390 "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
390 } 402 }
391 } 403 }
392 404
393 if (args->flags & XFSMNT_NORECOVERY) { 405 /*
394 if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { 406 * no recovery flag requires a read-only mount
395 cmn_err(CE_WARN, 407 */
396 "XFS: no-recovery mounts must be read-only."); 408 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
397 return EINVAL; 409 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
398 } 410 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
411 return EINVAL;
399 } 412 }
400 413
401 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { 414 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
402 cmn_err(CE_WARN, 415 cmn_err(CE_WARN,
403 "XFS: sunit and swidth options incompatible with the noalign option"); 416 "XFS: sunit and swidth options incompatible with the noalign option");
404 return EINVAL; 417 return EINVAL;
405 } 418 }
406 419
407 if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { 420 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
421 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
408 cmn_err(CE_WARN, 422 cmn_err(CE_WARN,
409 "XFS: cannot mount with both project and group quota"); 423 "XFS: cannot mount with both project and group quota");
410 return EINVAL; 424 return EINVAL;
411 } 425 }
412 426
413 if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { 427 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
414 printk("XFS: %s option needs the mount point option as well\n", 428 printk("XFS: %s option needs the mount point option as well\n",
415 MNTOPT_DMAPI); 429 MNTOPT_DMAPI);
416 return EINVAL; 430 return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
438 * Note that if "ikeep" or "noikeep" mount options are 452 * Note that if "ikeep" or "noikeep" mount options are
439 * supplied, then they are honored. 453 * supplied, then they are honored.
440 */ 454 */
441 if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) 455 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
442 args->flags |= XFSMNT_IKEEP; 456 mp->m_flags |= XFS_MOUNT_IKEEP;
443 457
444 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 458done:
459 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
460 /*
461 * At this point the superblock has not been read
462 * in, therefore we do not know the block size.
463 * Before the mount call ends we will convert
464 * these to FSBs.
465 */
445 if (dsunit) { 466 if (dsunit) {
446 args->sunit = dsunit; 467 mp->m_dalign = dsunit;
447 args->flags |= XFSMNT_RETERR; 468 mp->m_flags |= XFS_MOUNT_RETERR;
448 } else {
449 args->sunit = vol_dsunit;
450 } 469 }
451 dswidth ? (args->swidth = dswidth) : 470
452 (args->swidth = vol_dswidth); 471 if (dswidth)
453 } else { 472 mp->m_swidth = dswidth;
454 args->sunit = args->swidth = 0; 473 }
474
475 if (mp->m_logbufs != -1 &&
476 mp->m_logbufs != 0 &&
477 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
478 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
479 cmn_err(CE_WARN,
480 "XFS: invalid logbufs value: %d [not %d-%d]",
481 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
482 return XFS_ERROR(EINVAL);
483 }
484 if (mp->m_logbsize != -1 &&
485 mp->m_logbsize != 0 &&
486 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
487 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
488 !is_power_of_2(mp->m_logbsize))) {
489 cmn_err(CE_WARN,
490 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
491 mp->m_logbsize);
492 return XFS_ERROR(EINVAL);
493 }
494
495 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
496 if (!mp->m_fsname)
497 return ENOMEM;
498 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
499
500 if (iosizelog) {
501 if (iosizelog > XFS_MAX_IO_LOG ||
502 iosizelog < XFS_MIN_IO_LOG) {
503 cmn_err(CE_WARN,
504 "XFS: invalid log iosize: %d [not %d-%d]",
505 iosizelog, XFS_MIN_IO_LOG,
506 XFS_MAX_IO_LOG);
507 return XFS_ERROR(EINVAL);
508 }
509
510 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
511 mp->m_readio_log = iosizelog;
512 mp->m_writeio_log = iosizelog;
455 } 513 }
456 514
457done:
458 if (args->flags & XFSMNT_32BITINODES)
459 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
460 if (args->flags2)
461 args->flags |= XFSMNT_FLAGS2;
462 return 0; 515 return 0;
463} 516}
464 517
@@ -704,8 +757,7 @@ xfs_close_devices(
704 */ 757 */
705STATIC int 758STATIC int
706xfs_open_devices( 759xfs_open_devices(
707 struct xfs_mount *mp, 760 struct xfs_mount *mp)
708 struct xfs_mount_args *args)
709{ 761{
710 struct block_device *ddev = mp->m_super->s_bdev; 762 struct block_device *ddev = mp->m_super->s_bdev;
711 struct block_device *logdev = NULL, *rtdev = NULL; 763 struct block_device *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
714 /* 766 /*
715 * Open real time and log devices - order is important. 767 * Open real time and log devices - order is important.
716 */ 768 */
717 if (args->logname[0]) { 769 if (mp->m_logname) {
718 error = xfs_blkdev_get(mp, args->logname, &logdev); 770 error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
719 if (error) 771 if (error)
720 goto out; 772 goto out;
721 } 773 }
722 774
723 if (args->rtname[0]) { 775 if (mp->m_rtname) {
724 error = xfs_blkdev_get(mp, args->rtname, &rtdev); 776 error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
725 if (error) 777 if (error)
726 goto out_close_logdev; 778 goto out_close_logdev;
727 779
@@ -813,18 +865,18 @@ xfs_setup_devices(
813 */ 865 */
814void 866void
815xfsaild_wakeup( 867xfsaild_wakeup(
816 xfs_mount_t *mp, 868 struct xfs_ail *ailp,
817 xfs_lsn_t threshold_lsn) 869 xfs_lsn_t threshold_lsn)
818{ 870{
819 mp->m_ail.xa_target = threshold_lsn; 871 ailp->xa_target = threshold_lsn;
820 wake_up_process(mp->m_ail.xa_task); 872 wake_up_process(ailp->xa_task);
821} 873}
822 874
823int 875int
824xfsaild( 876xfsaild(
825 void *data) 877 void *data)
826{ 878{
827 xfs_mount_t *mp = (xfs_mount_t *)data; 879 struct xfs_ail *ailp = data;
828 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
829 long tout = 0; 881 long tout = 0;
830 882
@@ -836,11 +888,11 @@ xfsaild(
836 /* swsusp */ 888 /* swsusp */
837 try_to_freeze(); 889 try_to_freeze();
838 890
839 ASSERT(mp->m_log); 891 ASSERT(ailp->xa_mount->m_log);
840 if (XFS_FORCED_SHUTDOWN(mp)) 892 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
841 continue; 893 continue;
842 894
843 tout = xfsaild_push(mp, &last_pushed_lsn); 895 tout = xfsaild_push(ailp, &last_pushed_lsn);
844 } 896 }
845 897
846 return 0; 898 return 0;
@@ -848,43 +900,82 @@ xfsaild(
848 900
849int 901int
850xfsaild_start( 902xfsaild_start(
851 xfs_mount_t *mp) 903 struct xfs_ail *ailp)
852{ 904{
853 mp->m_ail.xa_target = 0; 905 ailp->xa_target = 0;
854 mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); 906 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
855 if (IS_ERR(mp->m_ail.xa_task)) 907 if (IS_ERR(ailp->xa_task))
856 return -PTR_ERR(mp->m_ail.xa_task); 908 return -PTR_ERR(ailp->xa_task);
857 return 0; 909 return 0;
858} 910}
859 911
860void 912void
861xfsaild_stop( 913xfsaild_stop(
862 xfs_mount_t *mp) 914 struct xfs_ail *ailp)
863{ 915{
864 kthread_stop(mp->m_ail.xa_task); 916 kthread_stop(ailp->xa_task);
865} 917}
866 918
867 919
868 920/* Catch misguided souls that try to use this interface on XFS */
869STATIC struct inode * 921STATIC struct inode *
870xfs_fs_alloc_inode( 922xfs_fs_alloc_inode(
871 struct super_block *sb) 923 struct super_block *sb)
872{ 924{
873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); 925 BUG();
926 return NULL;
874} 927}
875 928
929/*
930 * Now that the generic code is guaranteed not to be accessing
931 * the linux inode, we can reclaim the inode.
932 */
876STATIC void 933STATIC void
877xfs_fs_destroy_inode( 934xfs_fs_destroy_inode(
878 struct inode *inode) 935 struct inode *inode)
879{ 936{
880 kmem_zone_free(xfs_vnode_zone, inode); 937 xfs_inode_t *ip = XFS_I(inode);
938
939 XFS_STATS_INC(vn_reclaim);
940 if (xfs_reclaim(ip))
941 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
881} 942}
882 943
944/*
945 * Slab object creation initialisation for the XFS inode.
946 * This covers only the idempotent fields in the XFS inode;
947 * all other fields need to be initialised on allocation
948 * from the slab. This avoids the need to repeatedly intialise
949 * fields in the xfs inode that left in the initialise state
950 * when freeing the inode.
951 */
883STATIC void 952STATIC void
884xfs_fs_inode_init_once( 953xfs_fs_inode_init_once(
885 void *vnode) 954 void *inode)
886{ 955{
887 inode_init_once((struct inode *)vnode); 956 struct xfs_inode *ip = inode;
957
958 memset(ip, 0, sizeof(struct xfs_inode));
959
960 /* vfs inode */
961 inode_init_once(VFS_I(ip));
962
963 /* xfs inode */
964 atomic_set(&ip->i_iocount, 0);
965 atomic_set(&ip->i_pincount, 0);
966 spin_lock_init(&ip->i_flags_lock);
967 init_waitqueue_head(&ip->i_ipin_wait);
968 /*
969 * Because we want to use a counting completion, complete
970 * the flush completion once to allow a single access to
971 * the flush completion without blocking.
972 */
973 init_completion(&ip->i_flush);
974 complete(&ip->i_flush);
975
976 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
977 "xfsino", ip->i_ino);
978 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
888} 979}
889 980
890/* 981/*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
898 struct inode *inode, 989 struct inode *inode,
899 int sync) 990 int sync)
900{ 991{
992 struct xfs_inode *ip = XFS_I(inode);
901 int error = 0; 993 int error = 0;
902 int flags = 0; 994 int flags = 0;
903 995
904 xfs_itrace_entry(XFS_I(inode)); 996 xfs_itrace_entry(ip);
905 if (sync) { 997 if (sync) {
906 filemap_fdatawait(inode->i_mapping); 998 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error)
1000 goto out_error;
907 flags |= FLUSH_SYNC; 1001 flags |= FLUSH_SYNC;
908 } 1002 }
909 error = xfs_inode_flush(XFS_I(inode), flags); 1003 error = xfs_inode_flush(ip, flags);
1004
1005out_error:
910 /* 1006 /*
911 * if we failed to write out the inode then mark 1007 * if we failed to write out the inode then mark
912 * it dirty again so we'll try again later. 1008 * it dirty again so we'll try again later.
913 */ 1009 */
914 if (error) 1010 if (error)
915 mark_inode_dirty_sync(inode); 1011 xfs_mark_inode_dirty_sync(ip);
916 1012
917 return -error; 1013 return -error;
918} 1014}
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
923{ 1019{
924 xfs_inode_t *ip = XFS_I(inode); 1020 xfs_inode_t *ip = XFS_I(inode);
925 1021
926 /* 1022 xfs_itrace_entry(ip);
927 * ip can be null when xfs_iget_core calls xfs_idestroy if we 1023 XFS_STATS_INC(vn_rele);
928 * find an inode with di_mode == 0 but without IGET_CREATE set. 1024 XFS_STATS_INC(vn_remove);
929 */ 1025 XFS_STATS_DEC(vn_active);
930 if (ip) {
931 xfs_itrace_entry(ip);
932 XFS_STATS_INC(vn_rele);
933 XFS_STATS_INC(vn_remove);
934 XFS_STATS_INC(vn_reclaim);
935 XFS_STATS_DEC(vn_active);
936
937 xfs_inactive(ip);
938 xfs_iflags_clear(ip, XFS_IMODIFIED);
939 if (xfs_reclaim(ip))
940 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
941 }
942
943 ASSERT(XFS_I(inode) == NULL);
944}
945 1026
946/* 1027 xfs_inactive(ip);
947 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
948 * Doing this has two advantages:
949 * - It saves on stack space, which is tight in certain situations
950 * - It can be used (with care) as a mechanism to avoid deadlocks.
951 * Flushing while allocating in a full filesystem requires both.
952 */
953STATIC void
954xfs_syncd_queue_work(
955 struct xfs_mount *mp,
956 void *data,
957 void (*syncer)(struct xfs_mount *, void *))
958{
959 struct bhv_vfs_sync_work *work;
960
961 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
962 INIT_LIST_HEAD(&work->w_list);
963 work->w_syncer = syncer;
964 work->w_data = data;
965 work->w_mount = mp;
966 spin_lock(&mp->m_sync_lock);
967 list_add_tail(&work->w_list, &mp->m_sync_list);
968 spin_unlock(&mp->m_sync_lock);
969 wake_up_process(mp->m_sync_task);
970}
971
972/*
973 * Flush delayed allocate data, attempting to free up reserved space
974 * from existing allocations. At this point a new allocation attempt
975 * has failed with ENOSPC and we are in the process of scratching our
976 * heads, looking about for more room...
977 */
978STATIC void
979xfs_flush_inode_work(
980 struct xfs_mount *mp,
981 void *arg)
982{
983 struct inode *inode = arg;
984 filemap_flush(inode->i_mapping);
985 iput(inode);
986}
987
988void
989xfs_flush_inode(
990 xfs_inode_t *ip)
991{
992 struct inode *inode = VFS_I(ip);
993
994 igrab(inode);
995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
996 delay(msecs_to_jiffies(500));
997}
998
999/*
1000 * This is the "bigger hammer" version of xfs_flush_inode_work...
1001 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
1002 */
1003STATIC void
1004xfs_flush_device_work(
1005 struct xfs_mount *mp,
1006 void *arg)
1007{
1008 struct inode *inode = arg;
1009 sync_blockdev(mp->m_super->s_bdev);
1010 iput(inode);
1011}
1012
1013void
1014xfs_flush_device(
1015 xfs_inode_t *ip)
1016{
1017 struct inode *inode = VFS_I(ip);
1018
1019 igrab(inode);
1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
1021 delay(msecs_to_jiffies(500));
1022 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
1023}
1024
1025STATIC void
1026xfs_sync_worker(
1027 struct xfs_mount *mp,
1028 void *unused)
1029{
1030 int error;
1031
1032 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1033 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
1034 mp->m_sync_seq++;
1035 wake_up(&mp->m_wait_single_sync_task);
1036}
1037
1038STATIC int
1039xfssyncd(
1040 void *arg)
1041{
1042 struct xfs_mount *mp = arg;
1043 long timeleft;
1044 bhv_vfs_sync_work_t *work, *n;
1045 LIST_HEAD (tmp);
1046
1047 set_freezable();
1048 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
1049 for (;;) {
1050 timeleft = schedule_timeout_interruptible(timeleft);
1051 /* swsusp */
1052 try_to_freeze();
1053 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
1054 break;
1055
1056 spin_lock(&mp->m_sync_lock);
1057 /*
1058 * We can get woken by laptop mode, to do a sync -
1059 * that's the (only!) case where the list would be
1060 * empty with time remaining.
1061 */
1062 if (!timeleft || list_empty(&mp->m_sync_list)) {
1063 if (!timeleft)
1064 timeleft = xfs_syncd_centisecs *
1065 msecs_to_jiffies(10);
1066 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
1067 list_add_tail(&mp->m_sync_work.w_list,
1068 &mp->m_sync_list);
1069 }
1070 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
1071 list_move(&work->w_list, &tmp);
1072 spin_unlock(&mp->m_sync_lock);
1073
1074 list_for_each_entry_safe(work, n, &tmp, w_list) {
1075 (*work->w_syncer)(mp, work->w_data);
1076 list_del(&work->w_list);
1077 if (work == &mp->m_sync_work)
1078 continue;
1079 kmem_free(work);
1080 }
1081 }
1082
1083 return 0;
1084} 1028}
1085 1029
1086STATIC void 1030STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
1099 struct xfs_mount *mp = XFS_M(sb); 1043 struct xfs_mount *mp = XFS_M(sb);
1100 struct xfs_inode *rip = mp->m_rootip; 1044 struct xfs_inode *rip = mp->m_rootip;
1101 int unmount_event_flags = 0; 1045 int unmount_event_flags = 0;
1102 int error;
1103 1046
1104 kthread_stop(mp->m_sync_task); 1047 xfs_syncd_stop(mp);
1105 1048 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
1106 xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
1107 1049
1108#ifdef HAVE_DMAPI 1050#ifdef HAVE_DMAPI
1109 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1051 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
1128 xfs_filestream_unmount(mp); 1070 xfs_filestream_unmount(mp);
1129 1071
1130 XFS_bflush(mp->m_ddev_targp); 1072 XFS_bflush(mp->m_ddev_targp);
1131 error = xfs_unmount_flush(mp, 0);
1132 WARN_ON(error);
1133
1134 /*
1135 * If we're forcing a shutdown, typically because of a media error,
1136 * we want to make sure we invalidate dirty pages that belong to
1137 * referenced vnodes as well.
1138 */
1139 if (XFS_FORCED_SHUTDOWN(mp)) {
1140 error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
1141 ASSERT(error != EFSCORRUPTED);
1142 }
1143 1073
1144 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1074 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1145 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, 1075 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
1161 struct super_block *sb) 1091 struct super_block *sb)
1162{ 1092{
1163 if (!(sb->s_flags & MS_RDONLY)) 1093 if (!(sb->s_flags & MS_RDONLY))
1164 xfs_sync(XFS_M(sb), SYNC_FSDATA); 1094 xfs_sync_fsdata(XFS_M(sb), 0);
1165 sb->s_dirt = 0; 1095 sb->s_dirt = 0;
1166} 1096}
1167 1097
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
1172{ 1102{
1173 struct xfs_mount *mp = XFS_M(sb); 1103 struct xfs_mount *mp = XFS_M(sb);
1174 int error; 1104 int error;
1175 int flags;
1176 1105
1177 /* 1106 /*
1178 * Treat a sync operation like a freeze. This is to work 1107 * Treat a sync operation like a freeze. This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
1186 * dirty the Linux inode until after the transaction I/O 1115 * dirty the Linux inode until after the transaction I/O
1187 * completes. 1116 * completes.
1188 */ 1117 */
1189 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { 1118 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
1190 /* 1119 error = xfs_quiesce_data(mp);
1191 * First stage of freeze - no more writers will make progress 1120 else
1192 * now we are here, so we flush delwri and delalloc buffers 1121 error = xfs_sync_fsdata(mp, 0);
1193 * here, then wait for all I/O to complete. Data is frozen at
1194 * that point. Metadata is not frozen, transactions can still
1195 * occur here so don't bother flushing the buftarg (i.e
1196 * SYNC_QUIESCE) because it'll just get dirty again.
1197 */
1198 flags = SYNC_DATA_QUIESCE;
1199 } else
1200 flags = SYNC_FSDATA;
1201
1202 error = xfs_sync(mp, flags);
1203 sb->s_dirt = 0; 1122 sb->s_dirt = 0;
1204 1123
1205 if (unlikely(laptop_mode)) { 1124 if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
1337 1256
1338 /* rw -> ro */ 1257 /* rw -> ro */
1339 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1258 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1340 xfs_filestream_flush(mp); 1259 xfs_quiesce_data(mp);
1341 xfs_sync(mp, SYNC_DATA_QUIESCE); 1260 xfs_quiesce_attr(mp);
1342 xfs_attr_quiesce(mp);
1343 mp->m_flags |= XFS_MOUNT_RDONLY; 1261 mp->m_flags |= XFS_MOUNT_RDONLY;
1344 } 1262 }
1345 1263
@@ -1348,7 +1266,7 @@ xfs_fs_remount(
1348 1266
1349/* 1267/*
1350 * Second stage of a freeze. The data is already frozen so we only 1268 * Second stage of a freeze. The data is already frozen so we only
1351 * need to take care of themetadata. Once that's done write a dummy 1269 * need to take care of the metadata. Once that's done write a dummy
1352 * record to dirty the log in case of a crash while frozen. 1270 * record to dirty the log in case of a crash while frozen.
1353 */ 1271 */
1354STATIC void 1272STATIC void
@@ -1357,7 +1275,7 @@ xfs_fs_lockfs(
1357{ 1275{
1358 struct xfs_mount *mp = XFS_M(sb); 1276 struct xfs_mount *mp = XFS_M(sb);
1359 1277
1360 xfs_attr_quiesce(mp); 1278 xfs_quiesce_attr(mp);
1361 xfs_fs_log_dummy(mp); 1279 xfs_fs_log_dummy(mp);
1362} 1280}
1363 1281
@@ -1422,175 +1340,28 @@ xfs_fs_setxquota(
1422 1340
1423/* 1341/*
1424 * This function fills in xfs_mount_t fields based on mount args. 1342 * This function fills in xfs_mount_t fields based on mount args.
1425 * Note: the superblock has _not_ yet been read in.
1426 */
1427STATIC int
1428xfs_start_flags(
1429 struct xfs_mount_args *ap,
1430 struct xfs_mount *mp)
1431{
1432 int error;
1433
1434 /* Values are in BBs */
1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1436 /*
1437 * At this point the superblock has not been read
1438 * in, therefore we do not know the block size.
1439 * Before the mount call ends we will convert
1440 * these to FSBs.
1441 */
1442 mp->m_dalign = ap->sunit;
1443 mp->m_swidth = ap->swidth;
1444 }
1445
1446 if (ap->logbufs != -1 &&
1447 ap->logbufs != 0 &&
1448 (ap->logbufs < XLOG_MIN_ICLOGS ||
1449 ap->logbufs > XLOG_MAX_ICLOGS)) {
1450 cmn_err(CE_WARN,
1451 "XFS: invalid logbufs value: %d [not %d-%d]",
1452 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
1453 return XFS_ERROR(EINVAL);
1454 }
1455 mp->m_logbufs = ap->logbufs;
1456 if (ap->logbufsize != -1 &&
1457 ap->logbufsize != 0 &&
1458 (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
1459 ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
1460 !is_power_of_2(ap->logbufsize))) {
1461 cmn_err(CE_WARN,
1462 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
1463 ap->logbufsize);
1464 return XFS_ERROR(EINVAL);
1465 }
1466
1467 error = ENOMEM;
1468
1469 mp->m_logbsize = ap->logbufsize;
1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1471
1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1476 if (ap->rtname[0]) {
1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1481 }
1482
1483 if (ap->logname[0]) {
1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1487 }
1488
1489 if (ap->flags & XFSMNT_WSYNC)
1490 mp->m_flags |= XFS_MOUNT_WSYNC;
1491#if XFS_BIG_INUMS
1492 if (ap->flags & XFSMNT_INO64) {
1493 mp->m_flags |= XFS_MOUNT_INO64;
1494 mp->m_inoadd = XFS_INO64_OFFSET;
1495 }
1496#endif
1497 if (ap->flags & XFSMNT_RETERR)
1498 mp->m_flags |= XFS_MOUNT_RETERR;
1499 if (ap->flags & XFSMNT_NOALIGN)
1500 mp->m_flags |= XFS_MOUNT_NOALIGN;
1501 if (ap->flags & XFSMNT_SWALLOC)
1502 mp->m_flags |= XFS_MOUNT_SWALLOC;
1503 if (ap->flags & XFSMNT_OSYNCISOSYNC)
1504 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
1505 if (ap->flags & XFSMNT_32BITINODES)
1506 mp->m_flags |= XFS_MOUNT_32BITINODES;
1507
1508 if (ap->flags & XFSMNT_IOSIZE) {
1509 if (ap->iosizelog > XFS_MAX_IO_LOG ||
1510 ap->iosizelog < XFS_MIN_IO_LOG) {
1511 cmn_err(CE_WARN,
1512 "XFS: invalid log iosize: %d [not %d-%d]",
1513 ap->iosizelog, XFS_MIN_IO_LOG,
1514 XFS_MAX_IO_LOG);
1515 return XFS_ERROR(EINVAL);
1516 }
1517
1518 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
1519 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
1520 }
1521
1522 if (ap->flags & XFSMNT_IKEEP)
1523 mp->m_flags |= XFS_MOUNT_IKEEP;
1524 if (ap->flags & XFSMNT_DIRSYNC)
1525 mp->m_flags |= XFS_MOUNT_DIRSYNC;
1526 if (ap->flags & XFSMNT_ATTR2)
1527 mp->m_flags |= XFS_MOUNT_ATTR2;
1528 if (ap->flags & XFSMNT_NOATTR2)
1529 mp->m_flags |= XFS_MOUNT_NOATTR2;
1530
1531 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
1532 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
1533
1534 /*
1535 * no recovery flag requires a read-only mount
1536 */
1537 if (ap->flags & XFSMNT_NORECOVERY) {
1538 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1539 cmn_err(CE_WARN,
1540 "XFS: tried to mount a FS read-write without recovery!");
1541 return XFS_ERROR(EINVAL);
1542 }
1543 mp->m_flags |= XFS_MOUNT_NORECOVERY;
1544 }
1545
1546 if (ap->flags & XFSMNT_NOUUID)
1547 mp->m_flags |= XFS_MOUNT_NOUUID;
1548 if (ap->flags & XFSMNT_BARRIER)
1549 mp->m_flags |= XFS_MOUNT_BARRIER;
1550 else
1551 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1552
1553 if (ap->flags2 & XFSMNT2_FILESTREAMS)
1554 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
1555
1556 if (ap->flags & XFSMNT_DMAPI)
1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1567}
1568
1569/*
1570 * This function fills in xfs_mount_t fields based on mount args.
1571 * Note: the superblock _has_ now been read in. 1343 * Note: the superblock _has_ now been read in.
1572 */ 1344 */
1573STATIC int 1345STATIC int
1574xfs_finish_flags( 1346xfs_finish_flags(
1575 struct xfs_mount_args *ap,
1576 struct xfs_mount *mp) 1347 struct xfs_mount *mp)
1577{ 1348{
1578 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1579 1350
1580 /* Fail a mount where the logbuf is smaller then the log stripe */ 1351 /* Fail a mount where the logbuf is smaller then the log stripe */
1581 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1582 if ((ap->logbufsize <= 0) && 1353 if (mp->m_logbsize <= 0 &&
1583 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { 1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
1584 mp->m_logbsize = mp->m_sb.sb_logsunit; 1355 mp->m_logbsize = mp->m_sb.sb_logsunit;
1585 } else if (ap->logbufsize > 0 && 1356 } else if (mp->m_logbsize > 0 &&
1586 ap->logbufsize < mp->m_sb.sb_logsunit) { 1357 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1587 cmn_err(CE_WARN, 1358 cmn_err(CE_WARN,
1588 "XFS: logbuf size must be greater than or equal to log stripe size"); 1359 "XFS: logbuf size must be greater than or equal to log stripe size");
1589 return XFS_ERROR(EINVAL); 1360 return XFS_ERROR(EINVAL);
1590 } 1361 }
1591 } else { 1362 } else {
1592 /* Fail a mount if the logbuf is larger than 32K */ 1363 /* Fail a mount if the logbuf is larger than 32K */
1593 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { 1364 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1594 cmn_err(CE_WARN, 1365 cmn_err(CE_WARN,
1595 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1366 "XFS: logbuf size for version 1 logs must be 16K or 32K");
1596 return XFS_ERROR(EINVAL); 1367 return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
1602 * told by noattr2 to turn it off 1373 * told by noattr2 to turn it off
1603 */ 1374 */
1604 if (xfs_sb_version_hasattr2(&mp->m_sb) && 1375 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1605 !(ap->flags & XFSMNT_NOATTR2)) 1376 !(mp->m_flags & XFS_MOUNT_NOATTR2))
1606 mp->m_flags |= XFS_MOUNT_ATTR2; 1377 mp->m_flags |= XFS_MOUNT_ATTR2;
1607 1378
1608 /* 1379 /*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
1614 return XFS_ERROR(EROFS); 1385 return XFS_ERROR(EROFS);
1615 } 1386 }
1616 1387
1617 /*
1618 * check for shared mount.
1619 */
1620 if (ap->flags & XFSMNT_SHARED) {
1621 if (!xfs_sb_version_hasshared(&mp->m_sb))
1622 return XFS_ERROR(EINVAL);
1623
1624 /*
1625 * For IRIX 6.5, shared mounts must have the shared
1626 * version bit set, have the persistent readonly
1627 * field set, must be version 0 and can only be mounted
1628 * read-only.
1629 */
1630 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
1631 (mp->m_sb.sb_shared_vn != 0))
1632 return XFS_ERROR(EINVAL);
1633
1634 mp->m_flags |= XFS_MOUNT_SHARED;
1635
1636 /*
1637 * Shared XFS V0 can't deal with DMI. Return EINVAL.
1638 */
1639 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
1640 return XFS_ERROR(EINVAL);
1641 }
1642
1643 if (ap->flags & XFSMNT_UQUOTA) {
1644 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
1645 if (ap->flags & XFSMNT_UQUOTAENF)
1646 mp->m_qflags |= XFS_UQUOTA_ENFD;
1647 }
1648
1649 if (ap->flags & XFSMNT_GQUOTA) {
1650 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
1651 if (ap->flags & XFSMNT_GQUOTAENF)
1652 mp->m_qflags |= XFS_OQUOTA_ENFD;
1653 } else if (ap->flags & XFSMNT_PQUOTA) {
1654 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
1655 if (ap->flags & XFSMNT_PQUOTAENF)
1656 mp->m_qflags |= XFS_OQUOTA_ENFD;
1657 }
1658
1659 return 0; 1388 return 0;
1660} 1389}
1661 1390
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
1667{ 1396{
1668 struct inode *root; 1397 struct inode *root;
1669 struct xfs_mount *mp = NULL; 1398 struct xfs_mount *mp = NULL;
1670 struct xfs_mount_args *args;
1671 int flags = 0, error = ENOMEM; 1399 int flags = 0, error = ENOMEM;
1672 1400 char *mtpt = NULL;
1673 args = xfs_args_allocate(sb, silent);
1674 if (!args)
1675 return -ENOMEM;
1676 1401
1677 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1402 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1678 if (!mp) 1403 if (!mp)
1679 goto out_free_args; 1404 goto out;
1680 1405
1681 spin_lock_init(&mp->m_sb_lock); 1406 spin_lock_init(&mp->m_sb_lock);
1682 mutex_init(&mp->m_ilock);
1683 mutex_init(&mp->m_growlock); 1407 mutex_init(&mp->m_growlock);
1684 atomic_set(&mp->m_active_trans, 0); 1408 atomic_set(&mp->m_active_trans, 0);
1685 INIT_LIST_HEAD(&mp->m_sync_list); 1409 INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
1689 mp->m_super = sb; 1413 mp->m_super = sb;
1690 sb->s_fs_info = mp; 1414 sb->s_fs_info = mp;
1691 1415
1692 if (sb->s_flags & MS_RDONLY) 1416 error = xfs_parseargs(mp, (char *)data, &mtpt);
1693 mp->m_flags |= XFS_MOUNT_RDONLY;
1694
1695 error = xfs_parseargs(mp, (char *)data, args, 0);
1696 if (error) 1417 if (error)
1697 goto out_free_mp; 1418 goto out_free_fsname;
1698 1419
1699 sb_min_blocksize(sb, BBSIZE); 1420 sb_min_blocksize(sb, BBSIZE);
1700 sb->s_xattr = xfs_xattr_handlers; 1421 sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
1702 sb->s_qcop = &xfs_quotactl_operations; 1423 sb->s_qcop = &xfs_quotactl_operations;
1703 sb->s_op = &xfs_super_operations; 1424 sb->s_op = &xfs_super_operations;
1704 1425
1705 error = xfs_dmops_get(mp, args); 1426 error = xfs_dmops_get(mp);
1706 if (error) 1427 if (error)
1707 goto out_free_mp; 1428 goto out_free_fsname;
1708 error = xfs_qmops_get(mp, args); 1429 error = xfs_qmops_get(mp);
1709 if (error) 1430 if (error)
1710 goto out_put_dmops; 1431 goto out_put_dmops;
1711 1432
1712 if (args->flags & XFSMNT_QUIET) 1433 if (silent)
1713 flags |= XFS_MFSI_QUIET; 1434 flags |= XFS_MFSI_QUIET;
1714 1435
1715 error = xfs_open_devices(mp, args); 1436 error = xfs_open_devices(mp);
1716 if (error) 1437 if (error)
1717 goto out_put_qmops; 1438 goto out_put_qmops;
1718 1439
1719 if (xfs_icsb_init_counters(mp)) 1440 if (xfs_icsb_init_counters(mp))
1720 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1441 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
1721 1442
1722 /*
1723 * Setup flags based on mount(2) options and then the superblock
1724 */
1725 error = xfs_start_flags(args, mp);
1726 if (error)
1727 goto out_free_fsname;
1728 error = xfs_readsb(mp, flags); 1443 error = xfs_readsb(mp, flags);
1729 if (error) 1444 if (error)
1730 goto out_free_fsname; 1445 goto out_destroy_counters;
1731 error = xfs_finish_flags(args, mp); 1446
1447 error = xfs_finish_flags(mp);
1732 if (error) 1448 if (error)
1733 goto out_free_sb; 1449 goto out_free_sb;
1734 1450
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
1747 if (error) 1463 if (error)
1748 goto out_filestream_unmount; 1464 goto out_filestream_unmount;
1749 1465
1750 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); 1466 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1751 1467
1752 sb->s_dirt = 1; 1468 sb->s_dirt = 1;
1753 sb->s_magic = XFS_SB_MAGIC; 1469 sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
1772 goto fail_vnrele; 1488 goto fail_vnrele;
1773 } 1489 }
1774 1490
1775 mp->m_sync_work.w_syncer = xfs_sync_worker; 1491 error = xfs_syncd_init(mp);
1776 mp->m_sync_work.w_mount = mp; 1492 if (error)
1777 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
1778 if (IS_ERR(mp->m_sync_task)) {
1779 error = -PTR_ERR(mp->m_sync_task);
1780 goto fail_vnrele; 1493 goto fail_vnrele;
1781 }
1782 1494
1783 xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); 1495 kfree(mtpt);
1784 1496
1785 kfree(args); 1497 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1786 return 0; 1498 return 0;
1787 1499
1788 out_filestream_unmount: 1500 out_filestream_unmount:
1789 xfs_filestream_unmount(mp); 1501 xfs_filestream_unmount(mp);
1790 out_free_sb: 1502 out_free_sb:
1791 xfs_freesb(mp); 1503 xfs_freesb(mp);
1792 out_free_fsname: 1504 out_destroy_counters:
1793 xfs_free_fsname(mp);
1794 xfs_icsb_destroy_counters(mp); 1505 xfs_icsb_destroy_counters(mp);
1795 xfs_close_devices(mp); 1506 xfs_close_devices(mp);
1796 out_put_qmops: 1507 out_put_qmops:
1797 xfs_qmops_put(mp); 1508 xfs_qmops_put(mp);
1798 out_put_dmops: 1509 out_put_dmops:
1799 xfs_dmops_put(mp); 1510 xfs_dmops_put(mp);
1800 out_free_mp: 1511 out_free_fsname:
1512 xfs_free_fsname(mp);
1513 kfree(mtpt);
1801 kfree(mp); 1514 kfree(mp);
1802 out_free_args: 1515 out:
1803 kfree(args);
1804 return -error; 1516 return -error;
1805 1517
1806 fail_vnrele: 1518 fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
1820 xfs_filestream_unmount(mp); 1532 xfs_filestream_unmount(mp);
1821 1533
1822 XFS_bflush(mp->m_ddev_targp); 1534 XFS_bflush(mp->m_ddev_targp);
1823 error = xfs_unmount_flush(mp, 0);
1824 WARN_ON(error);
1825 1535
1826 xfs_unmountfs(mp); 1536 xfs_unmountfs(mp);
1827 goto out_free_sb; 1537 goto out_free_sb;
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
1882 if (!xfs_bmap_trace_buf) 1592 if (!xfs_bmap_trace_buf)
1883 goto out_free_alloc_trace; 1593 goto out_free_alloc_trace;
1884#endif 1594#endif
1885#ifdef XFS_BMBT_TRACE 1595#ifdef XFS_BTREE_TRACE
1596 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1597 KM_MAYFAIL);
1598 if (!xfs_allocbt_trace_buf)
1599 goto out_free_bmap_trace;
1600
1601 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1602 if (!xfs_inobt_trace_buf)
1603 goto out_free_allocbt_trace;
1604
1886 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); 1605 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1887 if (!xfs_bmbt_trace_buf) 1606 if (!xfs_bmbt_trace_buf)
1888 goto out_free_bmap_trace; 1607 goto out_free_inobt_trace;
1889#endif 1608#endif
1890#ifdef XFS_ATTR_TRACE 1609#ifdef XFS_ATTR_TRACE
1891 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); 1610 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
1907 ktrace_free(xfs_attr_trace_buf); 1626 ktrace_free(xfs_attr_trace_buf);
1908 out_free_bmbt_trace: 1627 out_free_bmbt_trace:
1909#endif 1628#endif
1910#ifdef XFS_BMBT_TRACE 1629#ifdef XFS_BTREE_TRACE
1911 ktrace_free(xfs_bmbt_trace_buf); 1630 ktrace_free(xfs_bmbt_trace_buf);
1631 out_free_inobt_trace:
1632 ktrace_free(xfs_inobt_trace_buf);
1633 out_free_allocbt_trace:
1634 ktrace_free(xfs_allocbt_trace_buf);
1912 out_free_bmap_trace: 1635 out_free_bmap_trace:
1913#endif 1636#endif
1914#ifdef XFS_BMAP_TRACE 1637#ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
1931#ifdef XFS_ATTR_TRACE 1654#ifdef XFS_ATTR_TRACE
1932 ktrace_free(xfs_attr_trace_buf); 1655 ktrace_free(xfs_attr_trace_buf);
1933#endif 1656#endif
1934#ifdef XFS_BMBT_TRACE 1657#ifdef XFS_BTREE_TRACE
1935 ktrace_free(xfs_bmbt_trace_buf); 1658 ktrace_free(xfs_bmbt_trace_buf);
1659 ktrace_free(xfs_inobt_trace_buf);
1660 ktrace_free(xfs_allocbt_trace_buf);
1936#endif 1661#endif
1937#ifdef XFS_BMAP_TRACE 1662#ifdef XFS_BMAP_TRACE
1938 ktrace_free(xfs_bmap_trace_buf); 1663 ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
1945STATIC int __init 1670STATIC int __init
1946xfs_init_zones(void) 1671xfs_init_zones(void)
1947{ 1672{
1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
1950 KM_ZONE_SPREAD,
1951 xfs_fs_inode_init_once);
1952 if (!xfs_vnode_zone)
1953 goto out;
1954 1673
1955 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); 1674 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
1956 if (!xfs_ioend_zone) 1675 if (!xfs_ioend_zone)
1957 goto out_destroy_vnode_zone; 1676 goto out;
1958 1677
1959 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, 1678 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
1960 xfs_ioend_zone); 1679 xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
1970 "xfs_bmap_free_item"); 1689 "xfs_bmap_free_item");
1971 if (!xfs_bmap_free_item_zone) 1690 if (!xfs_bmap_free_item_zone)
1972 goto out_destroy_log_ticket_zone; 1691 goto out_destroy_log_ticket_zone;
1692
1973 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), 1693 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
1974 "xfs_btree_cur"); 1694 "xfs_btree_cur");
1975 if (!xfs_btree_cur_zone) 1695 if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
2017 1737
2018 xfs_inode_zone = 1738 xfs_inode_zone =
2019 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", 1739 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
2020 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1740 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
2021 KM_ZONE_SPREAD, NULL); 1741 xfs_fs_inode_init_once);
2022 if (!xfs_inode_zone) 1742 if (!xfs_inode_zone)
2023 goto out_destroy_efi_zone; 1743 goto out_destroy_efi_zone;
2024 1744
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
2066 mempool_destroy(xfs_ioend_pool); 1786 mempool_destroy(xfs_ioend_pool);
2067 out_destroy_ioend_zone: 1787 out_destroy_ioend_zone:
2068 kmem_zone_destroy(xfs_ioend_zone); 1788 kmem_zone_destroy(xfs_ioend_zone);
2069 out_destroy_vnode_zone:
2070 kmem_zone_destroy(xfs_vnode_zone);
2071 out: 1789 out:
2072 return -ENOMEM; 1790 return -ENOMEM;
2073} 1791}
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
2092 kmem_zone_destroy(xfs_log_ticket_zone); 1810 kmem_zone_destroy(xfs_log_ticket_zone);
2093 mempool_destroy(xfs_ioend_pool); 1811 mempool_destroy(xfs_ioend_pool);
2094 kmem_zone_destroy(xfs_ioend_zone); 1812 kmem_zone_destroy(xfs_ioend_zone);
2095 kmem_zone_destroy(xfs_vnode_zone);
2096 1813
2097} 1814}
2098 1815
@@ -2100,13 +1817,12 @@ STATIC int __init
2100init_xfs_fs(void) 1817init_xfs_fs(void)
2101{ 1818{
2102 int error; 1819 int error;
2103 static char message[] __initdata = KERN_INFO \
2104 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
2105 1820
2106 printk(message); 1821 printk(KERN_INFO XFS_VERSION_STRING " with "
1822 XFS_BUILD_OPTIONS " enabled\n");
2107 1823
2108 ktrace_init(64); 1824 ktrace_init(64);
2109 vn_init(); 1825 xfs_ioend_init();
2110 xfs_dir_startup(); 1826 xfs_dir_startup();
2111 1827
2112 error = xfs_init_zones(); 1828 error = xfs_init_zones();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index fe2ef4e6a0f..d5d776d4cd6 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -20,24 +20,12 @@
20 20
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_DMAPI
24# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
25# define vfs_initdmapi() dmapi_init()
26# define vfs_exitdmapi() dmapi_uninit()
27#else
28# define vfs_insertdmapi(vfs) do { } while (0)
29# define vfs_initdmapi() do { } while (0)
30# define vfs_exitdmapi() do { } while (0)
31#endif
32
33#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
34# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
35extern void xfs_qm_init(void); 24extern void xfs_qm_init(void);
36extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
37# define vfs_initquota() xfs_qm_init() 26# define vfs_initquota() xfs_qm_init()
38# define vfs_exitquota() xfs_qm_exit() 27# define vfs_exitquota() xfs_qm_exit()
39#else 28#else
40# define vfs_insertquota(vfs) do { } while (0)
41# define vfs_initquota() do { } while (0) 29# define vfs_initquota() do { } while (0)
42# define vfs_exitquota() do { } while (0) 30# define vfs_exitquota() do { } while (0)
43#endif 31#endif
@@ -101,9 +89,6 @@ struct block_device;
101 89
102extern __uint64_t xfs_max_file_offset(unsigned int); 90extern __uint64_t xfs_max_file_offset(unsigned int);
103 91
104extern void xfs_flush_inode(struct xfs_inode *);
105extern void xfs_flush_device(struct xfs_inode *);
106
107extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 92extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
108 93
109extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 00000000000..2ed035354c2
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,762 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h"
37#include "xfs_dinode.h"
38#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h"
41#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46
47#include <linux/kthread.h>
48#include <linux/freezer.h>
49
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 int fflag = XFS_B_ASYNC;
66
67 if (flags & SYNC_DELWRI)
68 fflag = XFS_B_DELWRI;
69 if (flags & SYNC_WAIT)
70 fflag = 0; /* synchronous overrides all */
71
72 do {
73 struct inode *inode;
74 xfs_inode_t *ip = NULL;
75 int lock_flags = XFS_ILOCK_SHARED;
76
77 /*
78 * use a gang lookup to find the next inode in the tree
79 * as the tree is sparse and a gang lookup walks to find
80 * the number of objects requested.
81 */
82 read_lock(&pag->pag_ici_lock);
83 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
84 (void**)&ip, first_index, 1);
85
86 if (!nr_found) {
87 read_unlock(&pag->pag_ici_lock);
88 break;
89 }
90
91 /*
92 * Update the index for the next lookup. Catch overflows
93 * into the next AG range which can occur if we have inodes
94 * in the last block of the AG and we are currently
95 * pointing to the last inode.
96 */
97 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
98 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
99 read_unlock(&pag->pag_ici_lock);
100 break;
101 }
102
103 /* nothing to sync during shutdown */
104 if (XFS_FORCED_SHUTDOWN(mp)) {
105 read_unlock(&pag->pag_ici_lock);
106 return 0;
107 }
108
109 /*
110 * If we can't get a reference on the inode, it must be
111 * in reclaim. Leave it for the reclaim code to flush.
112 */
113 inode = VFS_I(ip);
114 if (!igrab(inode)) {
115 read_unlock(&pag->pag_ici_lock);
116 continue;
117 }
118 read_unlock(&pag->pag_ici_lock);
119
120 /* avoid new or bad inodes */
121 if (is_bad_inode(inode) ||
122 xfs_iflags_test(ip, XFS_INEW)) {
123 IRELE(ip);
124 continue;
125 }
126
127 /*
128 * If we have to flush data or wait for I/O completion
129 * we need to hold the iolock.
130 */
131 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
132 xfs_ilock(ip, XFS_IOLOCK_SHARED);
133 lock_flags |= XFS_IOLOCK_SHARED;
134 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
135 if (flags & SYNC_IOWAIT)
136 xfs_ioend_wait(ip);
137 }
138 xfs_ilock(ip, XFS_ILOCK_SHARED);
139
140 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
141 if (flags & SYNC_WAIT) {
142 xfs_iflock(ip);
143 if (!xfs_inode_clean(ip))
144 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
145 else
146 xfs_ifunlock(ip);
147 } else if (xfs_iflock_nowait(ip)) {
148 if (!xfs_inode_clean(ip))
149 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
150 else
151 xfs_ifunlock(ip);
152 }
153 }
154 xfs_iput(ip, lock_flags);
155
156 if (error)
157 last_error = error;
158 /*
159 * bail out if the filesystem is corrupted.
160 */
161 if (error == EFSCORRUPTED)
162 return XFS_ERROR(error);
163
164 } while (nr_found);
165
166 return last_error;
167}
168
169int
170xfs_sync_inodes(
171 xfs_mount_t *mp,
172 int flags)
173{
174 int error;
175 int last_error;
176 int i;
177 int lflags = XFS_LOG_FORCE;
178
179 if (mp->m_flags & XFS_MOUNT_RDONLY)
180 return 0;
181 error = 0;
182 last_error = 0;
183
184 if (flags & SYNC_WAIT)
185 lflags |= XFS_LOG_SYNC;
186
187 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
188 if (!mp->m_perag[i].pag_ici_init)
189 continue;
190 error = xfs_sync_inodes_ag(mp, i, flags);
191 if (error)
192 last_error = error;
193 if (error == EFSCORRUPTED)
194 break;
195 }
196 if (flags & SYNC_DELWRI)
197 xfs_log_force(mp, 0, lflags);
198
199 return XFS_ERROR(last_error);
200}
201
202STATIC int
203xfs_commit_dummy_trans(
204 struct xfs_mount *mp,
205 uint log_flags)
206{
207 struct xfs_inode *ip = mp->m_rootip;
208 struct xfs_trans *tp;
209 int error;
210
211 /*
212 * Put a dummy transaction in the log to tell recovery
213 * that all others are OK.
214 */
215 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
216 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
217 if (error) {
218 xfs_trans_cancel(tp, 0);
219 return error;
220 }
221
222 xfs_ilock(ip, XFS_ILOCK_EXCL);
223
224 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
225 xfs_trans_ihold(tp, ip);
226 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
227 /* XXX(hch): ignoring the error here.. */
228 error = xfs_trans_commit(tp, 0);
229
230 xfs_iunlock(ip, XFS_ILOCK_EXCL);
231
232 xfs_log_force(mp, 0, log_flags);
233 return 0;
234}
235
236int
237xfs_sync_fsdata(
238 struct xfs_mount *mp,
239 int flags)
240{
241 struct xfs_buf *bp;
242 struct xfs_buf_log_item *bip;
243 int error = 0;
244
245 /*
246 * If this is xfssyncd() then only sync the superblock if we can
247 * lock it without sleeping and it is not pinned.
248 */
249 if (flags & SYNC_BDFLUSH) {
250 ASSERT(!(flags & SYNC_WAIT));
251
252 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
253 if (!bp)
254 goto out;
255
256 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
257 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
258 goto out_brelse;
259 } else {
260 bp = xfs_getsb(mp, 0);
261
262 /*
263 * If the buffer is pinned then push on the log so we won't
264 * get stuck waiting in the write for someone, maybe
265 * ourselves, to flush the log.
266 *
267 * Even though we just pushed the log above, we did not have
268 * the superblock buffer locked at that point so it can
269 * become pinned in between there and here.
270 */
271 if (XFS_BUF_ISPINNED(bp))
272 xfs_log_force(mp, 0, XFS_LOG_FORCE);
273 }
274
275
276 if (flags & SYNC_WAIT)
277 XFS_BUF_UNASYNC(bp);
278 else
279 XFS_BUF_ASYNC(bp);
280
281 return xfs_bwrite(mp, bp);
282
283 out_brelse:
284 xfs_buf_relse(bp);
285 out:
286 return error;
287}
288
289/*
290 * When remounting a filesystem read-only or freezing the filesystem, we have
291 * two phases to execute. This first phase is syncing the data before we
292 * quiesce the filesystem, and the second is flushing all the inodes out after
293 * we've waited for all the transactions created by the first phase to
294 * complete. The second phase ensures that the inodes are written to their
295 * location on disk rather than just existing in transactions in the log. This
296 * means after a quiesce there is no log replay required to write the inodes to
297 * disk (this is the main difference between a sync and a quiesce).
298 */
299/*
300 * First stage of freeze - no writers will make progress now we are here,
301 * so we flush delwri and delalloc buffers here, then wait for all I/O to
302 * complete. Data is frozen at that point. Metadata is not frozen,
303 * transactions can still occur here so don't bother flushing the buftarg
304 * because it'll just get dirty again.
305 */
306int
307xfs_quiesce_data(
308 struct xfs_mount *mp)
309{
310 int error;
311
312 /* push non-blocking */
313 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
314 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
315 xfs_filestream_flush(mp);
316
317 /* push and block */
318 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
319 XFS_QM_DQSYNC(mp, SYNC_WAIT);
320
321 /* write superblock and hoover up shutdown errors */
322 error = xfs_sync_fsdata(mp, 0);
323
324 /* flush data-only devices */
325 if (mp->m_rtdev_targp)
326 XFS_bflush(mp->m_rtdev_targp);
327
328 return error;
329}
330
331STATIC void
332xfs_quiesce_fs(
333 struct xfs_mount *mp)
334{
335 int count = 0, pincount;
336
337 xfs_flush_buftarg(mp->m_ddev_targp, 0);
338 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
339
340 /*
341 * This loop must run at least twice. The first instance of the loop
342 * will flush most meta data but that will generate more meta data
343 * (typically directory updates). Which then must be flushed and
344 * logged before we can write the unmount record.
345 */
346 do {
347 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
348 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
349 if (!pincount) {
350 delay(50);
351 count++;
352 }
353 } while (count < 2);
354}
355
356/*
357 * Second stage of a quiesce. The data is already synced, now we have to take
358 * care of the metadata. New transactions are already blocked, so we need to
359 * wait for any remaining transactions to drain out before proceding.
360 */
361void
362xfs_quiesce_attr(
363 struct xfs_mount *mp)
364{
365 int error = 0;
366
367 /* wait for all modifications to complete */
368 while (atomic_read(&mp->m_active_trans) > 0)
369 delay(100);
370
371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp);
373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
375
376 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1);
378 if (error)
379 xfs_fs_cmn_err(CE_WARN, mp,
380 "xfs_attr_quiesce: failed to log sb changes. "
381 "Frozen image may not be consistent.");
382 xfs_log_unmount_write(mp);
383 xfs_unmountfs_writesb(mp);
384}
385
386/*
387 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
388 * Doing this has two advantages:
389 * - It saves on stack space, which is tight in certain situations
390 * - It can be used (with care) as a mechanism to avoid deadlocks.
391 * Flushing while allocating in a full filesystem requires both.
392 */
393STATIC void
394xfs_syncd_queue_work(
395 struct xfs_mount *mp,
396 void *data,
397 void (*syncer)(struct xfs_mount *, void *))
398{
399 struct bhv_vfs_sync_work *work;
400
401 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
402 INIT_LIST_HEAD(&work->w_list);
403 work->w_syncer = syncer;
404 work->w_data = data;
405 work->w_mount = mp;
406 spin_lock(&mp->m_sync_lock);
407 list_add_tail(&work->w_list, &mp->m_sync_list);
408 spin_unlock(&mp->m_sync_lock);
409 wake_up_process(mp->m_sync_task);
410}
411
412/*
413 * Flush delayed allocate data, attempting to free up reserved space
414 * from existing allocations. At this point a new allocation attempt
415 * has failed with ENOSPC and we are in the process of scratching our
416 * heads, looking about for more room...
417 */
418STATIC void
419xfs_flush_inode_work(
420 struct xfs_mount *mp,
421 void *arg)
422{
423 struct inode *inode = arg;
424 filemap_flush(inode->i_mapping);
425 iput(inode);
426}
427
428void
429xfs_flush_inode(
430 xfs_inode_t *ip)
431{
432 struct inode *inode = VFS_I(ip);
433
434 igrab(inode);
435 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
436 delay(msecs_to_jiffies(500));
437}
438
439/*
440 * This is the "bigger hammer" version of xfs_flush_inode_work...
441 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
442 */
443STATIC void
444xfs_flush_device_work(
445 struct xfs_mount *mp,
446 void *arg)
447{
448 struct inode *inode = arg;
449 sync_blockdev(mp->m_super->s_bdev);
450 iput(inode);
451}
452
453void
454xfs_flush_device(
455 xfs_inode_t *ip)
456{
457 struct inode *inode = VFS_I(ip);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
461 delay(msecs_to_jiffies(500));
462 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
463}
464
465/*
466 * Every sync period we need to unpin all items, reclaim inodes, sync
467 * quota and write out the superblock. We might need to cover the log
468 * to indicate it is idle.
469 */
470STATIC void
471xfs_sync_worker(
472 struct xfs_mount *mp,
473 void *unused)
474{
475 int error;
476
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
479 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
480 /* dgc: errors ignored here */
481 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
482 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
483 if (xfs_log_need_covered(mp))
484 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
485 }
486 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task);
488}
489
490STATIC int
491xfssyncd(
492 void *arg)
493{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 bhv_vfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 timeleft = schedule_timeout_interruptible(timeleft);
503 /* swsusp */
504 try_to_freeze();
505 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
506 break;
507
508 spin_lock(&mp->m_sync_lock);
509 /*
510 * We can get woken by laptop mode, to do a sync -
511 * that's the (only!) case where the list would be
512 * empty with time remaining.
513 */
514 if (!timeleft || list_empty(&mp->m_sync_list)) {
515 if (!timeleft)
516 timeleft = xfs_syncd_centisecs *
517 msecs_to_jiffies(10);
518 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
519 list_add_tail(&mp->m_sync_work.w_list,
520 &mp->m_sync_list);
521 }
522 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
523 list_move(&work->w_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525
526 list_for_each_entry_safe(work, n, &tmp, w_list) {
527 (*work->w_syncer)(mp, work->w_data);
528 list_del(&work->w_list);
529 if (work == &mp->m_sync_work)
530 continue;
531 kmem_free(work);
532 }
533 }
534
535 return 0;
536}
537
538int
539xfs_syncd_init(
540 struct xfs_mount *mp)
541{
542 mp->m_sync_work.w_syncer = xfs_sync_worker;
543 mp->m_sync_work.w_mount = mp;
544 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
545 if (IS_ERR(mp->m_sync_task))
546 return -PTR_ERR(mp->m_sync_task);
547 return 0;
548}
549
550void
551xfs_syncd_stop(
552 struct xfs_mount *mp)
553{
554 kthread_stop(mp->m_sync_task);
555}
556
557int
558xfs_reclaim_inode(
559 xfs_inode_t *ip,
560 int locked,
561 int sync_mode)
562{
563 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
564
565 /* The hash lock here protects a thread in xfs_iget_core from
566 * racing with us on linking the inode back with a vnode.
567 * Once we have the XFS_IRECLAIM flag set it will not touch
568 * us.
569 */
570 write_lock(&pag->pag_ici_lock);
571 spin_lock(&ip->i_flags_lock);
572 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
573 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
574 spin_unlock(&ip->i_flags_lock);
575 write_unlock(&pag->pag_ici_lock);
576 if (locked) {
577 xfs_ifunlock(ip);
578 xfs_iunlock(ip, XFS_ILOCK_EXCL);
579 }
580 return 1;
581 }
582 __xfs_iflags_set(ip, XFS_IRECLAIM);
583 spin_unlock(&ip->i_flags_lock);
584 write_unlock(&pag->pag_ici_lock);
585 xfs_put_perag(ip->i_mount, pag);
586
587 /*
588 * If the inode is still dirty, then flush it out. If the inode
589 * is not in the AIL, then it will be OK to flush it delwri as
590 * long as xfs_iflush() does not keep any references to the inode.
591 * We leave that decision up to xfs_iflush() since it has the
592 * knowledge of whether it's OK to simply do a delwri flush of
593 * the inode or whether we need to wait until the inode is
594 * pulled from the AIL.
595 * We get the flush lock regardless, though, just to make sure
596 * we don't free it while it is being flushed.
597 */
598 if (!locked) {
599 xfs_ilock(ip, XFS_ILOCK_EXCL);
600 xfs_iflock(ip);
601 }
602
603 /*
604 * In the case of a forced shutdown we rely on xfs_iflush() to
605 * wait for the inode to be unpinned before returning an error.
606 */
607 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
608 /* synchronize with xfs_iflush_done */
609 xfs_iflock(ip);
610 xfs_ifunlock(ip);
611 }
612
613 xfs_iunlock(ip, XFS_ILOCK_EXCL);
614 xfs_ireclaim(ip);
615 return 0;
616}
617
618/*
619 * We set the inode flag atomically with the radix tree tag.
620 * Once we get tag lookups on the radix tree, this inode flag
621 * can go away.
622 */
623void
624xfs_inode_set_reclaim_tag(
625 xfs_inode_t *ip)
626{
627 xfs_mount_t *mp = ip->i_mount;
628 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
629
630 read_lock(&pag->pag_ici_lock);
631 spin_lock(&ip->i_flags_lock);
632 radix_tree_tag_set(&pag->pag_ici_root,
633 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
634 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
635 spin_unlock(&ip->i_flags_lock);
636 read_unlock(&pag->pag_ici_lock);
637 xfs_put_perag(mp, pag);
638}
639
640void
641__xfs_inode_clear_reclaim_tag(
642 xfs_mount_t *mp,
643 xfs_perag_t *pag,
644 xfs_inode_t *ip)
645{
646 radix_tree_tag_clear(&pag->pag_ici_root,
647 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
648}
649
650void
651xfs_inode_clear_reclaim_tag(
652 xfs_inode_t *ip)
653{
654 xfs_mount_t *mp = ip->i_mount;
655 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
656
657 read_lock(&pag->pag_ici_lock);
658 spin_lock(&ip->i_flags_lock);
659 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
660 spin_unlock(&ip->i_flags_lock);
661 read_unlock(&pag->pag_ici_lock);
662 xfs_put_perag(mp, pag);
663}
664
665
666STATIC void
667xfs_reclaim_inodes_ag(
668 xfs_mount_t *mp,
669 int ag,
670 int noblock,
671 int mode)
672{
673 xfs_inode_t *ip = NULL;
674 xfs_perag_t *pag = &mp->m_perag[ag];
675 int nr_found;
676 uint32_t first_index;
677 int skipped;
678
679restart:
680 first_index = 0;
681 skipped = 0;
682 do {
683 /*
684 * use a gang lookup to find the next inode in the tree
685 * as the tree is sparse and a gang lookup walks to find
686 * the number of objects requested.
687 */
688 read_lock(&pag->pag_ici_lock);
689 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
690 (void**)&ip, first_index, 1,
691 XFS_ICI_RECLAIM_TAG);
692
693 if (!nr_found) {
694 read_unlock(&pag->pag_ici_lock);
695 break;
696 }
697
698 /*
699 * Update the index for the next lookup. Catch overflows
700 * into the next AG range which can occur if we have inodes
701 * in the last block of the AG and we are currently
702 * pointing to the last inode.
703 */
704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
705 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
706 read_unlock(&pag->pag_ici_lock);
707 break;
708 }
709
710 /* ignore if already under reclaim */
711 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
712 read_unlock(&pag->pag_ici_lock);
713 continue;
714 }
715
716 if (noblock) {
717 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
718 read_unlock(&pag->pag_ici_lock);
719 continue;
720 }
721 if (xfs_ipincount(ip) ||
722 !xfs_iflock_nowait(ip)) {
723 xfs_iunlock(ip, XFS_ILOCK_EXCL);
724 read_unlock(&pag->pag_ici_lock);
725 continue;
726 }
727 }
728 read_unlock(&pag->pag_ici_lock);
729
730 /*
731 * hmmm - this is an inode already in reclaim. Do
732 * we even bother catching it here?
733 */
734 if (xfs_reclaim_inode(ip, noblock, mode))
735 skipped++;
736 } while (nr_found);
737
738 if (skipped) {
739 delay(1);
740 goto restart;
741 }
742 return;
743
744}
745
746int
747xfs_reclaim_inodes(
748 xfs_mount_t *mp,
749 int noblock,
750 int mode)
751{
752 int i;
753
754 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
755 if (!mp->m_perag[i].pag_ici_init)
756 continue;
757 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
758 }
759 return 0;
760}
761
762
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 00000000000..5f6de1efe1f
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_SYNC_H
19#define XFS_SYNC_H 1
20
21struct xfs_mount;
22
23typedef struct bhv_vfs_sync_work {
24 struct list_head w_list;
25 struct xfs_mount *w_mount;
26 void *w_data; /* syncer routine argument */
27 void (*w_syncer)(struct xfs_mount *, void *);
28} bhv_vfs_sync_work_t;
29
30#define SYNC_ATTR 0x0001 /* sync attributes */
31#define SYNC_DELWRI 0x0002 /* look at delayed writes */
32#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
33#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
34#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
35
36int xfs_syncd_init(struct xfs_mount *mp);
37void xfs_syncd_stop(struct xfs_mount *mp);
38
39int xfs_sync_inodes(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41
42int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp);
44
45void xfs_flush_inode(struct xfs_inode *ip);
46void xfs_flush_device(struct xfs_inode *ip);
47
48int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
49int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
50
51void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip);
55#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7dacb5bbde3..916c0ffb608 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler(
56 56
57static ctl_table xfs_table[] = { 57static ctl_table xfs_table[] = {
58 { 58 {
59 .ctl_name = XFS_RESTRICT_CHOWN,
60 .procname = "restrict_chown",
61 .data = &xfs_params.restrict_chown.val,
62 .maxlen = sizeof(int),
63 .mode = 0644,
64 .proc_handler = &proc_dointvec_minmax,
65 .strategy = &sysctl_intvec,
66 .extra1 = &xfs_params.restrict_chown.min,
67 .extra2 = &xfs_params.restrict_chown.max
68 },
69 {
70 .ctl_name = XFS_SGID_INHERIT, 59 .ctl_name = XFS_SGID_INHERIT,
71 .procname = "irix_sgid_inherit", 60 .procname = "irix_sgid_inherit",
72 .data = &xfs_params.sgid_inherit.val, 61 .data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 4aadb8056c3..b9937d450f8 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
31} xfs_sysctl_val_t; 31} xfs_sysctl_val_t;
32 32
33typedef struct xfs_param { 33typedef struct xfs_param {
34 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
35 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is 34 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
36 * not a member of parent dir GID. */ 35 * not a member of parent dir GID. */
37 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ 36 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
68enum { 67enum {
69 /* XFS_REFCACHE_SIZE = 1 */ 68 /* XFS_REFCACHE_SIZE = 1 */
70 /* XFS_REFCACHE_PURGE = 2 */ 69 /* XFS_REFCACHE_PURGE = 2 */
71 XFS_RESTRICT_CHOWN = 3, 70 /* XFS_RESTRICT_CHOWN = 3 */
72 XFS_SGID_INHERIT = 4, 71 XFS_SGID_INHERIT = 4,
73 XFS_SYMLINK_MODE = 5, 72 XFS_SYMLINK_MODE = 5,
74 XFS_PANIC_MASK = 6, 73 XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 7e60c7776b1..00000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VFS_H__
19#define __XFS_VFS_H__
20
21#include <linux/vfs.h>
22#include "xfs_fs.h"
23
24struct inode;
25
26struct fid;
27struct cred;
28struct seq_file;
29struct super_block;
30struct xfs_inode;
31struct xfs_mount;
32struct xfs_mount_args;
33
34typedef struct kstatfs bhv_statvfs_t;
35
36typedef struct bhv_vfs_sync_work {
37 struct list_head w_list;
38 struct xfs_mount *w_mount;
39 void *w_data; /* syncer routine argument */
40 void (*w_syncer)(struct xfs_mount *, void *);
41} bhv_vfs_sync_work_t;
42
43#define SYNC_ATTR 0x0001 /* sync attributes */
44#define SYNC_CLOSE 0x0002 /* close file system down */
45#define SYNC_DELWRI 0x0004 /* look at delayed writes */
46#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
47#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
48#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
52
53/*
54 * When remounting a filesystem read-only or freezing the filesystem,
55 * we have two phases to execute. This first phase is syncing the data
56 * before we quiesce the fielsystem, and the second is flushing all the
57 * inodes out after we've waited for all the transactions created by
58 * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
59 * to ensure that the inodes are written to their location on disk
60 * rather than just existing in transactions in the log. This means
61 * after a quiesce there is no log replay required to write the inodes
62 * to disk (this is the main difference between a sync and a quiesce).
63 */
64#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
65#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
66
67#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
68#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
69#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
70#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
71#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
72#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
73
74#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
75#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
76
77#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index b52528bbbff..00000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22
23/*
24 * And this gunk is needed for xfs_mount.h"
25 */
26#include "xfs_log.h"
27#include "xfs_trans.h"
28#include "xfs_sb.h"
29#include "xfs_dmapi.h"
30#include "xfs_inum.h"
31#include "xfs_ag.h"
32#include "xfs_mount.h"
33
34
35/*
36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key.
38 */
39#define NVSYNC 37
40#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
41static wait_queue_head_t vsync[NVSYNC];
42
43void __init
44vn_init(void)
45{
46 int i;
47
48 for (i = 0; i < NVSYNC; i++)
49 init_waitqueue_head(&vsync[i]);
50}
51
52void
53vn_iowait(
54 xfs_inode_t *ip)
55{
56 wait_queue_head_t *wq = vptosync(ip);
57
58 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
59}
60
61void
62vn_iowake(
63 xfs_inode_t *ip)
64{
65 if (atomic_dec_and_test(&ip->i_iocount))
66 wake_up(vptosync(ip));
67}
68
69/*
70 * Volume managers supporting multiple paths can send back ENODEV when the
71 * final path disappears. In this case continuing to fill the page cache
72 * with dirty data which cannot be written out is evil, so prevent that.
73 */
74void
75vn_ioerror(
76 xfs_inode_t *ip,
77 int error,
78 char *f,
79 int l)
80{
81 if (unlikely(error == -ENODEV))
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83}
84
85#ifdef XFS_INODE_TRACE
86
87/*
88 * Reference count of Linux inode if present, -1 if the xfs_inode
89 * has no associated Linux inode.
90 */
91static inline int xfs_icount(struct xfs_inode *ip)
92{
93 struct inode *vp = VFS_I(ip);
94
95 if (vp)
96 return vn_count(vp);
97 return -1;
98}
99
100#define KTRACE_ENTER(ip, vk, s, line, ra) \
101 ktrace_enter( (ip)->i_trace, \
102/* 0 */ (void *)(__psint_t)(vk), \
103/* 1 */ (void *)(s), \
104/* 2 */ (void *)(__psint_t) line, \
105/* 3 */ (void *)(__psint_t)xfs_icount(ip), \
106/* 4 */ (void *)(ra), \
107/* 5 */ NULL, \
108/* 6 */ (void *)(__psint_t)current_cpu(), \
109/* 7 */ (void *)(__psint_t)current_pid(), \
110/* 8 */ (void *)__return_address, \
111/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
112
113/*
114 * Vnode tracing code.
115 */
116void
117_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
118{
119 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
120}
121
122void
123_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
124{
125 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
126}
127
128void
129xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
130{
131 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
132}
133
134void
135_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
136{
137 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
138}
139
140void
141xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
142{
143 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
144}
145#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 683ce16210f..f65983a230d 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -18,7 +18,10 @@
18#ifndef __XFS_VNODE_H__ 18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__ 19#define __XFS_VNODE_H__
20 20
21#include "xfs_fs.h"
22
21struct file; 23struct file;
24struct xfs_inode;
22struct xfs_iomap; 25struct xfs_iomap;
23struct attrlist_cursor_kern; 26struct attrlist_cursor_kern;
24 27
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
51 Prevent VM access to the pages until 54 Prevent VM access to the pages until
52 the operation completes. */ 55 the operation completes. */
53 56
54
55extern void vn_init(void);
56
57/*
58 * Yeah, these don't take vnode anymore at all, all this should be
59 * cleaned up at some point.
60 */
61extern void vn_iowait(struct xfs_inode *ip);
62extern void vn_iowake(struct xfs_inode *ip);
63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
64
65static inline int vn_count(struct inode *vp)
66{
67 return atomic_read(&vp->i_count);
68}
69
70#define IHOLD(ip) \
71do { \
72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
76
77#define IRELE(ip) \
78do { \
79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
80 iput(VFS_I(ip)); \
81} while (0)
82
83static inline struct inode *vn_grab(struct inode *vp)
84{
85 return igrab(vp);
86}
87
88/* 57/*
89 * Dealing with bad inodes 58 * Dealing with bad inodes
90 */ 59 */
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
121 PAGECACHE_TAG_DIRTY) 90 PAGECACHE_TAG_DIRTY)
122 91
123 92
124/*
125 * Tracking vnode activity.
126 */
127#if defined(XFS_INODE_TRACE)
128
129#define INODE_TRACE_SIZE 16 /* number of trace entries */
130#define INODE_KTRACE_ENTRY 1
131#define INODE_KTRACE_EXIT 2
132#define INODE_KTRACE_HOLD 3
133#define INODE_KTRACE_REF 4
134#define INODE_KTRACE_RELE 5
135
136extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
137extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
138extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
139extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
140extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
141#define xfs_itrace_entry(ip) \
142 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
143#define xfs_itrace_exit(ip) \
144 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
145#define xfs_itrace_exit_tag(ip, tag) \
146 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
147#define xfs_itrace_ref(ip) \
148 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
149
150#else
151#define xfs_itrace_entry(a)
152#define xfs_itrace_exit(a)
153#define xfs_itrace_exit_tag(a, b)
154#define xfs_itrace_hold(a, b, c, d)
155#define xfs_itrace_ref(a)
156#define xfs_itrace_rele(a, b, c, d)
157#endif
158
159#endif /* __XFS_VNODE_H__ */ 93#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f2705f2fd43..591ca6602bf 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 init_waitqueue_head(&dqp->q_pinwait);
105 105
106 /* 106 /*
107 * Because we want to use a counting completion, complete 107 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
131 dqp->q_res_bcount = 0; 131 dqp->q_res_bcount = 0;
132 dqp->q_res_icount = 0; 132 dqp->q_res_icount = 0;
133 dqp->q_res_rtbcount = 0; 133 dqp->q_res_rtbcount = 0;
134 dqp->q_pincount = 0; 134 atomic_set(&dqp->q_pincount, 0);
135 dqp->q_hash = NULL; 135 dqp->q_hash = NULL;
136 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 136 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
137 137
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
1221 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1222 1222
1223 /* 1223 /*
1224 * If not dirty, nada. 1224 * If not dirty, or it's pinned and we are not supposed to
1225 * block, nada.
1225 */ 1226 */
1226 if (!XFS_DQ_IS_DIRTY(dqp)) { 1227 if (!XFS_DQ_IS_DIRTY(dqp) ||
1228 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
1227 xfs_dqfunlock(dqp); 1229 xfs_dqfunlock(dqp);
1228 return (0); 1230 return 0;
1229 } 1231 }
1230
1231 /*
1232 * Cant flush a pinned dquot. Wait for it.
1233 */
1234 xfs_qm_dqunpin_wait(dqp); 1232 xfs_qm_dqunpin_wait(dqp);
1235 1233
1236 /* 1234 /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
1274 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1272 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1275 mp = dqp->q_mount; 1273 mp = dqp->q_mount;
1276 1274
1277 /* lsn is 64 bits */ 1275 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1278 spin_lock(&mp->m_ail_lock); 1276 &dqp->q_logitem.qli_item.li_lsn);
1279 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1280 spin_unlock(&mp->m_ail_lock);
1281 1277
1282 /* 1278 /*
1283 * Attach an iodone routine so that we can remove this dquot from the 1279 * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
1323 xfs_dq_logitem_t *qip) 1319 xfs_dq_logitem_t *qip)
1324{ 1320{
1325 xfs_dquot_t *dqp; 1321 xfs_dquot_t *dqp;
1322 struct xfs_ail *ailp;
1326 1323
1327 dqp = qip->qli_dquot; 1324 dqp = qip->qli_dquot;
1325 ailp = qip->qli_item.li_ailp;
1328 1326
1329 /* 1327 /*
1330 * We only want to pull the item from the AIL if its 1328 * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
1337 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 1335 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1338 qip->qli_item.li_lsn == qip->qli_flush_lsn) { 1336 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1339 1337
1340 spin_lock(&dqp->q_mount->m_ail_lock); 1338 /* xfs_trans_ail_delete() drops the AIL lock. */
1341 /* 1339 spin_lock(&ailp->xa_lock);
1342 * xfs_trans_delete_ail() drops the AIL lock.
1343 */
1344 if (qip->qli_item.li_lsn == qip->qli_flush_lsn) 1340 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1345 xfs_trans_delete_ail(dqp->q_mount, 1341 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1346 (xfs_log_item_t*)qip);
1347 else 1342 else
1348 spin_unlock(&dqp->q_mount->m_ail_lock); 1343 spin_unlock(&ailp->xa_lock);
1349 } 1344 }
1350 1345
1351 /* 1346 /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
1375 mutex_unlock(&(dqp->q_qlock)); 1370 mutex_unlock(&(dqp->q_qlock));
1376 if (dqp->q_logitem.qli_dquot == dqp) { 1371 if (dqp->q_logitem.qli_dquot == dqp) {
1377 /* Once was dqp->q_mount, but might just have been cleared */ 1372 /* Once was dqp->q_mount, but might just have been cleared */
1378 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, 1373 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
1379 (xfs_log_item_t*)&(dqp->q_logitem)); 1374 (xfs_log_item_t*)&(dqp->q_logitem));
1380 } 1375 }
1381} 1376}
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
1489 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1484 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
1490 xfs_dqflock(dqp); 1485 xfs_dqflock(dqp);
1491 } 1486 }
1492 ASSERT(dqp->q_pincount == 0); 1487 ASSERT(atomic_read(&dqp->q_pincount) == 0);
1493 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1488 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1494 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1489 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1495 1490
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 8958d0faf8d..7e455337e2b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 atomic_t q_pincount; /* dquot pin count */
87 sv_t q_pinwait; /* sync var for pinning */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */ 89 struct ktrace *q_trace; /* trace header structure */
90#endif 90#endif
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f028644caa5..1728f6a7c4f 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
88 88
89/* 89/*
90 * Increment the pin count of the given dquot. 90 * Increment the pin count of the given dquot.
91 * This value is protected by pinlock spinlock in the xQM structure.
92 */ 91 */
93STATIC void 92STATIC void
94xfs_qm_dquot_logitem_pin( 93xfs_qm_dquot_logitem_pin(
95 xfs_dq_logitem_t *logitem) 94 xfs_dq_logitem_t *logitem)
96{ 95{
97 xfs_dquot_t *dqp; 96 xfs_dquot_t *dqp = logitem->qli_dquot;
98 97
99 dqp = logitem->qli_dquot;
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 98 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 99 atomic_inc(&dqp->q_pincount);
102 dqp->q_pincount++;
103 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
104} 100}
105 101
106/* 102/*
107 * Decrement the pin count of the given dquot, and wake up 103 * Decrement the pin count of the given dquot, and wake up
108 * anyone in xfs_dqwait_unpin() if the count goes to 0. The 104 * anyone in xfs_dqwait_unpin() if the count goes to 0. The
109 * dquot must have been previously pinned with a call to xfs_dqpin(). 105 * dquot must have been previously pinned with a call to
106 * xfs_qm_dquot_logitem_pin().
110 */ 107 */
111/* ARGSUSED */ 108/* ARGSUSED */
112STATIC void 109STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
114 xfs_dq_logitem_t *logitem, 111 xfs_dq_logitem_t *logitem,
115 int stale) 112 int stale)
116{ 113{
117 xfs_dquot_t *dqp; 114 xfs_dquot_t *dqp = logitem->qli_dquot;
118 115
119 dqp = logitem->qli_dquot; 116 ASSERT(atomic_read(&dqp->q_pincount) > 0);
120 ASSERT(dqp->q_pincount > 0); 117 if (atomic_dec_and_test(&dqp->q_pincount))
121 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 118 wake_up(&dqp->q_pinwait);
122 dqp->q_pincount--;
123 if (dqp->q_pincount == 0) {
124 sv_broadcast(&dqp->q_pinwait);
125 }
126 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
127} 119}
128 120
129/* ARGSUSED */ 121/* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
193 xfs_dquot_t *dqp) 185 xfs_dquot_t *dqp)
194{ 186{
195 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 187 ASSERT(XFS_DQ_IS_LOCKED(dqp));
196 if (dqp->q_pincount == 0) { 188 if (atomic_read(&dqp->q_pincount) == 0)
197 return; 189 return;
198 }
199 190
200 /* 191 /*
201 * Give the log a push so we don't wait here too long. 192 * Give the log a push so we don't wait here too long.
202 */ 193 */
203 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 194 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
204 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 195 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
205 if (dqp->q_pincount == 0) {
206 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
207 return;
208 }
209 sv_wait(&(dqp->q_pinwait), PINOD,
210 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
211} 196}
212 197
213/* 198/*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
310 uint retval; 295 uint retval;
311 296
312 dqp = qip->qli_dquot; 297 dqp = qip->qli_dquot;
313 if (dqp->q_pincount > 0) 298 if (atomic_read(&dqp->q_pincount) > 0)
314 return (XFS_ITEM_PINNED); 299 return (XFS_ITEM_PINNED);
315 300
316 if (! xfs_qm_dqlock_nowait(dqp)) 301 if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
568 xfs_lsn_t lsn) 553 xfs_lsn_t lsn)
569{ 554{
570 xfs_qoff_logitem_t *qfs; 555 xfs_qoff_logitem_t *qfs;
556 struct xfs_ail *ailp;
571 557
572 qfs = qfe->qql_start_lip; 558 qfs = qfe->qql_start_lip;
573 spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); 559 ailp = qfs->qql_item.li_ailp;
560 spin_lock(&ailp->xa_lock);
574 /* 561 /*
575 * Delete the qoff-start logitem from the AIL. 562 * Delete the qoff-start logitem from the AIL.
576 * xfs_trans_delete_ail() drops the AIL lock. 563 * xfs_trans_ail_delete() drops the AIL lock.
577 */ 564 */
578 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); 565 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
579 kmem_free(qfs); 566 kmem_free(qfs);
580 kmem_free(qfe); 567 kmem_free(qfe);
581 return (xfs_lsn_t)-1; 568 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index df0ffef9775..6b13960cf31 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
396/* 395/*
397 * Called from the vfsops layer. 396 * Called from the vfsops layer.
398 */ 397 */
399int 398void
400xfs_qm_unmount_quotas( 399xfs_qm_unmount_quotas(
401 xfs_mount_t *mp) 400 xfs_mount_t *mp)
402{ 401{
403 xfs_inode_t *uqp, *gqp;
404 int error = 0;
405
406 /* 402 /*
407 * Release the dquots that root inode, et al might be holding, 403 * Release the dquots that root inode, et al might be holding,
408 * before we flush quotas and blow away the quotainfo structure. 404 * before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
415 xfs_qm_dqdetach(mp->m_rsumip); 411 xfs_qm_dqdetach(mp->m_rsumip);
416 412
417 /* 413 /*
418 * Flush out the quota inodes. 414 * Release the quota inodes.
419 */ 415 */
420 uqp = gqp = NULL;
421 if (mp->m_quotainfo) { 416 if (mp->m_quotainfo) {
422 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { 417 if (mp->m_quotainfo->qi_uquotaip) {
423 xfs_ilock(uqp, XFS_ILOCK_EXCL); 418 IRELE(mp->m_quotainfo->qi_uquotaip);
424 xfs_iflock(uqp); 419 mp->m_quotainfo->qi_uquotaip = NULL;
425 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
426 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
427 if (unlikely(error == EFSCORRUPTED)) {
428 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
429 XFS_ERRLEVEL_LOW, mp);
430 goto out;
431 }
432 } 420 }
433 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { 421 if (mp->m_quotainfo->qi_gquotaip) {
434 xfs_ilock(gqp, XFS_ILOCK_EXCL); 422 IRELE(mp->m_quotainfo->qi_gquotaip);
435 xfs_iflock(gqp); 423 mp->m_quotainfo->qi_gquotaip = NULL;
436 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
437 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
438 if (unlikely(error == EFSCORRUPTED)) {
439 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
440 XFS_ERRLEVEL_LOW, mp);
441 goto out;
442 }
443 } 424 }
444 } 425 }
445 if (uqp) {
446 IRELE(uqp);
447 mp->m_quotainfo->qi_uquotaip = NULL;
448 }
449 if (gqp) {
450 IRELE(gqp);
451 mp->m_quotainfo->qi_gquotaip = NULL;
452 }
453out:
454 return XFS_ERROR(error);
455} 426}
456 427
457/* 428/*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
987} 958}
988 959
989/* 960/*
990 * This is called by VFS_SYNC and flags arg determines the caller, 961 * This is called to sync quotas. We can be told to use non-blocking
991 * and its motives, as done in xfs_sync. 962 * semantics by either the SYNC_BDFLUSH flag or the absence of the
992 * 963 * SYNC_WAIT flag.
993 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
994 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
995 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
996 */ 964 */
997
998int 965int
999xfs_qm_sync( 966xfs_qm_sync(
1000 xfs_mount_t *mp, 967 xfs_mount_t *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
1137 return error; 1104 return error;
1138 } 1105 }
1139 1106
1140 spin_lock_init(&qinf->qi_pinlock);
1141 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1142 qinf->qi_dqreclaims = 0; 1108 qinf->qi_dqreclaims = 0;
1143 1109
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
1234 */ 1200 */
1235 xfs_qm_rele_quotafs_ref(mp); 1201 xfs_qm_rele_quotafs_ref(mp);
1236 1202
1237 spinlock_destroy(&qi->qi_pinlock);
1238 xfs_qm_list_destroy(&qi->qi_dqlist); 1203 xfs_qm_list_destroy(&qi->qi_dqlist);
1239 1204
1240 if (qi->qi_uquotaip) { 1205 if (qi->qi_uquotaip) {
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 44f25349e47..ddf09166387 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 106typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 107 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 108 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 spinlock_t qi_pinlock; /* dquot pinning lock */
110 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
111 int qi_dqreclaims; /* a change here indicates 110 int qi_dqreclaims; /* a change here indicates
112 a removal in the dqlist */ 111 a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *); 167extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 168extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 169extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 170extern void xfs_qm_unmount_quotas(xfs_mount_t *);
172extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 171extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
173extern int xfs_qm_sync(xfs_mount_t *, int); 172extern int xfs_qm_sync(xfs_mount_t *, int);
174 173
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index eea2e60b456..bc6c5cca3e1 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -51,7 +50,7 @@
51 50
52STATIC void 51STATIC void
53xfs_fill_statvfs_from_dquot( 52xfs_fill_statvfs_from_dquot(
54 bhv_statvfs_t *statp, 53 struct kstatfs *statp,
55 xfs_disk_dquot_t *dp) 54 xfs_disk_dquot_t *dp)
56{ 55{
57 __uint64_t limit; 56 __uint64_t limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
88STATIC void 87STATIC void
89xfs_qm_statvfs( 88xfs_qm_statvfs(
90 xfs_inode_t *ip, 89 xfs_inode_t *ip,
91 bhv_statvfs_t *statp) 90 struct kstatfs *statp)
92{ 91{
93 xfs_mount_t *mp = ip->i_mount; 92 xfs_mount_t *mp = ip->i_mount;
94 xfs_dquot_t *dqp; 93 xfs_dquot_t *dqp;
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 1a3b803dfa5..68139b38aed 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
127 break; 127 break;
128 128
129 case Q_XQUOTASYNC: 129 case Q_XQUOTASYNC:
130 return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); 130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131 131
132 default: 132 default:
133 break; 133 break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
1022 1022
1023 1023
1024/* 1024/*
1025 * Go thru all the inodes in the file system, releasing their dquots. 1025 * Release all the dquots on the inodes in an AG.
1026 * Note that the mount structure gets modified to indicate that quotas are off
1027 * AFTER this, in the case of quotaoff. This also gets called from
1028 * xfs_rootumount.
1029 */ 1026 */
1030void 1027STATIC void
1031xfs_qm_dqrele_all_inodes( 1028xfs_qm_dqrele_inodes_ag(
1032 struct xfs_mount *mp, 1029 xfs_mount_t *mp,
1033 uint flags) 1030 int ag,
1031 uint flags)
1034{ 1032{
1035 xfs_inode_t *ip, *topino; 1033 xfs_inode_t *ip = NULL;
1036 uint ireclaims; 1034 xfs_perag_t *pag = &mp->m_perag[ag];
1037 struct inode *vp; 1035 int first_index = 0;
1038 boolean_t vnode_refd; 1036 int nr_found;
1039 1037
1040 ASSERT(mp->m_quotainfo);
1041
1042 XFS_MOUNT_ILOCK(mp);
1043again:
1044 ip = mp->m_inodes;
1045 if (ip == NULL) {
1046 XFS_MOUNT_IUNLOCK(mp);
1047 return;
1048 }
1049 do { 1038 do {
1050 /* Skip markers inserted by xfs_sync */ 1039 /*
1051 if (ip->i_mount == NULL) { 1040 * use a gang lookup to find the next inode in the tree
1052 ip = ip->i_mnext; 1041 * as the tree is sparse and a gang lookup walks to find
1053 continue; 1042 * the number of objects requested.
1043 */
1044 read_lock(&pag->pag_ici_lock);
1045 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1046 (void**)&ip, first_index, 1);
1047
1048 if (!nr_found) {
1049 read_unlock(&pag->pag_ici_lock);
1050 break;
1054 } 1051 }
1055 /* Root inode, rbmip and rsumip have associated blocks */ 1052
1053 /*
1054 * Update the index for the next lookup. Catch overflows
1055 * into the next AG range which can occur if we have inodes
1056 * in the last block of the AG and we are currently
1057 * pointing to the last inode.
1058 */
1059 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1060 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
1061 read_unlock(&pag->pag_ici_lock);
1062 break;
1063 }
1064
1065 /* skip quota inodes */
1056 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { 1066 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1057 ASSERT(ip->i_udquot == NULL); 1067 ASSERT(ip->i_udquot == NULL);
1058 ASSERT(ip->i_gdquot == NULL); 1068 ASSERT(ip->i_gdquot == NULL);
1059 ip = ip->i_mnext; 1069 read_unlock(&pag->pag_ici_lock);
1060 continue; 1070 continue;
1061 } 1071 }
1062 vp = VFS_I(ip); 1072
1063 if (!vp) { 1073 /*
1064 ASSERT(ip->i_udquot == NULL); 1074 * If we can't get a reference on the inode, it must be
1065 ASSERT(ip->i_gdquot == NULL); 1075 * in reclaim. Leave it for the reclaim code to flush.
1066 ip = ip->i_mnext; 1076 */
1077 if (!igrab(VFS_I(ip))) {
1078 read_unlock(&pag->pag_ici_lock);
1067 continue; 1079 continue;
1068 } 1080 }
1069 vnode_refd = B_FALSE; 1081 read_unlock(&pag->pag_ici_lock);
1070 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { 1082
1071 ireclaims = mp->m_ireclaims; 1083 /* avoid new inodes though we shouldn't find any here */
1072 topino = mp->m_inodes; 1084 if (xfs_iflags_test(ip, XFS_INEW)) {
1073 vp = vn_grab(vp); 1085 IRELE(ip);
1074 if (!vp) 1086 continue;
1075 goto again;
1076
1077 XFS_MOUNT_IUNLOCK(mp);
1078 /* XXX restart limit ? */
1079 xfs_ilock(ip, XFS_ILOCK_EXCL);
1080 vnode_refd = B_TRUE;
1081 } else {
1082 ireclaims = mp->m_ireclaims;
1083 topino = mp->m_inodes;
1084 XFS_MOUNT_IUNLOCK(mp);
1085 } 1087 }
1086 1088
1087 /* 1089 xfs_ilock(ip, XFS_ILOCK_EXCL);
1088 * We don't keep the mountlock across the dqrele() call,
1089 * since it can take a while..
1090 */
1091 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 1090 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1092 xfs_qm_dqrele(ip->i_udquot); 1091 xfs_qm_dqrele(ip->i_udquot);
1093 ip->i_udquot = NULL; 1092 ip->i_udquot = NULL;
1094 } 1093 }
1095 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 1094 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
1095 ip->i_gdquot) {
1096 xfs_qm_dqrele(ip->i_gdquot); 1096 xfs_qm_dqrele(ip->i_gdquot);
1097 ip->i_gdquot = NULL; 1097 ip->i_gdquot = NULL;
1098 } 1098 }
1099 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1099 xfs_iput(ip, XFS_ILOCK_EXCL);
1100 /* 1100
1101 * Wait until we've dropped the ilock and mountlock to 1101 } while (nr_found);
1102 * do the vn_rele. Or be condemned to an eternity in the 1102}
1103 * inactive code in hell. 1103
1104 */ 1104/*
1105 if (vnode_refd) 1105 * Go thru all the inodes in the file system, releasing their dquots.
1106 IRELE(ip); 1106 * Note that the mount structure gets modified to indicate that quotas are off
1107 XFS_MOUNT_ILOCK(mp); 1107 * AFTER this, in the case of quotaoff. This also gets called from
1108 /* 1108 * xfs_rootumount.
1109 * If an inode was inserted or removed, we gotta 1109 */
1110 * start over again. 1110void
1111 */ 1111xfs_qm_dqrele_all_inodes(
1112 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { 1112 struct xfs_mount *mp,
1113 /* XXX use a sentinel */ 1113 uint flags)
1114 goto again; 1114{
1115 } 1115 int i;
1116 ip = ip->i_mnext;
1117 } while (ip != mp->m_inodes);
1118 1116
1119 XFS_MOUNT_IUNLOCK(mp); 1117 ASSERT(mp->m_quotainfo);
1118 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
1119 if (!mp->m_perag[i].pag_ici_init)
1120 continue;
1121 xfs_qm_dqrele_inodes_ag(mp, i, flags);
1122 }
1120} 1123}
1121 1124
1122/*------------------------------------------------------------------------*/ 1125/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index c27abef7b84..ae548296542 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,6 +18,13 @@
18#include <xfs.h> 18#include <xfs.h>
19#include "debug.h" 19#include "debug.h"
20 20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h"
27
21static char message[1024]; /* keep it off the stack */ 28static char message[1024]; /* keep it off the stack */
22static DEFINE_SPINLOCK(xfs_err_lock); 29static DEFINE_SPINLOCK(xfs_err_lock);
23 30
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
55} 62}
56 63
57void 64void
58icmn_err(register int level, char *fmt, va_list ap) 65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
59{ 70{
60 ulong flags; 71 unsigned long flags;
61 int len; 72 int len = 0;
62 73
63 level &= XFS_ERR_MASK; 74 level &= XFS_ERR_MASK;
64 if(level > XFS_MAX_ERR_LEVEL) 75 if (level > XFS_MAX_ERR_LEVEL)
65 level = XFS_MAX_ERR_LEVEL; 76 level = XFS_MAX_ERR_LEVEL;
77
66 spin_lock_irqsave(&xfs_err_lock,flags); 78 spin_lock_irqsave(&xfs_err_lock,flags);
67 len = vsnprintf(message, sizeof(message), fmt, ap); 79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
68 if (len >= sizeof(message)) 92 if (len >= sizeof(message))
69 len = sizeof(message) - 1; 93 len = sizeof(message) - 1;
70 if (message[len-1] == '\n') 94 if (message[len-1] == '\n')
71 message[len-1] = 0; 95 message[len-1] = 0;
96
72 printk("%s%s\n", err_level[level], message); 97 printk("%s%s\n", err_level[level], message);
98 out:
73 spin_unlock_irqrestore(&xfs_err_lock,flags); 99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
74 BUG_ON(level == CE_PANIC); 101 BUG_ON(level == CE_PANIC);
75} 102}
76 103
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
84void 111void
85xfs_hex_dump(void *p, int length) 112xfs_hex_dump(void *p, int length)
86{ 113{
87 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); 114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
88} 115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 75845f95081..6f4fd37c67a 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -27,8 +27,6 @@
27#define CE_ALERT 1 /* alert */ 27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */ 28#define CE_PANIC 0 /* panic */
29 29
30extern void icmn_err(int, char *, va_list)
31 __attribute__ ((format (printf, 2, 0)));
32extern void cmn_err(int, char *, ...) 30extern void cmn_err(int, char *, ...)
33 __attribute__ ((format (printf, 2, 3))); 31 __attribute__ ((format (printf, 2, 3)));
34extern void assfail(char *expr, char *f, int l); 32extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index a34ef05489b..2d494c26717 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
113void 113void
114ktrace_free(ktrace_t *ktp) 114ktrace_free(ktrace_t *ktp)
115{ 115{
116 int entries_size;
117
118 if (ktp == (ktrace_t *)NULL) 116 if (ktp == (ktrace_t *)NULL)
119 return; 117 return;
120 118
121 /* 119 /*
122 * Special treatment for the Vnode trace buffer. 120 * Special treatment for the Vnode trace buffer.
123 */ 121 */
124 if (ktp->kt_nentries == ktrace_zentries) { 122 if (ktp->kt_nentries == ktrace_zentries)
125 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); 123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
126 } else { 124 else
127 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
128
129 kmem_free(ktp->kt_entries); 125 kmem_free(ktp->kt_entries);
130 }
131 126
132 kmem_zone_free(ktrace_hdr_zone, ktp); 127 kmem_zone_free(ktrace_hdr_zone, ktp);
133} 128}
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c98982..17254b529c5 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -30,7 +30,7 @@
30#define XFS_ATTR_TRACE 1 30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1 31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1 32#define XFS_BMAP_TRACE 1
33#define XFS_BMBT_TRACE 1 33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1 34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1 35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1 36#define XFS_ILOCK_TRACE 1
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index b2f639a1416..a8cdd73999a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -366,7 +366,7 @@ xfs_acl_allow_set(
366 return ENOTDIR; 366 return ENOTDIR;
367 if (vp->i_sb->s_flags & MS_RDONLY) 367 if (vp->i_sb->s_flags & MS_RDONLY)
368 return EROFS; 368 return EROFS;
369 if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) 369 if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
370 return EPERM; 370 return EPERM;
371 return 0; 371 return 0;
372} 372}
@@ -413,13 +413,13 @@ xfs_acl_access(
413 switch (fap->acl_entry[i].ae_tag) { 413 switch (fap->acl_entry[i].ae_tag) {
414 case ACL_USER_OBJ: 414 case ACL_USER_OBJ:
415 seen_userobj = 1; 415 seen_userobj = 1;
416 if (fuid != current->fsuid) 416 if (fuid != current_fsuid())
417 continue; 417 continue;
418 matched.ae_tag = ACL_USER_OBJ; 418 matched.ae_tag = ACL_USER_OBJ;
419 matched.ae_perm = allows; 419 matched.ae_perm = allows;
420 break; 420 break;
421 case ACL_USER: 421 case ACL_USER:
422 if (fap->acl_entry[i].ae_id != current->fsuid) 422 if (fap->acl_entry[i].ae_id != current_fsuid())
423 continue; 423 continue;
424 matched.ae_tag = ACL_USER; 424 matched.ae_tag = ACL_USER;
425 matched.ae_perm = allows; 425 matched.ae_perm = allows;
@@ -758,7 +758,7 @@ xfs_acl_setmode(
758 if (gap && nomask) 758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
760 760
761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762} 762}
763 763
764/* 764/*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 61b292a9fb4..f2e21817a22 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
93 93
94extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
95 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
94 96
95/* 97/*
96 * Size of the unlinked inode hash table in the agi. 98 * Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
142#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 144#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
143#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 145#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
144 146
147extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
148 xfs_agnumber_t agno, struct xfs_buf **bpp);
149
145/* 150/*
146 * The third a.g. block contains the a.g. freelist, an array 151 * The third a.g. block contains the a.g. freelist, an array
147 * of block pointers to blocks owned by the allocation btree code. 152 * of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
192 xfs_agino_t pagi_freecount; /* number of free inodes */ 197 xfs_agino_t pagi_freecount; /* number of free inodes */
193 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
194 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
195#ifdef __KERNEL__ 201#ifdef __KERNEL__
196 spinlock_t pagb_lock; /* lock for pagb_list */ 202 spinlock_t pagb_lock; /* lock for pagb_list */
197#endif 203
198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 204 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
200 205
201 int pag_ici_init; /* incore inode cache initialised */ 206 int pag_ici_init; /* incore inode cache initialised */
202 rwlock_t pag_ici_lock; /* incore inode lock */ 207 rwlock_t pag_ici_lock; /* incore inode lock */
203 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 208 struct radix_tree_root pag_ici_root; /* incore inode cache root */
209#endif
204} xfs_perag_t; 210} xfs_perag_t;
205 211
212/*
213 * tags for inode radix tree
214 */
215#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
216
206#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 217#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
207#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 218#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
208 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) 219 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1956f83489f..028e44e58ea 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
90 */ 90 */
91 91
92/* 92/*
93 * Lookup the record equal to [bno, len] in the btree given by cur.
94 */
95STATIC int /* error */
96xfs_alloc_lookup_eq(
97 struct xfs_btree_cur *cur, /* btree cursor */
98 xfs_agblock_t bno, /* starting block of extent */
99 xfs_extlen_t len, /* length of extent */
100 int *stat) /* success/failure */
101{
102 cur->bc_rec.a.ar_startblock = bno;
103 cur->bc_rec.a.ar_blockcount = len;
104 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
105}
106
107/*
108 * Lookup the first record greater than or equal to [bno, len]
109 * in the btree given by cur.
110 */
111STATIC int /* error */
112xfs_alloc_lookup_ge(
113 struct xfs_btree_cur *cur, /* btree cursor */
114 xfs_agblock_t bno, /* starting block of extent */
115 xfs_extlen_t len, /* length of extent */
116 int *stat) /* success/failure */
117{
118 cur->bc_rec.a.ar_startblock = bno;
119 cur->bc_rec.a.ar_blockcount = len;
120 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
121}
122
123/*
124 * Lookup the first record less than or equal to [bno, len]
125 * in the btree given by cur.
126 */
127STATIC int /* error */
128xfs_alloc_lookup_le(
129 struct xfs_btree_cur *cur, /* btree cursor */
130 xfs_agblock_t bno, /* starting block of extent */
131 xfs_extlen_t len, /* length of extent */
132 int *stat) /* success/failure */
133{
134 cur->bc_rec.a.ar_startblock = bno;
135 cur->bc_rec.a.ar_blockcount = len;
136 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
137}
138
139/*
140 * Update the record referred to by cur to the value given
141 * by [bno, len].
142 * This either works (return 0) or gets an EFSCORRUPTED error.
143 */
144STATIC int /* error */
145xfs_alloc_update(
146 struct xfs_btree_cur *cur, /* btree cursor */
147 xfs_agblock_t bno, /* starting block of extent */
148 xfs_extlen_t len) /* length of extent */
149{
150 union xfs_btree_rec rec;
151
152 rec.alloc.ar_startblock = cpu_to_be32(bno);
153 rec.alloc.ar_blockcount = cpu_to_be32(len);
154 return xfs_btree_update(cur, &rec);
155}
156
157/*
158 * Get the data from the pointed-to record.
159 */
160STATIC int /* error */
161xfs_alloc_get_rec(
162 struct xfs_btree_cur *cur, /* btree cursor */
163 xfs_agblock_t *bno, /* output: starting block of extent */
164 xfs_extlen_t *len, /* output: length of extent */
165 int *stat) /* output: success/failure */
166{
167 union xfs_btree_rec *rec;
168 int error;
169
170 error = xfs_btree_get_rec(cur, &rec, stat);
171 if (!error && *stat == 1) {
172 *bno = be32_to_cpu(rec->alloc.ar_startblock);
173 *len = be32_to_cpu(rec->alloc.ar_blockcount);
174 }
175 return error;
176}
177
178/*
93 * Compute aligned version of the found extent. 179 * Compute aligned version of the found extent.
94 * Takes alignment and min length into account. 180 * Takes alignment and min length into account.
95 */ 181 */
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
294 return error; 380 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1); 381 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 } 382 }
383
297#ifdef DEBUG 384#ifdef DEBUG
298 { 385 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
299 xfs_alloc_block_t *bnoblock; 386 struct xfs_btree_block *bnoblock;
300 xfs_alloc_block_t *cntblock; 387 struct xfs_btree_block *cntblock;
301 388
302 if (bno_cur->bc_nlevels == 1 && 389 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
303 cnt_cur->bc_nlevels == 1) { 390 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
304 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); 391
305 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); 392 XFS_WANT_CORRUPTED_RETURN(
306 XFS_WANT_CORRUPTED_RETURN( 393 bnoblock->bb_numrecs == cntblock->bb_numrecs);
307 be16_to_cpu(bnoblock->bb_numrecs) ==
308 be16_to_cpu(cntblock->bb_numrecs));
309 }
310 } 394 }
311#endif 395#endif
396
312 /* 397 /*
313 * Deal with all four cases: the allocated record is contained 398 * Deal with all four cases: the allocated record is contained
314 * within the freespace record, so we can have new freespace 399 * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
333 /* 418 /*
334 * Delete the entry from the by-size btree. 419 * Delete the entry from the by-size btree.
335 */ 420 */
336 if ((error = xfs_alloc_delete(cnt_cur, &i))) 421 if ((error = xfs_btree_delete(cnt_cur, &i)))
337 return error; 422 return error;
338 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(i == 1);
339 /* 424 /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
343 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 428 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
344 return error; 429 return error;
345 XFS_WANT_CORRUPTED_RETURN(i == 0); 430 XFS_WANT_CORRUPTED_RETURN(i == 0);
346 if ((error = xfs_alloc_insert(cnt_cur, &i))) 431 if ((error = xfs_btree_insert(cnt_cur, &i)))
347 return error; 432 return error;
348 XFS_WANT_CORRUPTED_RETURN(i == 1); 433 XFS_WANT_CORRUPTED_RETURN(i == 1);
349 } 434 }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
351 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 436 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
352 return error; 437 return error;
353 XFS_WANT_CORRUPTED_RETURN(i == 0); 438 XFS_WANT_CORRUPTED_RETURN(i == 0);
354 if ((error = xfs_alloc_insert(cnt_cur, &i))) 439 if ((error = xfs_btree_insert(cnt_cur, &i)))
355 return error; 440 return error;
356 XFS_WANT_CORRUPTED_RETURN(i == 1); 441 XFS_WANT_CORRUPTED_RETURN(i == 1);
357 } 442 }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
362 /* 447 /*
363 * No remaining freespace, just delete the by-block tree entry. 448 * No remaining freespace, just delete the by-block tree entry.
364 */ 449 */
365 if ((error = xfs_alloc_delete(bno_cur, &i))) 450 if ((error = xfs_btree_delete(bno_cur, &i)))
366 return error; 451 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 1); 452 XFS_WANT_CORRUPTED_RETURN(i == 1);
368 } else { 453 } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
379 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 464 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
380 return error; 465 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 0); 466 XFS_WANT_CORRUPTED_RETURN(i == 0);
382 if ((error = xfs_alloc_insert(bno_cur, &i))) 467 if ((error = xfs_btree_insert(bno_cur, &i)))
383 return error; 468 return error;
384 XFS_WANT_CORRUPTED_RETURN(i == 1); 469 XFS_WANT_CORRUPTED_RETURN(i == 1);
385 } 470 }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
640 /* 725 /*
641 * Allocate/initialize a cursor for the by-number freespace btree. 726 * Allocate/initialize a cursor for the by-number freespace btree.
642 */ 727 */
643 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 728 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
644 args->agno, XFS_BTNUM_BNO, NULL, 0); 729 args->agno, XFS_BTNUM_BNO);
645 /* 730 /*
646 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 731 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
647 * Look for the closest free block <= bno, it must contain bno 732 * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
696 * We are allocating agbno for rlen [agbno .. end] 781 * We are allocating agbno for rlen [agbno .. end]
697 * Allocate/initialize a cursor for the by-size btree. 782 * Allocate/initialize a cursor for the by-size btree.
698 */ 783 */
699 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 784 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
700 args->agno, XFS_BTNUM_CNT, NULL, 0); 785 args->agno, XFS_BTNUM_CNT);
701 ASSERT(args->agbno + args->len <= 786 ASSERT(args->agbno + args->len <=
702 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 787 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
703 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 788 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
759 /* 844 /*
760 * Get a cursor for the by-size btree. 845 * Get a cursor for the by-size btree.
761 */ 846 */
762 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 847 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
763 args->agno, XFS_BTNUM_CNT, NULL, 0); 848 args->agno, XFS_BTNUM_CNT);
764 ltlen = 0; 849 ltlen = 0;
765 bno_cur_lt = bno_cur_gt = NULL; 850 bno_cur_lt = bno_cur_gt = NULL;
766 /* 851 /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
818 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 903 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
819 if (ltlen >= args->minlen) 904 if (ltlen >= args->minlen)
820 break; 905 break;
821 if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) 906 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
822 goto error0; 907 goto error0;
823 } while (i); 908 } while (i);
824 ASSERT(ltlen >= args->minlen); 909 ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
828 i = cnt_cur->bc_ptrs[0]; 913 i = cnt_cur->bc_ptrs[0];
829 for (j = 1, blen = 0, bdiff = 0; 914 for (j = 1, blen = 0, bdiff = 0;
830 !error && j && (blen < args->maxlen || bdiff > 0); 915 !error && j && (blen < args->maxlen || bdiff > 0);
831 error = xfs_alloc_increment(cnt_cur, 0, &j)) { 916 error = xfs_btree_increment(cnt_cur, 0, &j)) {
832 /* 917 /*
833 * For each entry, decide if it's better than 918 * For each entry, decide if it's better than
834 * the previous best entry. 919 * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
886 /* 971 /*
887 * Set up a cursor for the by-bno tree. 972 * Set up a cursor for the by-bno tree.
888 */ 973 */
889 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, 974 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
890 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); 975 args->agbp, args->agno, XFS_BTNUM_BNO);
891 /* 976 /*
892 * Fix up the btree entries. 977 * Fix up the btree entries.
893 */ 978 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
914 /* 999 /*
915 * Allocate and initialize the cursor for the leftward search. 1000 * Allocate and initialize the cursor for the leftward search.
916 */ 1001 */
917 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1002 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
918 args->agno, XFS_BTNUM_BNO, NULL, 0); 1003 args->agno, XFS_BTNUM_BNO);
919 /* 1004 /*
920 * Lookup <= bno to find the leftward search's starting point. 1005 * Lookup <= bno to find the leftward search's starting point.
921 */ 1006 */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
938 * Increment the cursor, so we will point at the entry just right 1023 * Increment the cursor, so we will point at the entry just right
939 * of the leftward entry if any, or to the leftmost entry. 1024 * of the leftward entry if any, or to the leftmost entry.
940 */ 1025 */
941 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1026 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
942 goto error0; 1027 goto error0;
943 if (!i) { 1028 if (!i) {
944 /* 1029 /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
961 args->minlen, &ltbnoa, &ltlena); 1046 args->minlen, &ltbnoa, &ltlena);
962 if (ltlena >= args->minlen) 1047 if (ltlena >= args->minlen)
963 break; 1048 break;
964 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) 1049 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
965 goto error0; 1050 goto error0;
966 if (!i) { 1051 if (!i) {
967 xfs_btree_del_cursor(bno_cur_lt, 1052 xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
977 args->minlen, &gtbnoa, &gtlena); 1062 args->minlen, &gtbnoa, &gtlena);
978 if (gtlena >= args->minlen) 1063 if (gtlena >= args->minlen)
979 break; 1064 break;
980 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1065 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
981 goto error0; 1066 goto error0;
982 if (!i) { 1067 if (!i) {
983 xfs_btree_del_cursor(bno_cur_gt, 1068 xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
1066 /* 1151 /*
1067 * Fell off the right end. 1152 * Fell off the right end.
1068 */ 1153 */
1069 if ((error = xfs_alloc_increment( 1154 if ((error = xfs_btree_increment(
1070 bno_cur_gt, 0, &i))) 1155 bno_cur_gt, 0, &i)))
1071 goto error0; 1156 goto error0;
1072 if (!i) { 1157 if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
1162 /* 1247 /*
1163 * Fell off the left end. 1248 * Fell off the left end.
1164 */ 1249 */
1165 if ((error = xfs_alloc_decrement( 1250 if ((error = xfs_btree_decrement(
1166 bno_cur_lt, 0, &i))) 1251 bno_cur_lt, 0, &i)))
1167 goto error0; 1252 goto error0;
1168 if (!i) { 1253 if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
1267 /* 1352 /*
1268 * Allocate and initialize a cursor for the by-size btree. 1353 * Allocate and initialize a cursor for the by-size btree.
1269 */ 1354 */
1270 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1355 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1271 args->agno, XFS_BTNUM_CNT, NULL, 0); 1356 args->agno, XFS_BTNUM_CNT);
1272 bno_cur = NULL; 1357 bno_cur = NULL;
1273 /* 1358 /*
1274 * Look for an entry >= maxlen+alignment-1 blocks. 1359 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
1321 bestflen = flen; 1406 bestflen = flen;
1322 bestfbno = fbno; 1407 bestfbno = fbno;
1323 for (;;) { 1408 for (;;) {
1324 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) 1409 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1325 goto error0; 1410 goto error0;
1326 if (i == 0) 1411 if (i == 0)
1327 break; 1412 break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
1372 /* 1457 /*
1373 * Allocate and initialize a cursor for the by-block tree. 1458 * Allocate and initialize a cursor for the by-block tree.
1374 */ 1459 */
1375 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1460 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1376 args->agno, XFS_BTNUM_BNO, NULL, 0); 1461 args->agno, XFS_BTNUM_BNO);
1377 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1462 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1378 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1463 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1379 goto error0; 1464 goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
1416 xfs_extlen_t flen; 1501 xfs_extlen_t flen;
1417 int i; 1502 int i;
1418 1503
1419 if ((error = xfs_alloc_decrement(ccur, 0, &i))) 1504 if ((error = xfs_btree_decrement(ccur, 0, &i)))
1420 goto error0; 1505 goto error0;
1421 if (i) { 1506 if (i) {
1422 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1507 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
1515 /* 1600 /*
1516 * Allocate and initialize a cursor for the by-block btree. 1601 * Allocate and initialize a cursor for the by-block btree.
1517 */ 1602 */
1518 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, 1603 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1519 0);
1520 cnt_cur = NULL; 1604 cnt_cur = NULL;
1521 /* 1605 /*
1522 * Look for a neighboring block on the left (lower block numbers) 1606 * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
1549 * Look for a neighboring block on the right (higher block numbers) 1633 * Look for a neighboring block on the right (higher block numbers)
1550 * that is contiguous with this space. 1634 * that is contiguous with this space.
1551 */ 1635 */
1552 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) 1636 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1553 goto error0; 1637 goto error0;
1554 if (haveright) { 1638 if (haveright) {
1555 /* 1639 /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
1575 /* 1659 /*
1576 * Now allocate and initialize a cursor for the by-size tree. 1660 * Now allocate and initialize a cursor for the by-size tree.
1577 */ 1661 */
1578 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, 1662 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1579 0);
1580 /* 1663 /*
1581 * Have both left and right contiguous neighbors. 1664 * Have both left and right contiguous neighbors.
1582 * Merge all three into a single free block. 1665 * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
1588 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1671 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1589 goto error0; 1672 goto error0;
1590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1673 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1591 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1674 if ((error = xfs_btree_delete(cnt_cur, &i)))
1592 goto error0; 1675 goto error0;
1593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1594 /* 1677 /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
1597 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1680 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1598 goto error0; 1681 goto error0;
1599 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1682 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1600 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1683 if ((error = xfs_btree_delete(cnt_cur, &i)))
1601 goto error0; 1684 goto error0;
1602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1603 /* 1686 /*
1604 * Delete the old by-block entry for the right block. 1687 * Delete the old by-block entry for the right block.
1605 */ 1688 */
1606 if ((error = xfs_alloc_delete(bno_cur, &i))) 1689 if ((error = xfs_btree_delete(bno_cur, &i)))
1607 goto error0; 1690 goto error0;
1608 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1609 /* 1692 /*
1610 * Move the by-block cursor back to the left neighbor. 1693 * Move the by-block cursor back to the left neighbor.
1611 */ 1694 */
1612 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1695 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1613 goto error0; 1696 goto error0;
1614 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1697 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1615#ifdef DEBUG 1698#ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
1648 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1731 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1649 goto error0; 1732 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1651 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1734 if ((error = xfs_btree_delete(cnt_cur, &i)))
1652 goto error0; 1735 goto error0;
1653 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1654 /* 1737 /*
1655 * Back up the by-block cursor to the left neighbor, and 1738 * Back up the by-block cursor to the left neighbor, and
1656 * update its length. 1739 * update its length.
1657 */ 1740 */
1658 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1741 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1659 goto error0; 1742 goto error0;
1660 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1743 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1661 nbno = ltbno; 1744 nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
1674 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1757 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1675 goto error0; 1758 goto error0;
1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1759 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1677 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1760 if ((error = xfs_btree_delete(cnt_cur, &i)))
1678 goto error0; 1761 goto error0;
1679 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1762 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1680 /* 1763 /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
1693 else { 1776 else {
1694 nbno = bno; 1777 nbno = bno;
1695 nlen = len; 1778 nlen = len;
1696 if ((error = xfs_alloc_insert(bno_cur, &i))) 1779 if ((error = xfs_btree_insert(bno_cur, &i)))
1697 goto error0; 1780 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1781 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 } 1782 }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
1705 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1788 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1706 goto error0; 1789 goto error0;
1707 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1790 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1708 if ((error = xfs_alloc_insert(cnt_cur, &i))) 1791 if ((error = xfs_btree_insert(cnt_cur, &i)))
1709 goto error0; 1792 goto error0;
1710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1793 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1711 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1794 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
2150 * Read in the allocation group header (free/alloc section). 2233 * Read in the allocation group header (free/alloc section).
2151 */ 2234 */
2152int /* error */ 2235int /* error */
2153xfs_alloc_read_agf( 2236xfs_read_agf(
2154 xfs_mount_t *mp, /* mount point structure */ 2237 struct xfs_mount *mp, /* mount point structure */
2155 xfs_trans_t *tp, /* transaction pointer */ 2238 struct xfs_trans *tp, /* transaction pointer */
2156 xfs_agnumber_t agno, /* allocation group number */ 2239 xfs_agnumber_t agno, /* allocation group number */
2157 int flags, /* XFS_ALLOC_FLAG_... */ 2240 int flags, /* XFS_BUF_ */
2158 xfs_buf_t **bpp) /* buffer for the ag freelist header */ 2241 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2159{ 2242{
2160 xfs_agf_t *agf; /* ag freelist header */ 2243 struct xfs_agf *agf; /* ag freelist header */
2161 int agf_ok; /* set if agf is consistent */ 2244 int agf_ok; /* set if agf is consistent */
2162 xfs_buf_t *bp; /* return value */
2163 xfs_perag_t *pag; /* per allocation group data */
2164 int error; 2245 int error;
2165 2246
2166 ASSERT(agno != NULLAGNUMBER); 2247 ASSERT(agno != NULLAGNUMBER);
2167 error = xfs_trans_read_buf( 2248 error = xfs_trans_read_buf(
2168 mp, tp, mp->m_ddev_targp, 2249 mp, tp, mp->m_ddev_targp,
2169 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2250 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2170 XFS_FSS_TO_BB(mp, 1), 2251 XFS_FSS_TO_BB(mp, 1), flags, bpp);
2171 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2172 &bp);
2173 if (error) 2252 if (error)
2174 return error; 2253 return error;
2175 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 2254 if (!*bpp)
2176 if (!bp) {
2177 *bpp = NULL;
2178 return 0; 2255 return 0;
2179 } 2256
2257 ASSERT(!XFS_BUF_GETERROR(*bpp));
2258 agf = XFS_BUF_TO_AGF(*bpp);
2259
2180 /* 2260 /*
2181 * Validate the magic number of the agf block. 2261 * Validate the magic number of the agf block.
2182 */ 2262 */
2183 agf = XFS_BUF_TO_AGF(bp);
2184 agf_ok = 2263 agf_ok =
2185 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && 2264 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
2186 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2265 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2187 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2266 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2188 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && 2267 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2189 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && 2268 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2190 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); 2269 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2270 be32_to_cpu(agf->agf_seqno) == agno;
2271 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2272 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2273 be32_to_cpu(agf->agf_length);
2191 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2274 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2192 XFS_RANDOM_ALLOC_READ_AGF))) { 2275 XFS_RANDOM_ALLOC_READ_AGF))) {
2193 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", 2276 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2194 XFS_ERRLEVEL_LOW, mp, agf); 2277 XFS_ERRLEVEL_LOW, mp, agf);
2195 xfs_trans_brelse(tp, bp); 2278 xfs_trans_brelse(tp, *bpp);
2196 return XFS_ERROR(EFSCORRUPTED); 2279 return XFS_ERROR(EFSCORRUPTED);
2197 } 2280 }
2281
2282 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2283 return 0;
2284}
2285
2286/*
2287 * Read in the allocation group header (free/alloc section).
2288 */
2289int /* error */
2290xfs_alloc_read_agf(
2291 struct xfs_mount *mp, /* mount point structure */
2292 struct xfs_trans *tp, /* transaction pointer */
2293 xfs_agnumber_t agno, /* allocation group number */
2294 int flags, /* XFS_ALLOC_FLAG_... */
2295 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2296{
2297 struct xfs_agf *agf; /* ag freelist header */
2298 struct xfs_perag *pag; /* per allocation group data */
2299 int error;
2300
2301 ASSERT(agno != NULLAGNUMBER);
2302
2303 error = xfs_read_agf(mp, tp, agno,
2304 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
2305 bpp);
2306 if (error)
2307 return error;
2308 if (!*bpp)
2309 return 0;
2310 ASSERT(!XFS_BUF_GETERROR(*bpp));
2311
2312 agf = XFS_BUF_TO_AGF(*bpp);
2198 pag = &mp->m_perag[agno]; 2313 pag = &mp->m_perag[agno];
2199 if (!pag->pagf_init) { 2314 if (!pag->pagf_init) {
2200 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2315 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
2213#ifdef DEBUG 2328#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) { 2329 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 2330 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
2331 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
2216 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 2332 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2217 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 2333 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 2334 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
2221 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2337 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2222 } 2338 }
2223#endif 2339#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0; 2340 return 0;
2227} 2341}
2228 2342
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5aec15d0651..588172796f7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
122#endif 122#endif
123 123
124void
125xfs_alloc_mark_busy(xfs_trans_t *tp,
126 xfs_agnumber_t agno,
127 xfs_agblock_t bno,
128 xfs_extlen_t len);
129
130void
131xfs_alloc_clear_busy(xfs_trans_t *tp,
132 xfs_agnumber_t ag,
133 int idx);
134
135#endif /* __KERNEL__ */
136
124/* 137/*
125 * Compute and fill in value of m_ag_maxlevels. 138 * Compute and fill in value of m_ag_maxlevels.
126 */ 139 */
@@ -196,18 +209,4 @@ xfs_free_extent(
196 xfs_fsblock_t bno, /* starting block number of extent */ 209 xfs_fsblock_t bno, /* starting block number of extent */
197 xfs_extlen_t len); /* length of extent */ 210 xfs_extlen_t len); /* length of extent */
198 211
199void
200xfs_alloc_mark_busy(xfs_trans_t *tp,
201 xfs_agnumber_t agno,
202 xfs_agblock_t bno,
203 xfs_extlen_t len);
204
205void
206xfs_alloc_clear_busy(xfs_trans_t *tp,
207 xfs_agnumber_t ag,
208 int idx);
209
210
211#endif /* __KERNEL__ */
212
213#endif /* __XFS_ALLOC_H__ */ 212#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3ce2645508a..733cb75a8c5 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -35,2177 +35,464 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42/*
43 * Prototypes for internal functions.
44 */
45 43
46STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); 44STATIC struct xfs_btree_cur *
47STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); 45xfs_allocbt_dup_cursor(
48STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 46 struct xfs_btree_cur *cur)
49STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 47{
50STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); 48 return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
51STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); 49 cur->bc_private.a.agbp, cur->bc_private.a.agno,
52STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); 50 cur->bc_btnum);
53STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, 51}
54 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
55STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
56 52
57/* 53STATIC void
58 * Internal functions. 54xfs_allocbt_set_root(
59 */ 55 struct xfs_btree_cur *cur,
56 union xfs_btree_ptr *ptr,
57 int inc)
58{
59 struct xfs_buf *agbp = cur->bc_private.a.agbp;
60 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
61 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
62 int btnum = cur->bc_btnum;
60 63
61/* 64 ASSERT(ptr->s != 0);
62 * Single level of the xfs_alloc_delete record deletion routine. 65
63 * Delete record pointed to by cur/level. 66 agf->agf_roots[btnum] = ptr->s;
64 * Remove the record from its block then rebalance the tree. 67 be32_add_cpu(&agf->agf_levels[btnum], inc);
65 * Return 0 for error, 1 for done, 2 to go on to the next level. 68 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
66 */ 69
67STATIC int /* error */ 70 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
68xfs_alloc_delrec( 71}
69 xfs_btree_cur_t *cur, /* btree cursor */ 72
70 int level, /* level removing record from */ 73STATIC int
71 int *stat) /* fail/done/go-on */ 74xfs_allocbt_alloc_block(
75 struct xfs_btree_cur *cur,
76 union xfs_btree_ptr *start,
77 union xfs_btree_ptr *new,
78 int length,
79 int *stat)
72{ 80{
73 xfs_agf_t *agf; /* allocation group freelist header */ 81 int error;
74 xfs_alloc_block_t *block; /* btree block record/key lives in */ 82 xfs_agblock_t bno;
75 xfs_agblock_t bno; /* btree block number */
76 xfs_buf_t *bp; /* buffer for block */
77 int error; /* error return value */
78 int i; /* loop index */
79 xfs_alloc_key_t key; /* kp points here if block is level 0 */
80 xfs_agblock_t lbno; /* left block's block number */
81 xfs_buf_t *lbp; /* left block's buffer pointer */
82 xfs_alloc_block_t *left; /* left btree block */
83 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
84 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
85 int lrecs=0; /* number of records in left block */
86 xfs_alloc_rec_t *lrp; /* left block record pointer */
87 xfs_mount_t *mp; /* mount structure */
88 int ptr; /* index in btree block for this rec */
89 xfs_agblock_t rbno; /* right block's block number */
90 xfs_buf_t *rbp; /* right block's buffer pointer */
91 xfs_alloc_block_t *right; /* right btree block */
92 xfs_alloc_key_t *rkp; /* right block key pointer */
93 xfs_alloc_ptr_t *rpp; /* right block address pointer */
94 int rrecs=0; /* number of records in right block */
95 int numrecs;
96 xfs_alloc_rec_t *rrp; /* right block record pointer */
97 xfs_btree_cur_t *tcur; /* temporary btree cursor */
98 83
99 /* 84 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
100 * Get the index of the entry being deleted, check for nothing there. 85
101 */ 86 /* Allocate the new block from the freelist. If we can't, give up. */
102 ptr = cur->bc_ptrs[level]; 87 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
103 if (ptr == 0) { 88 &bno, 1);
104 *stat = 0; 89 if (error) {
105 return 0; 90 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
106 }
107 /*
108 * Get the buffer & block containing the record or key/ptr.
109 */
110 bp = cur->bc_bufs[level];
111 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
112#ifdef DEBUG
113 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
114 return error; 91 return error;
115#endif 92 }
116 /* 93
117 * Fail if we're off the end of the block. 94 if (bno == NULLAGBLOCK) {
118 */ 95 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
119 numrecs = be16_to_cpu(block->bb_numrecs);
120 if (ptr > numrecs) {
121 *stat = 0; 96 *stat = 0;
122 return 0; 97 return 0;
123 } 98 }
124 XFS_STATS_INC(xs_abt_delrec);
125 /*
126 * It's a nonleaf. Excise the key and ptr being deleted, by
127 * sliding the entries past them down one.
128 * Log the changed areas of the block.
129 */
130 if (level > 0) {
131 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
132 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
133#ifdef DEBUG
134 for (i = ptr; i < numrecs; i++) {
135 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
136 return error;
137 }
138#endif
139 if (ptr < numrecs) {
140 memmove(&lkp[ptr - 1], &lkp[ptr],
141 (numrecs - ptr) * sizeof(*lkp));
142 memmove(&lpp[ptr - 1], &lpp[ptr],
143 (numrecs - ptr) * sizeof(*lpp));
144 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
145 xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
146 }
147 }
148 /*
149 * It's a leaf. Excise the record being deleted, by sliding the
150 * entries past it down one. Log the changed areas of the block.
151 */
152 else {
153 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
154 if (ptr < numrecs) {
155 memmove(&lrp[ptr - 1], &lrp[ptr],
156 (numrecs - ptr) * sizeof(*lrp));
157 xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
158 }
159 /*
160 * If it's the first record in the block, we'll need a key
161 * structure to pass up to the next level (updkey).
162 */
163 if (ptr == 1) {
164 key.ar_startblock = lrp->ar_startblock;
165 key.ar_blockcount = lrp->ar_blockcount;
166 lkp = &key;
167 }
168 }
169 /*
170 * Decrement and log the number of entries in the block.
171 */
172 numrecs--;
173 block->bb_numrecs = cpu_to_be16(numrecs);
174 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
175 /*
176 * See if the longest free extent in the allocation group was
177 * changed by this operation. True if it's the by-size btree, and
178 * this is the leaf level, and there is no right sibling block,
179 * and this was the last record.
180 */
181 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
182 mp = cur->bc_mp;
183 99
184 if (level == 0 && 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
185 cur->bc_btnum == XFS_BTNUM_CNT && 101 new->s = cpu_to_be32(bno);
186 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
187 ptr > numrecs) {
188 ASSERT(ptr == numrecs + 1);
189 /*
190 * There are still records in the block. Grab the size
191 * from the last one.
192 */
193 if (numrecs) {
194 rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
195 agf->agf_longest = rrp->ar_blockcount;
196 }
197 /*
198 * No free extents left.
199 */
200 else
201 agf->agf_longest = 0;
202 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
203 be32_to_cpu(agf->agf_longest);
204 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
205 XFS_AGF_LONGEST);
206 }
207 /*
208 * Is this the root level? If so, we're almost done.
209 */
210 if (level == cur->bc_nlevels - 1) {
211 /*
212 * If this is the root level,
213 * and there's only one entry left,
214 * and it's NOT the leaf level,
215 * then we can get rid of this level.
216 */
217 if (numrecs == 1 && level > 0) {
218 /*
219 * lpp is still set to the first pointer in the block.
220 * Make it the new root of the btree.
221 */
222 bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
223 agf->agf_roots[cur->bc_btnum] = *lpp;
224 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
225 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
226 /*
227 * Put this buffer/block on the ag's freelist.
228 */
229 error = xfs_alloc_put_freelist(cur->bc_tp,
230 cur->bc_private.a.agbp, NULL, bno, 1);
231 if (error)
232 return error;
233 /*
234 * Since blocks move to the free list without the
235 * coordination used in xfs_bmap_finish, we can't allow
236 * block to be available for reallocation and
237 * non-transaction writing (user data) until we know
238 * that the transaction that moved it to the free list
239 * is permanently on disk. We track the blocks by
240 * declaring these blocks as "busy"; the busy list is
241 * maintained on a per-ag basis and each transaction
242 * records which entries should be removed when the
243 * iclog commits to disk. If a busy block is
244 * allocated, the iclog is pushed up to the LSN
245 * that freed the block.
246 */
247 xfs_alloc_mark_busy(cur->bc_tp,
248 be32_to_cpu(agf->agf_seqno), bno, 1);
249 102
250 xfs_trans_agbtree_delta(cur->bc_tp, -1); 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
251 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, 104 *stat = 1;
252 XFS_AGF_ROOTS | XFS_AGF_LEVELS); 105 return 0;
253 /* 106}
254 * Update the cursor so there's one fewer level.
255 */
256 xfs_btree_setbuf(cur, level, NULL);
257 cur->bc_nlevels--;
258 } else if (level > 0 &&
259 (error = xfs_alloc_decrement(cur, level, &i)))
260 return error;
261 *stat = 1;
262 return 0;
263 }
264 /*
265 * If we deleted the leftmost entry in the block, update the
266 * key values above us in the tree.
267 */
268 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
269 return error;
270 /*
271 * If the number of records remaining in the block is at least
272 * the minimum, we're done.
273 */
274 if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
275 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
276 return error;
277 *stat = 1;
278 return 0;
279 }
280 /*
281 * Otherwise, we have to move some records around to keep the
282 * tree balanced. Look at the left and right sibling blocks to
283 * see if we can re-balance by moving only one record.
284 */
285 rbno = be32_to_cpu(block->bb_rightsib);
286 lbno = be32_to_cpu(block->bb_leftsib);
287 bno = NULLAGBLOCK;
288 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
289 /*
290 * Duplicate the cursor so our btree manipulations here won't
291 * disrupt the next level up.
292 */
293 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
294 return error;
295 /*
296 * If there's a right sibling, see if it's ok to shift an entry
297 * out of it.
298 */
299 if (rbno != NULLAGBLOCK) {
300 /*
301 * Move the temp cursor to the last entry in the next block.
302 * Actually any entry but the first would suffice.
303 */
304 i = xfs_btree_lastrec(tcur, level);
305 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
306 if ((error = xfs_alloc_increment(tcur, level, &i)))
307 goto error0;
308 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
309 i = xfs_btree_lastrec(tcur, level);
310 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
311 /*
312 * Grab a pointer to the block.
313 */
314 rbp = tcur->bc_bufs[level];
315 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
316#ifdef DEBUG
317 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
318 goto error0;
319#endif
320 /*
321 * Grab the current block number, for future use.
322 */
323 bno = be32_to_cpu(right->bb_leftsib);
324 /*
325 * If right block is full enough so that removing one entry
326 * won't make it too empty, and left-shifting an entry out
327 * of right to us works, we're done.
328 */
329 if (be16_to_cpu(right->bb_numrecs) - 1 >=
330 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
331 if ((error = xfs_alloc_lshift(tcur, level, &i)))
332 goto error0;
333 if (i) {
334 ASSERT(be16_to_cpu(block->bb_numrecs) >=
335 XFS_ALLOC_BLOCK_MINRECS(level, cur));
336 xfs_btree_del_cursor(tcur,
337 XFS_BTREE_NOERROR);
338 if (level > 0 &&
339 (error = xfs_alloc_decrement(cur, level,
340 &i)))
341 return error;
342 *stat = 1;
343 return 0;
344 }
345 }
346 /*
347 * Otherwise, grab the number of records in right for
348 * future reference, and fix up the temp cursor to point
349 * to our block again (last record).
350 */
351 rrecs = be16_to_cpu(right->bb_numrecs);
352 if (lbno != NULLAGBLOCK) {
353 i = xfs_btree_firstrec(tcur, level);
354 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
355 if ((error = xfs_alloc_decrement(tcur, level, &i)))
356 goto error0;
357 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
358 }
359 }
360 /*
361 * If there's a left sibling, see if it's ok to shift an entry
362 * out of it.
363 */
364 if (lbno != NULLAGBLOCK) {
365 /*
366 * Move the temp cursor to the first entry in the
367 * previous block.
368 */
369 i = xfs_btree_firstrec(tcur, level);
370 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
371 if ((error = xfs_alloc_decrement(tcur, level, &i)))
372 goto error0;
373 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
374 xfs_btree_firstrec(tcur, level);
375 /*
376 * Grab a pointer to the block.
377 */
378 lbp = tcur->bc_bufs[level];
379 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
380#ifdef DEBUG
381 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
382 goto error0;
383#endif
384 /*
385 * Grab the current block number, for future use.
386 */
387 bno = be32_to_cpu(left->bb_rightsib);
388 /*
389 * If left block is full enough so that removing one entry
390 * won't make it too empty, and right-shifting an entry out
391 * of left to us works, we're done.
392 */
393 if (be16_to_cpu(left->bb_numrecs) - 1 >=
394 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
395 if ((error = xfs_alloc_rshift(tcur, level, &i)))
396 goto error0;
397 if (i) {
398 ASSERT(be16_to_cpu(block->bb_numrecs) >=
399 XFS_ALLOC_BLOCK_MINRECS(level, cur));
400 xfs_btree_del_cursor(tcur,
401 XFS_BTREE_NOERROR);
402 if (level == 0)
403 cur->bc_ptrs[0]++;
404 *stat = 1;
405 return 0;
406 }
407 }
408 /*
409 * Otherwise, grab the number of records in right for
410 * future reference.
411 */
412 lrecs = be16_to_cpu(left->bb_numrecs);
413 }
414 /*
415 * Delete the temp cursor, we're done with it.
416 */
417 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
418 /*
419 * If here, we need to do a join to keep the tree balanced.
420 */
421 ASSERT(bno != NULLAGBLOCK);
422 /*
423 * See if we can join with the left neighbor block.
424 */
425 if (lbno != NULLAGBLOCK &&
426 lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
427 /*
428 * Set "right" to be the starting block,
429 * "left" to be the left neighbor.
430 */
431 rbno = bno;
432 right = block;
433 rrecs = be16_to_cpu(right->bb_numrecs);
434 rbp = bp;
435 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
436 cur->bc_private.a.agno, lbno, 0, &lbp,
437 XFS_ALLOC_BTREE_REF)))
438 return error;
439 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
440 lrecs = be16_to_cpu(left->bb_numrecs);
441 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
442 return error;
443 }
444 /*
445 * If that won't work, see if we can join with the right neighbor block.
446 */
447 else if (rbno != NULLAGBLOCK &&
448 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
449 /*
450 * Set "left" to be the starting block,
451 * "right" to be the right neighbor.
452 */
453 lbno = bno;
454 left = block;
455 lrecs = be16_to_cpu(left->bb_numrecs);
456 lbp = bp;
457 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
458 cur->bc_private.a.agno, rbno, 0, &rbp,
459 XFS_ALLOC_BTREE_REF)))
460 return error;
461 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
462 rrecs = be16_to_cpu(right->bb_numrecs);
463 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
464 return error;
465 }
466 /*
467 * Otherwise, we can't fix the imbalance.
468 * Just return. This is probably a logic error, but it's not fatal.
469 */
470 else {
471 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
472 return error;
473 *stat = 1;
474 return 0;
475 }
476 /*
477 * We're now going to join "left" and "right" by moving all the stuff
478 * in "right" to "left" and deleting "right".
479 */
480 if (level > 0) {
481 /*
482 * It's a non-leaf. Move keys and pointers.
483 */
484 lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
485 lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
486 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
487 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
488#ifdef DEBUG
489 for (i = 0; i < rrecs; i++) {
490 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
491 return error;
492 }
493#endif
494 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
495 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
496 xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
497 xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
498 } else {
499 /*
500 * It's a leaf. Move records.
501 */
502 lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
503 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
504 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
505 xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
506 }
507 /*
508 * If we joined with the left neighbor, set the buffer in the
509 * cursor to the left block, and fix up the index.
510 */
511 if (bp != lbp) {
512 xfs_btree_setbuf(cur, level, lbp);
513 cur->bc_ptrs[level] += lrecs;
514 }
515 /*
516 * If we joined with the right neighbor and there's a level above
517 * us, increment the cursor at that level.
518 */
519 else if (level + 1 < cur->bc_nlevels &&
520 (error = xfs_alloc_increment(cur, level + 1, &i)))
521 return error;
522 /*
523 * Fix up the number of records in the surviving block.
524 */
525 lrecs += rrecs;
526 left->bb_numrecs = cpu_to_be16(lrecs);
527 /*
528 * Fix up the right block pointer in the surviving block, and log it.
529 */
530 left->bb_rightsib = right->bb_rightsib;
531 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
532 /*
533 * If there is a right sibling now, make it point to the
534 * remaining block.
535 */
536 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
537 xfs_alloc_block_t *rrblock;
538 xfs_buf_t *rrbp;
539 107
540 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 108STATIC int
541 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, 109xfs_allocbt_free_block(
542 &rrbp, XFS_ALLOC_BTREE_REF))) 110 struct xfs_btree_cur *cur,
543 return error; 111 struct xfs_buf *bp)
544 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); 112{
545 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 113 struct xfs_buf *agbp = cur->bc_private.a.agbp;
546 return error; 114 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
547 rrblock->bb_leftsib = cpu_to_be32(lbno); 115 xfs_agblock_t bno;
548 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); 116 int error;
549 } 117
550 /* 118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
551 * Free the deleting block by putting it on the freelist. 119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
552 */
553 error = xfs_alloc_put_freelist(cur->bc_tp,
554 cur->bc_private.a.agbp, NULL, rbno, 1);
555 if (error) 120 if (error)
556 return error; 121 return error;
122
557 /* 123 /*
558 * Since blocks move to the free list without the coordination 124 * Since blocks move to the free list without the coordination used in
559 * used in xfs_bmap_finish, we can't allow block to be available 125 * xfs_bmap_finish, we can't allow block to be available for
560 * for reallocation and non-transaction writing (user data) 126 * reallocation and non-transaction writing (user data) until we know
561 * until we know that the transaction that moved it to the free 127 * that the transaction that moved it to the free list is permanently
562 * list is permanently on disk. We track the blocks by declaring 128 * on disk. We track the blocks by declaring these blocks as "busy";
563 * these blocks as "busy"; the busy list is maintained on a 129 * the busy list is maintained on a per-ag basis and each transaction
564 * per-ag basis and each transaction records which entries 130 * records which entries should be removed when the iclog commits to
565 * should be removed when the iclog commits to disk. If a 131 * disk. If a busy block is allocated, the iclog is pushed up to the
566 * busy block is allocated, the iclog is pushed up to the
567 * LSN that freed the block. 132 * LSN that freed the block.
568 */ 133 */
569 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 134 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
570 xfs_trans_agbtree_delta(cur->bc_tp, -1); 135 xfs_trans_agbtree_delta(cur->bc_tp, -1);
571
572 /*
573 * Adjust the current level's cursor so that we're left referring
574 * to the right node, after we're done.
575 * If this leaves the ptr value 0 our caller will fix it up.
576 */
577 if (level > 0)
578 cur->bc_ptrs[level]--;
579 /*
580 * Return value means the next level up has something to do.
581 */
582 *stat = 2;
583 return 0; 136 return 0;
584
585error0:
586 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
587 return error;
588} 137}
589 138
590/* 139/*
591 * Insert one record/level. Return information to the caller 140 * Update the longest extent in the AGF
592 * allowing the next level up to proceed if necessary.
593 */ 141 */
594STATIC int /* error */ 142STATIC void
595xfs_alloc_insrec( 143xfs_allocbt_update_lastrec(
596 xfs_btree_cur_t *cur, /* btree cursor */ 144 struct xfs_btree_cur *cur,
597 int level, /* level to insert record at */ 145 struct xfs_btree_block *block,
598 xfs_agblock_t *bnop, /* i/o: block number inserted */ 146 union xfs_btree_rec *rec,
599 xfs_alloc_rec_t *recp, /* i/o: record data inserted */ 147 int ptr,
600 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ 148 int reason)
601 int *stat) /* output: success/failure */
602{ 149{
603 xfs_agf_t *agf; /* allocation group freelist header */ 150 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
604 xfs_alloc_block_t *block; /* btree block record/key lives in */ 151 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
605 xfs_buf_t *bp; /* buffer for block */ 152 __be32 len;
606 int error; /* error return value */
607 int i; /* loop index */
608 xfs_alloc_key_t key; /* key value being inserted */
609 xfs_alloc_key_t *kp; /* pointer to btree keys */
610 xfs_agblock_t nbno; /* block number of allocated block */
611 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
612 xfs_alloc_key_t nkey; /* new key value, from split */
613 xfs_alloc_rec_t nrec; /* new record value, for caller */
614 int numrecs; 153 int numrecs;
615 int optr; /* old ptr value */
616 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
617 int ptr; /* index in btree block for this rec */
618 xfs_alloc_rec_t *rp; /* pointer to btree records */
619 154
620 ASSERT(be32_to_cpu(recp->ar_blockcount) > 0); 155 ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
156
157 switch (reason) {
158 case LASTREC_UPDATE:
159 /*
160 * If this is the last leaf block and it's the last record,
161 * then update the size of the longest extent in the AG.
162 */
163 if (ptr != xfs_btree_get_numrecs(block))
164 return;
165 len = rec->alloc.ar_blockcount;
166 break;
167 case LASTREC_INSREC:
168 if (be32_to_cpu(rec->alloc.ar_blockcount) <=
169 be32_to_cpu(agf->agf_longest))
170 return;
171 len = rec->alloc.ar_blockcount;
172 break;
173 case LASTREC_DELREC:
174 numrecs = xfs_btree_get_numrecs(block);
175 if (ptr <= numrecs)
176 return;
177 ASSERT(ptr == numrecs + 1);
621 178
622 /* 179 if (numrecs) {
623 * GCC doesn't understand the (arguably complex) control flow in 180 xfs_alloc_rec_t *rrp;
624 * this function and complains about uninitialized structure fields
625 * without this.
626 */
627 memset(&nrec, 0, sizeof(nrec));
628 181
629 /* 182 rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
630 * If we made it to the root level, allocate a new root block 183 len = rrp->ar_blockcount;
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock;
645 key.ar_blockcount = recp->ar_blockcount;
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660 numrecs = be16_to_cpu(block->bb_numrecs);
661#ifdef DEBUG
662 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
663 return error;
664 /*
665 * Check that the new entry is being inserted in the right place.
666 */
667 if (ptr <= numrecs) {
668 if (level == 0) {
669 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
670 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
671 } else { 184 } else {
672 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); 185 len = 0;
673 xfs_btree_check_key(cur->bc_btnum, &key, kp);
674 }
675 }
676#endif
677 nbno = NULLAGBLOCK;
678 ncur = NULL;
679 /*
680 * If the block is full, we can't insert the new entry until we
681 * make the block un-full.
682 */
683 if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
684 /*
685 * First, try shifting an entry to the right neighbor.
686 */
687 if ((error = xfs_alloc_rshift(cur, level, &i)))
688 return error;
689 if (i) {
690 /* nothing */
691 }
692 /*
693 * Next, try shifting an entry to the left neighbor.
694 */
695 else {
696 if ((error = xfs_alloc_lshift(cur, level, &i)))
697 return error;
698 if (i)
699 optr = ptr = cur->bc_ptrs[level];
700 else {
701 /*
702 * Next, try splitting the current block in
703 * half. If this works we have to re-set our
704 * variables because we could be in a
705 * different block now.
706 */
707 if ((error = xfs_alloc_split(cur, level, &nbno,
708 &nkey, &ncur, &i)))
709 return error;
710 if (i) {
711 bp = cur->bc_bufs[level];
712 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
713#ifdef DEBUG
714 if ((error =
715 xfs_btree_check_sblock(cur,
716 block, level, bp)))
717 return error;
718#endif
719 ptr = cur->bc_ptrs[level];
720 nrec.ar_startblock = nkey.ar_startblock;
721 nrec.ar_blockcount = nkey.ar_blockcount;
722 }
723 /*
724 * Otherwise the insert fails.
725 */
726 else {
727 *stat = 0;
728 return 0;
729 }
730 }
731 }
732 }
733 /*
734 * At this point we know there's room for our new entry in the block
735 * we're pointing at.
736 */
737 numrecs = be16_to_cpu(block->bb_numrecs);
738 if (level > 0) {
739 /*
740 * It's a non-leaf entry. Make a hole for the new data
741 * in the key and ptr regions of the block.
742 */
743 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
744 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
745#ifdef DEBUG
746 for (i = numrecs; i >= ptr; i--) {
747 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
748 return error;
749 } 186 }
750#endif
751 memmove(&kp[ptr], &kp[ptr - 1],
752 (numrecs - ptr + 1) * sizeof(*kp));
753 memmove(&pp[ptr], &pp[ptr - 1],
754 (numrecs - ptr + 1) * sizeof(*pp));
755#ifdef DEBUG
756 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
757 return error;
758#endif
759 /*
760 * Now stuff the new data in, bump numrecs and log the new data.
761 */
762 kp[ptr - 1] = key;
763 pp[ptr - 1] = cpu_to_be32(*bnop);
764 numrecs++;
765 block->bb_numrecs = cpu_to_be16(numrecs);
766 xfs_alloc_log_keys(cur, bp, ptr, numrecs);
767 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
768#ifdef DEBUG
769 if (ptr < numrecs)
770 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
771 kp + ptr);
772#endif
773 } else {
774 /*
775 * It's a leaf entry. Make a hole for the new record.
776 */
777 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
778 memmove(&rp[ptr], &rp[ptr - 1],
779 (numrecs - ptr + 1) * sizeof(*rp));
780 /*
781 * Now stuff the new record in, bump numrecs
782 * and log the new data.
783 */
784 rp[ptr - 1] = *recp;
785 numrecs++;
786 block->bb_numrecs = cpu_to_be16(numrecs);
787 xfs_alloc_log_recs(cur, bp, ptr, numrecs);
788#ifdef DEBUG
789 if (ptr < numrecs)
790 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
791 rp + ptr);
792#endif
793 }
794 /*
795 * Log the new number of records in the btree header.
796 */
797 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
798 /*
799 * If we inserted at the start of a block, update the parents' keys.
800 */
801 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
802 return error;
803 /*
804 * Look to see if the longest extent in the allocation group
805 * needs to be updated.
806 */
807 187
808 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 188 break;
809 if (level == 0 && 189 default:
810 cur->bc_btnum == XFS_BTNUM_CNT && 190 ASSERT(0);
811 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && 191 return;
812 be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
813 /*
814 * If this is a leaf in the by-size btree and there
815 * is no right sibling block and this block is bigger
816 * than the previous longest block, update it.
817 */
818 agf->agf_longest = recp->ar_blockcount;
819 cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
820 = be32_to_cpu(recp->ar_blockcount);
821 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
822 XFS_AGF_LONGEST);
823 } 192 }
824 /* 193
825 * Return the new block number, if any. 194 agf->agf_longest = len;
826 * If there is one, give back a record value and a cursor too. 195 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
827 */ 196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
828 *bnop = nbno;
829 if (nbno != NULLAGBLOCK) {
830 *recp = nrec;
831 *curp = ncur;
832 }
833 *stat = 1;
834 return 0;
835} 197}
836 198
837/* 199STATIC int
838 * Log header fields from a btree block. 200xfs_allocbt_get_minrecs(
839 */ 201 struct xfs_btree_cur *cur,
840STATIC void 202 int level)
841xfs_alloc_log_block(
842 xfs_trans_t *tp, /* transaction pointer */
843 xfs_buf_t *bp, /* buffer containing btree block */
844 int fields) /* mask of fields: XFS_BB_... */
845{ 203{
846 int first; /* first byte offset logged */ 204 return cur->bc_mp->m_alloc_mnr[level != 0];
847 int last; /* last byte offset logged */ 205}
848 static const short offsets[] = { /* table of offsets */
849 offsetof(xfs_alloc_block_t, bb_magic),
850 offsetof(xfs_alloc_block_t, bb_level),
851 offsetof(xfs_alloc_block_t, bb_numrecs),
852 offsetof(xfs_alloc_block_t, bb_leftsib),
853 offsetof(xfs_alloc_block_t, bb_rightsib),
854 sizeof(xfs_alloc_block_t)
855 };
856 206
857 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 207STATIC int
858 xfs_trans_log_buf(tp, bp, first, last); 208xfs_allocbt_get_maxrecs(
209 struct xfs_btree_cur *cur,
210 int level)
211{
212 return cur->bc_mp->m_alloc_mxr[level != 0];
859} 213}
860 214
861/*
862 * Log keys from a btree block (nonleaf).
863 */
864STATIC void 215STATIC void
865xfs_alloc_log_keys( 216xfs_allocbt_init_key_from_rec(
866 xfs_btree_cur_t *cur, /* btree cursor */ 217 union xfs_btree_key *key,
867 xfs_buf_t *bp, /* buffer containing btree block */ 218 union xfs_btree_rec *rec)
868 int kfirst, /* index of first key to log */
869 int klast) /* index of last key to log */
870{ 219{
871 xfs_alloc_block_t *block; /* btree block to log from */ 220 ASSERT(rec->alloc.ar_startblock != 0);
872 int first; /* first byte offset logged */
873 xfs_alloc_key_t *kp; /* key pointer in btree block */
874 int last; /* last byte offset logged */
875 221
876 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 222 key->alloc.ar_startblock = rec->alloc.ar_startblock;
877 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); 223 key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
878 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
879 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
880 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
881} 224}
882 225
883/*
884 * Log block pointer fields from a btree block (nonleaf).
885 */
886STATIC void 226STATIC void
887xfs_alloc_log_ptrs( 227xfs_allocbt_init_rec_from_key(
888 xfs_btree_cur_t *cur, /* btree cursor */ 228 union xfs_btree_key *key,
889 xfs_buf_t *bp, /* buffer containing btree block */ 229 union xfs_btree_rec *rec)
890 int pfirst, /* index of first pointer to log */
891 int plast) /* index of last pointer to log */
892{ 230{
893 xfs_alloc_block_t *block; /* btree block to log from */ 231 ASSERT(key->alloc.ar_startblock != 0);
894 int first; /* first byte offset logged */
895 int last; /* last byte offset logged */
896 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
897 232
898 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 233 rec->alloc.ar_startblock = key->alloc.ar_startblock;
899 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); 234 rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
900 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
901 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
902 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
903} 235}
904 236
905/*
906 * Log records from a btree block (leaf).
907 */
908STATIC void 237STATIC void
909xfs_alloc_log_recs( 238xfs_allocbt_init_rec_from_cur(
910 xfs_btree_cur_t *cur, /* btree cursor */ 239 struct xfs_btree_cur *cur,
911 xfs_buf_t *bp, /* buffer containing btree block */ 240 union xfs_btree_rec *rec)
912 int rfirst, /* index of first record to log */
913 int rlast) /* index of last record to log */
914{ 241{
915 xfs_alloc_block_t *block; /* btree block to log from */ 242 ASSERT(cur->bc_rec.a.ar_startblock != 0);
916 int first; /* first byte offset logged */
917 int last; /* last byte offset logged */
918 xfs_alloc_rec_t *rp; /* record pointer for btree block */
919
920 243
921 block = XFS_BUF_TO_ALLOC_BLOCK(bp); 244 rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
922 rp = XFS_ALLOC_REC_ADDR(block, 1, cur); 245 rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
923#ifdef DEBUG
924 {
925 xfs_agf_t *agf;
926 xfs_alloc_rec_t *p;
927
928 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
929 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
930 ASSERT(be32_to_cpu(p->ar_startblock) +
931 be32_to_cpu(p->ar_blockcount) <=
932 be32_to_cpu(agf->agf_length));
933 }
934#endif
935 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
936 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
937 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
938} 246}
939 247
940/* 248STATIC void
941 * Lookup the record. The cursor is made to point to it, based on dir. 249xfs_allocbt_init_ptr_from_cur(
942 * Return 0 if can't find any such record, 1 for success. 250 struct xfs_btree_cur *cur,
943 */ 251 union xfs_btree_ptr *ptr)
944STATIC int /* error */
945xfs_alloc_lookup(
946 xfs_btree_cur_t *cur, /* btree cursor */
947 xfs_lookup_t dir, /* <=, ==, or >= */
948 int *stat) /* success/failure */
949{ 252{
950 xfs_agblock_t agbno; /* a.g. relative btree block number */ 253 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
951 xfs_agnumber_t agno; /* allocation group number */
952 xfs_alloc_block_t *block=NULL; /* current btree block */
953 int diff; /* difference for the current key */
954 int error; /* error return value */
955 int keyno=0; /* current key number */
956 int level; /* level in the btree */
957 xfs_mount_t *mp; /* file system mount point */
958
959 XFS_STATS_INC(xs_abt_lookup);
960 /*
961 * Get the allocation group header, and the root block number.
962 */
963 mp = cur->bc_mp;
964
965 {
966 xfs_agf_t *agf; /* a.g. freespace header */
967
968 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
969 agno = be32_to_cpu(agf->agf_seqno);
970 agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
971 }
972 /*
973 * Iterate over each level in the btree, starting at the root.
974 * For each level above the leaves, find the key we need, based
975 * on the lookup record, then follow the corresponding block
976 * pointer down to the next level.
977 */
978 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
979 xfs_buf_t *bp; /* buffer pointer for btree block */
980 xfs_daddr_t d; /* disk address of btree block */
981
982 /*
983 * Get the disk address we're looking for.
984 */
985 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
986 /*
987 * If the old buffer at this level is for a different block,
988 * throw it away, otherwise just use it.
989 */
990 bp = cur->bc_bufs[level];
991 if (bp && XFS_BUF_ADDR(bp) != d)
992 bp = NULL;
993 if (!bp) {
994 /*
995 * Need to get a new buffer. Read it, then
996 * set it in the cursor, releasing the old one.
997 */
998 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
999 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
1000 return error;
1001 xfs_btree_setbuf(cur, level, bp);
1002 /*
1003 * Point to the btree block, now that we have the buffer
1004 */
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 if ((error = xfs_btree_check_sblock(cur, block, level,
1007 bp)))
1008 return error;
1009 } else
1010 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1011 /*
1012 * If we already had a key match at a higher level, we know
1013 * we need to use the first entry in this block.
1014 */
1015 if (diff == 0)
1016 keyno = 1;
1017 /*
1018 * Otherwise we need to search this block. Do a binary search.
1019 */
1020 else {
1021 int high; /* high entry number */
1022 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1023 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1024 int low; /* low entry number */
1025
1026 /*
1027 * Get a pointer to keys or records.
1028 */
1029 if (level > 0)
1030 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1031 else
1032 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1033 /*
1034 * Set low and high entry numbers, 1-based.
1035 */
1036 low = 1;
1037 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1038 /*
1039 * If the block is empty, the tree must
1040 * be an empty leaf.
1041 */
1042 ASSERT(level == 0 && cur->bc_nlevels == 1);
1043 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1044 *stat = 0;
1045 return 0;
1046 }
1047 /*
1048 * Binary search the block.
1049 */
1050 while (low <= high) {
1051 xfs_extlen_t blockcount; /* key value */
1052 xfs_agblock_t startblock; /* key value */
1053
1054 XFS_STATS_INC(xs_abt_compare);
1055 /*
1056 * keyno is average of low and high.
1057 */
1058 keyno = (low + high) >> 1;
1059 /*
1060 * Get startblock & blockcount.
1061 */
1062 if (level > 0) {
1063 xfs_alloc_key_t *kkp;
1064
1065 kkp = kkbase + keyno - 1;
1066 startblock = be32_to_cpu(kkp->ar_startblock);
1067 blockcount = be32_to_cpu(kkp->ar_blockcount);
1068 } else {
1069 xfs_alloc_rec_t *krp;
1070 254
1071 krp = krbase + keyno - 1; 255 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
1072 startblock = be32_to_cpu(krp->ar_startblock); 256 ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
1073 blockcount = be32_to_cpu(krp->ar_blockcount);
1074 }
1075 /*
1076 * Compute difference to get next direction.
1077 */
1078 if (cur->bc_btnum == XFS_BTNUM_BNO)
1079 diff = (int)startblock -
1080 (int)cur->bc_rec.a.ar_startblock;
1081 else if (!(diff = (int)blockcount -
1082 (int)cur->bc_rec.a.ar_blockcount))
1083 diff = (int)startblock -
1084 (int)cur->bc_rec.a.ar_startblock;
1085 /*
1086 * Less than, move right.
1087 */
1088 if (diff < 0)
1089 low = keyno + 1;
1090 /*
1091 * Greater than, move left.
1092 */
1093 else if (diff > 0)
1094 high = keyno - 1;
1095 /*
1096 * Equal, we're done.
1097 */
1098 else
1099 break;
1100 }
1101 }
1102 /*
1103 * If there are more levels, set up for the next level
1104 * by getting the block number and filling in the cursor.
1105 */
1106 if (level > 0) {
1107 /*
1108 * If we moved left, need the previous key number,
1109 * unless there isn't one.
1110 */
1111 if (diff > 0 && --keyno < 1)
1112 keyno = 1;
1113 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
1114#ifdef DEBUG
1115 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1116 return error;
1117#endif
1118 cur->bc_ptrs[level] = keyno;
1119 }
1120 }
1121 /*
1122 * Done with the search.
1123 * See if we need to adjust the results.
1124 */
1125 if (dir != XFS_LOOKUP_LE && diff < 0) {
1126 keyno++;
1127 /*
1128 * If ge search and we went off the end of the block, but it's
1129 * not the last block, we're in the wrong block.
1130 */
1131 if (dir == XFS_LOOKUP_GE &&
1132 keyno > be16_to_cpu(block->bb_numrecs) &&
1133 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1134 int i;
1135 257
1136 cur->bc_ptrs[0] = keyno; 258 ptr->s = agf->agf_roots[cur->bc_btnum];
1137 if ((error = xfs_alloc_increment(cur, 0, &i)))
1138 return error;
1139 XFS_WANT_CORRUPTED_RETURN(i == 1);
1140 *stat = 1;
1141 return 0;
1142 }
1143 }
1144 else if (dir == XFS_LOOKUP_LE && diff > 0)
1145 keyno--;
1146 cur->bc_ptrs[0] = keyno;
1147 /*
1148 * Return if we succeeded or not.
1149 */
1150 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1151 *stat = 0;
1152 else
1153 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1154 return 0;
1155} 259}
1156 260
1157/* 261STATIC __int64_t
1158 * Move 1 record left from cur/level if possible. 262xfs_allocbt_key_diff(
1159 * Update cur to reflect the new path. 263 struct xfs_btree_cur *cur,
1160 */ 264 union xfs_btree_key *key)
1161STATIC int /* error */
1162xfs_alloc_lshift(
1163 xfs_btree_cur_t *cur, /* btree cursor */
1164 int level, /* level to shift record on */
1165 int *stat) /* success/failure */
1166{ 265{
1167 int error; /* error return value */ 266 xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
1168#ifdef DEBUG 267 xfs_alloc_key_t *kp = &key->alloc;
1169 int i; /* loop index */ 268 __int64_t diff;
1170#endif
1171 xfs_alloc_key_t key; /* key value for leaf level upward */
1172 xfs_buf_t *lbp; /* buffer for left neighbor block */
1173 xfs_alloc_block_t *left; /* left neighbor btree block */
1174 int nrec; /* new number of left block entries */
1175 xfs_buf_t *rbp; /* buffer for right (current) block */
1176 xfs_alloc_block_t *right; /* right (current) btree block */
1177 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1178 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1179 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1180 269
1181 /* 270 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1182 * Set up variables for this block as "right". 271 return (__int64_t)be32_to_cpu(kp->ar_startblock) -
1183 */ 272 rec->ar_startblock;
1184 rbp = cur->bc_bufs[level];
1185 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1186#ifdef DEBUG
1187 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1188 return error;
1189#endif
1190 /*
1191 * If we've got no left sibling then we can't shift an entry left.
1192 */
1193 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1194 *stat = 0;
1195 return 0;
1196 }
1197 /*
1198 * If the cursor entry is the one that would be moved, don't
1199 * do it... it's too complicated.
1200 */
1201 if (cur->bc_ptrs[level] <= 1) {
1202 *stat = 0;
1203 return 0;
1204 }
1205 /*
1206 * Set up the left neighbor as "left".
1207 */
1208 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1209 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1210 0, &lbp, XFS_ALLOC_BTREE_REF)))
1211 return error;
1212 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1213 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1214 return error;
1215 /*
1216 * If it's full, it can't take another entry.
1217 */
1218 if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1219 *stat = 0;
1220 return 0;
1221 } 273 }
1222 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1223 /*
1224 * If non-leaf, copy a key and a ptr to the left block.
1225 */
1226 if (level > 0) {
1227 xfs_alloc_key_t *lkp; /* key pointer for left block */
1228 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1229 274
1230 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); 275 diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
1231 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); 276 if (diff)
1232 *lkp = *rkp; 277 return diff;
1233 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1234 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1235 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1236#ifdef DEBUG
1237 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1238 return error;
1239#endif
1240 *lpp = *rpp;
1241 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1242 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1243 }
1244 /*
1245 * If leaf, copy a record to the left block.
1246 */
1247 else {
1248 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1249 278
1250 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); 279 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
1251 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1252 *lrp = *rrp;
1253 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1254 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1255 }
1256 /*
1257 * Bump and log left's numrecs, decrement and log right's numrecs.
1258 */
1259 be16_add_cpu(&left->bb_numrecs, 1);
1260 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1261 be16_add_cpu(&right->bb_numrecs, -1);
1262 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1263 /*
1264 * Slide the contents of right down one entry.
1265 */
1266 if (level > 0) {
1267#ifdef DEBUG
1268 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1269 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1270 level)))
1271 return error;
1272 }
1273#endif
1274 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1275 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1276 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1277 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1278 } else {
1279 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1280 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1281 key.ar_startblock = rrp->ar_startblock;
1282 key.ar_blockcount = rrp->ar_blockcount;
1283 rkp = &key;
1284 }
1285 /*
1286 * Update the parent key values of right.
1287 */
1288 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1289 return error;
1290 /*
1291 * Slide the cursor value left one.
1292 */
1293 cur->bc_ptrs[level]--;
1294 *stat = 1;
1295 return 0;
1296} 280}
1297 281
1298/* 282STATIC int
1299 * Allocate a new root block, fill it in. 283xfs_allocbt_kill_root(
1300 */ 284 struct xfs_btree_cur *cur,
1301STATIC int /* error */ 285 struct xfs_buf *bp,
1302xfs_alloc_newroot( 286 int level,
1303 xfs_btree_cur_t *cur, /* btree cursor */ 287 union xfs_btree_ptr *newroot)
1304 int *stat) /* success/failure */
1305{ 288{
1306 int error; /* error return value */ 289 int error;
1307 xfs_agblock_t lbno; /* left block number */
1308 xfs_buf_t *lbp; /* left btree buffer */
1309 xfs_alloc_block_t *left; /* left btree block */
1310 xfs_mount_t *mp; /* mount structure */
1311 xfs_agblock_t nbno; /* new block number */
1312 xfs_buf_t *nbp; /* new (root) buffer */
1313 xfs_alloc_block_t *new; /* new (root) btree block */
1314 int nptr; /* new value for key index, 1 or 2 */
1315 xfs_agblock_t rbno; /* right block number */
1316 xfs_buf_t *rbp; /* right btree buffer */
1317 xfs_alloc_block_t *right; /* right btree block */
1318
1319 mp = cur->bc_mp;
1320 290
1321 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); 291 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1322 /* 292 XFS_BTREE_STATS_INC(cur, killroot);
1323 * Get a buffer from the freelist blocks, for the new root.
1324 */
1325 error = xfs_alloc_get_freelist(cur->bc_tp,
1326 cur->bc_private.a.agbp, &nbno, 1);
1327 if (error)
1328 return error;
1329 /*
1330 * None available, we fail.
1331 */
1332 if (nbno == NULLAGBLOCK) {
1333 *stat = 0;
1334 return 0;
1335 }
1336 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1337 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1338 0);
1339 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1340 /*
1341 * Set the root data in the a.g. freespace structure.
1342 */
1343 {
1344 xfs_agf_t *agf; /* a.g. freespace header */
1345 xfs_agnumber_t seqno;
1346 293
1347 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1348 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
1349 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
1350 seqno = be32_to_cpu(agf->agf_seqno);
1351 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1352 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1353 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1354 }
1355 /* 294 /*
1356 * At the previous root level there are now two blocks: the old 295 * Update the root pointer, decreasing the level by 1 and then
1357 * root, and the new block generated when it was split. 296 * free the old root.
1358 * We don't know which one the cursor is pointing at, so we
1359 * set up variables "left" and "right" for each case.
1360 */ 297 */
1361 lbp = cur->bc_bufs[cur->bc_nlevels - 1]; 298 xfs_allocbt_set_root(cur, newroot, -1);
1362 left = XFS_BUF_TO_ALLOC_BLOCK(lbp); 299 error = xfs_allocbt_free_block(cur, bp);
1363#ifdef DEBUG 300 if (error) {
1364 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) 301 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1365 return error; 302 return error;
1366#endif
1367 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
1368 /*
1369 * Our block is left, pick up the right block.
1370 */
1371 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1372 rbno = be32_to_cpu(left->bb_rightsib);
1373 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1374 cur->bc_private.a.agno, rbno, 0, &rbp,
1375 XFS_ALLOC_BTREE_REF)))
1376 return error;
1377 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1378 if ((error = xfs_btree_check_sblock(cur, right,
1379 cur->bc_nlevels - 1, rbp)))
1380 return error;
1381 nptr = 1;
1382 } else {
1383 /*
1384 * Our block is right, pick up the left block.
1385 */
1386 rbp = lbp;
1387 right = left;
1388 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1389 lbno = be32_to_cpu(right->bb_leftsib);
1390 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1391 cur->bc_private.a.agno, lbno, 0, &lbp,
1392 XFS_ALLOC_BTREE_REF)))
1393 return error;
1394 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1395 if ((error = xfs_btree_check_sblock(cur, left,
1396 cur->bc_nlevels - 1, lbp)))
1397 return error;
1398 nptr = 2;
1399 } 303 }
1400 /*
1401 * Fill in the new block's btree header and log it.
1402 */
1403 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1404 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1405 new->bb_numrecs = cpu_to_be16(2);
1406 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1407 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1408 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1409 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1410 /*
1411 * Fill in the key data in the new root.
1412 */
1413 {
1414 xfs_alloc_key_t *kp; /* btree key pointer */
1415 304
1416 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); 305 XFS_BTREE_STATS_INC(cur, free);
1417 if (be16_to_cpu(left->bb_level) > 0) {
1418 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
1419 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
1420 } else {
1421 xfs_alloc_rec_t *rp; /* btree record pointer */
1422 306
1423 rp = XFS_ALLOC_REC_ADDR(left, 1, cur); 307 xfs_btree_setbuf(cur, level, NULL);
1424 kp[0].ar_startblock = rp->ar_startblock; 308 cur->bc_nlevels--;
1425 kp[0].ar_blockcount = rp->ar_blockcount;
1426 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1427 kp[1].ar_startblock = rp->ar_startblock;
1428 kp[1].ar_blockcount = rp->ar_blockcount;
1429 }
1430 }
1431 xfs_alloc_log_keys(cur, nbp, 1, 2);
1432 /*
1433 * Fill in the pointer data in the new root.
1434 */
1435 {
1436 xfs_alloc_ptr_t *pp; /* btree address pointer */
1437 309
1438 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); 310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1439 pp[0] = cpu_to_be32(lbno);
1440 pp[1] = cpu_to_be32(rbno);
1441 }
1442 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1443 /*
1444 * Fix up the cursor.
1445 */
1446 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1447 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1448 cur->bc_nlevels++;
1449 *stat = 1;
1450 return 0; 311 return 0;
1451} 312}
1452 313
1453/*
1454 * Move 1 record right from cur/level if possible.
1455 * Update cur to reflect the new path.
1456 */
1457STATIC int /* error */
1458xfs_alloc_rshift(
1459 xfs_btree_cur_t *cur, /* btree cursor */
1460 int level, /* level to shift record on */
1461 int *stat) /* success/failure */
1462{
1463 int error; /* error return value */
1464 int i; /* loop index */
1465 xfs_alloc_key_t key; /* key value for leaf level upward */
1466 xfs_buf_t *lbp; /* buffer for left (current) block */
1467 xfs_alloc_block_t *left; /* left (current) btree block */
1468 xfs_buf_t *rbp; /* buffer for right neighbor block */
1469 xfs_alloc_block_t *right; /* right neighbor btree block */
1470 xfs_alloc_key_t *rkp; /* key pointer for right block */
1471 xfs_btree_cur_t *tcur; /* temporary cursor */
1472
1473 /*
1474 * Set up variables for this block as "left".
1475 */
1476 lbp = cur->bc_bufs[level];
1477 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1478#ifdef DEBUG
1479 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1480 return error;
1481#endif
1482 /*
1483 * If we've got no right sibling then we can't shift an entry right.
1484 */
1485 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1486 *stat = 0;
1487 return 0;
1488 }
1489 /*
1490 * If the cursor entry is the one that would be moved, don't
1491 * do it... it's too complicated.
1492 */
1493 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1494 *stat = 0;
1495 return 0;
1496 }
1497 /*
1498 * Set up the right neighbor as "right".
1499 */
1500 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1501 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1502 0, &rbp, XFS_ALLOC_BTREE_REF)))
1503 return error;
1504 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1505 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1506 return error;
1507 /*
1508 * If it's full, it can't take another entry.
1509 */
1510 if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1511 *stat = 0;
1512 return 0;
1513 }
1514 /*
1515 * Make a hole at the start of the right neighbor block, then
1516 * copy the last left block entry to the hole.
1517 */
1518 if (level > 0) {
1519 xfs_alloc_key_t *lkp; /* key pointer for left block */
1520 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1521 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1522
1523 lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1524 lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1525 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1526 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1527#ifdef DEBUG 314#ifdef DEBUG
1528 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { 315STATIC int
1529 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) 316xfs_allocbt_keys_inorder(
1530 return error; 317 struct xfs_btree_cur *cur,
1531 } 318 union xfs_btree_key *k1,
1532#endif 319 union xfs_btree_key *k2)
1533 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); 320{
1534 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); 321 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1535#ifdef DEBUG 322 return be32_to_cpu(k1->alloc.ar_startblock) <
1536 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) 323 be32_to_cpu(k2->alloc.ar_startblock);
1537 return error;
1538#endif
1539 *rkp = *lkp;
1540 *rpp = *lpp;
1541 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1542 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1543 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1544 } else { 324 } else {
1545 xfs_alloc_rec_t *lrp; /* record pointer for left block */ 325 return be32_to_cpu(k1->alloc.ar_blockcount) <
1546 xfs_alloc_rec_t *rrp; /* record pointer for right block */ 326 be32_to_cpu(k2->alloc.ar_blockcount) ||
1547 327 (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
1548 lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); 328 be32_to_cpu(k1->alloc.ar_startblock) <
1549 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); 329 be32_to_cpu(k2->alloc.ar_startblock));
1550 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1551 *rrp = *lrp;
1552 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1553 key.ar_startblock = rrp->ar_startblock;
1554 key.ar_blockcount = rrp->ar_blockcount;
1555 rkp = &key;
1556 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1557 } 330 }
1558 /*
1559 * Decrement and log left's numrecs, bump and log right's numrecs.
1560 */
1561 be16_add_cpu(&left->bb_numrecs, -1);
1562 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1563 be16_add_cpu(&right->bb_numrecs, 1);
1564 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1565 /*
1566 * Using a temporary cursor, update the parent key values of the
1567 * block on the right.
1568 */
1569 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1570 return error;
1571 i = xfs_btree_lastrec(tcur, level);
1572 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1573 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1574 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1575 goto error0;
1576 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1577 *stat = 1;
1578 return 0;
1579error0:
1580 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1581 return error;
1582} 331}
1583 332
1584/* 333STATIC int
1585 * Split cur/level block in half. 334xfs_allocbt_recs_inorder(
1586 * Return new block number and its first record (to be inserted into parent). 335 struct xfs_btree_cur *cur,
1587 */ 336 union xfs_btree_rec *r1,
1588STATIC int /* error */ 337 union xfs_btree_rec *r2)
1589xfs_alloc_split(
1590 xfs_btree_cur_t *cur, /* btree cursor */
1591 int level, /* level to split */
1592 xfs_agblock_t *bnop, /* output: block number allocated */
1593 xfs_alloc_key_t *keyp, /* output: first key of new block */
1594 xfs_btree_cur_t **curp, /* output: new cursor */
1595 int *stat) /* success/failure */
1596{ 338{
1597 int error; /* error return value */ 339 if (cur->bc_btnum == XFS_BTNUM_BNO) {
1598 int i; /* loop index/record number */ 340 return be32_to_cpu(r1->alloc.ar_startblock) +
1599 xfs_agblock_t lbno; /* left (current) block number */ 341 be32_to_cpu(r1->alloc.ar_blockcount) <=
1600 xfs_buf_t *lbp; /* buffer for left block */ 342 be32_to_cpu(r2->alloc.ar_startblock);
1601 xfs_alloc_block_t *left; /* left (current) btree block */ 343 } else {
1602 xfs_agblock_t rbno; /* right (new) block number */ 344 return be32_to_cpu(r1->alloc.ar_blockcount) <
1603 xfs_buf_t *rbp; /* buffer for right block */ 345 be32_to_cpu(r2->alloc.ar_blockcount) ||
1604 xfs_alloc_block_t *right; /* right (new) btree block */ 346 (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
1605 347 be32_to_cpu(r1->alloc.ar_startblock) <
1606 /* 348 be32_to_cpu(r2->alloc.ar_startblock));
1607 * Allocate the new block from the freelist.
1608 * If we can't do it, we're toast. Give up.
1609 */
1610 error = xfs_alloc_get_freelist(cur->bc_tp,
1611 cur->bc_private.a.agbp, &rbno, 1);
1612 if (error)
1613 return error;
1614 if (rbno == NULLAGBLOCK) {
1615 *stat = 0;
1616 return 0;
1617 }
1618 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1619 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1620 rbno, 0);
1621 /*
1622 * Set up the new block as "right".
1623 */
1624 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1625 /*
1626 * "Left" is the current (according to the cursor) block.
1627 */
1628 lbp = cur->bc_bufs[level];
1629 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1630#ifdef DEBUG
1631 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1632 return error;
1633#endif
1634 /*
1635 * Fill in the btree header for the new block.
1636 */
1637 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1638 right->bb_level = left->bb_level;
1639 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1640 /*
1641 * Make sure that if there's an odd number of entries now, that
1642 * each new block will have the same number of entries.
1643 */
1644 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1645 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1646 be16_add_cpu(&right->bb_numrecs, 1);
1647 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1648 /*
1649 * For non-leaf blocks, copy keys and addresses over to the new block.
1650 */
1651 if (level > 0) {
1652 xfs_alloc_key_t *lkp; /* left btree key pointer */
1653 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1654 xfs_alloc_key_t *rkp; /* right btree key pointer */
1655 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1656
1657 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1658 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1659 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1660 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1661#ifdef DEBUG
1662 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1663 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1664 return error;
1665 }
1666#endif
1667 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1668 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1669 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1670 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1671 *keyp = *rkp;
1672 } 349 }
1673 /* 350}
1674 * For leaf blocks, copy records over to the new block. 351#endif /* DEBUG */
1675 */
1676 else {
1677 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1678 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1679 352
1680 lrp = XFS_ALLOC_REC_ADDR(left, i, cur); 353#ifdef XFS_BTREE_TRACE
1681 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); 354ktrace_t *xfs_allocbt_trace_buf;
1682 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1683 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1684 keyp->ar_startblock = rrp->ar_startblock;
1685 keyp->ar_blockcount = rrp->ar_blockcount;
1686 }
1687 /*
1688 * Find the left block number by looking in the buffer.
1689 * Adjust numrecs, sibling pointers.
1690 */
1691 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1692 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1693 right->bb_rightsib = left->bb_rightsib;
1694 left->bb_rightsib = cpu_to_be32(rbno);
1695 right->bb_leftsib = cpu_to_be32(lbno);
1696 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1697 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1698 /*
1699 * If there's a block to the new block's right, make that block
1700 * point back to right instead of to left.
1701 */
1702 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1703 xfs_alloc_block_t *rrblock; /* rr btree block */
1704 xfs_buf_t *rrbp; /* buffer for rrblock */
1705 355
1706 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 356STATIC void
1707 cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0, 357xfs_allocbt_trace_enter(
1708 &rrbp, XFS_ALLOC_BTREE_REF))) 358 struct xfs_btree_cur *cur,
1709 return error; 359 const char *func,
1710 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); 360 char *s,
1711 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 361 int type,
1712 return error; 362 int line,
1713 rrblock->bb_leftsib = cpu_to_be32(rbno); 363 __psunsigned_t a0,
1714 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); 364 __psunsigned_t a1,
1715 } 365 __psunsigned_t a2,
1716 /* 366 __psunsigned_t a3,
1717 * If the cursor is really in the right block, move it there. 367 __psunsigned_t a4,
1718 * If it's just pointing past the last entry in left, then we'll 368 __psunsigned_t a5,
1719 * insert there, so don't change anything in that case. 369 __psunsigned_t a6,
1720 */ 370 __psunsigned_t a7,
1721 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { 371 __psunsigned_t a8,
1722 xfs_btree_setbuf(cur, level, rbp); 372 __psunsigned_t a9,
1723 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); 373 __psunsigned_t a10)
1724 } 374{
1725 /* 375 ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
1726 * If there are more levels, we'll need another cursor which refers to 376 (void *)func, (void *)s, NULL, (void *)cur,
1727 * the right block, no matter where this cursor was. 377 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1728 */ 378 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1729 if (level + 1 < cur->bc_nlevels) { 379 (void *)a8, (void *)a9, (void *)a10);
1730 if ((error = xfs_btree_dup_cursor(cur, curp)))
1731 return error;
1732 (*curp)->bc_ptrs[level + 1]++;
1733 }
1734 *bnop = rbno;
1735 *stat = 1;
1736 return 0;
1737} 380}
1738 381
1739/* 382STATIC void
1740 * Update keys at all levels from here to the root along the cursor's path. 383xfs_allocbt_trace_cursor(
1741 */ 384 struct xfs_btree_cur *cur,
1742STATIC int /* error */ 385 __uint32_t *s0,
1743xfs_alloc_updkey( 386 __uint64_t *l0,
1744 xfs_btree_cur_t *cur, /* btree cursor */ 387 __uint64_t *l1)
1745 xfs_alloc_key_t *keyp, /* new key value to update to */
1746 int level) /* starting level for update */
1747{ 388{
1748 int ptr; /* index of key in block */ 389 *s0 = cur->bc_private.a.agno;
1749 390 *l0 = cur->bc_rec.a.ar_startblock;
1750 /* 391 *l1 = cur->bc_rec.a.ar_blockcount;
1751 * Go up the tree from this level toward the root.
1752 * At each level, update the key value to the value input.
1753 * Stop when we reach a level where the cursor isn't pointing
1754 * at the first entry in the block.
1755 */
1756 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1757 xfs_alloc_block_t *block; /* btree block */
1758 xfs_buf_t *bp; /* buffer for block */
1759#ifdef DEBUG
1760 int error; /* error return value */
1761#endif
1762 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1763
1764 bp = cur->bc_bufs[level];
1765 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1766#ifdef DEBUG
1767 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1768 return error;
1769#endif
1770 ptr = cur->bc_ptrs[level];
1771 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1772 *kp = *keyp;
1773 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1774 }
1775 return 0;
1776} 392}
1777 393
1778/* 394STATIC void
1779 * Externally visible routines. 395xfs_allocbt_trace_key(
1780 */ 396 struct xfs_btree_cur *cur,
1781 397 union xfs_btree_key *key,
1782/* 398 __uint64_t *l0,
1783 * Decrement cursor by one record at the level. 399 __uint64_t *l1)
1784 * For nonzero levels the leaf-ward information is untouched.
1785 */
1786int /* error */
1787xfs_alloc_decrement(
1788 xfs_btree_cur_t *cur, /* btree cursor */
1789 int level, /* level in btree, 0 is leaf */
1790 int *stat) /* success/failure */
1791{ 400{
1792 xfs_alloc_block_t *block; /* btree block */ 401 *l0 = be32_to_cpu(key->alloc.ar_startblock);
1793 int error; /* error return value */ 402 *l1 = be32_to_cpu(key->alloc.ar_blockcount);
1794 int lev; /* btree level */
1795
1796 ASSERT(level < cur->bc_nlevels);
1797 /*
1798 * Read-ahead to the left at this level.
1799 */
1800 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1801 /*
1802 * Decrement the ptr at this level. If we're still in the block
1803 * then we're done.
1804 */
1805 if (--cur->bc_ptrs[level] > 0) {
1806 *stat = 1;
1807 return 0;
1808 }
1809 /*
1810 * Get a pointer to the btree block.
1811 */
1812 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1813#ifdef DEBUG
1814 if ((error = xfs_btree_check_sblock(cur, block, level,
1815 cur->bc_bufs[level])))
1816 return error;
1817#endif
1818 /*
1819 * If we just went off the left edge of the tree, return failure.
1820 */
1821 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1822 *stat = 0;
1823 return 0;
1824 }
1825 /*
1826 * March up the tree decrementing pointers.
1827 * Stop when we don't go off the left edge of a block.
1828 */
1829 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1830 if (--cur->bc_ptrs[lev] > 0)
1831 break;
1832 /*
1833 * Read-ahead the left block, we're going to read it
1834 * in the next loop.
1835 */
1836 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1837 }
1838 /*
1839 * If we went off the root then we are seriously confused.
1840 */
1841 ASSERT(lev < cur->bc_nlevels);
1842 /*
1843 * Now walk back down the tree, fixing up the cursor's buffer
1844 * pointers and key numbers.
1845 */
1846 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1847 xfs_agblock_t agbno; /* block number of btree block */
1848 xfs_buf_t *bp; /* buffer pointer for block */
1849
1850 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1851 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1852 cur->bc_private.a.agno, agbno, 0, &bp,
1853 XFS_ALLOC_BTREE_REF)))
1854 return error;
1855 lev--;
1856 xfs_btree_setbuf(cur, lev, bp);
1857 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1858 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1859 return error;
1860 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1861 }
1862 *stat = 1;
1863 return 0;
1864} 403}
1865 404
1866/* 405STATIC void
1867 * Delete the record pointed to by cur. 406xfs_allocbt_trace_record(
1868 * The cursor refers to the place where the record was (could be inserted) 407 struct xfs_btree_cur *cur,
1869 * when the operation returns. 408 union xfs_btree_rec *rec,
1870 */ 409 __uint64_t *l0,
1871int /* error */ 410 __uint64_t *l1,
1872xfs_alloc_delete( 411 __uint64_t *l2)
1873 xfs_btree_cur_t *cur, /* btree cursor */
1874 int *stat) /* success/failure */
1875{ 412{
1876 int error; /* error return value */ 413 *l0 = be32_to_cpu(rec->alloc.ar_startblock);
1877 int i; /* result code */ 414 *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
1878 int level; /* btree level */ 415 *l2 = 0;
1879
1880 /*
1881 * Go up the tree, starting at leaf level.
1882 * If 2 is returned then a join was done; go to the next level.
1883 * Otherwise we are done.
1884 */
1885 for (level = 0, i = 2; i == 2; level++) {
1886 if ((error = xfs_alloc_delrec(cur, level, &i)))
1887 return error;
1888 }
1889 if (i == 0) {
1890 for (level = 1; level < cur->bc_nlevels; level++) {
1891 if (cur->bc_ptrs[level] == 0) {
1892 if ((error = xfs_alloc_decrement(cur, level, &i)))
1893 return error;
1894 break;
1895 }
1896 }
1897 }
1898 *stat = i;
1899 return 0;
1900} 416}
417#endif /* XFS_BTREE_TRACE */
418
419static const struct xfs_btree_ops xfs_allocbt_ops = {
420 .rec_len = sizeof(xfs_alloc_rec_t),
421 .key_len = sizeof(xfs_alloc_key_t),
422
423 .dup_cursor = xfs_allocbt_dup_cursor,
424 .set_root = xfs_allocbt_set_root,
425 .kill_root = xfs_allocbt_kill_root,
426 .alloc_block = xfs_allocbt_alloc_block,
427 .free_block = xfs_allocbt_free_block,
428 .update_lastrec = xfs_allocbt_update_lastrec,
429 .get_minrecs = xfs_allocbt_get_minrecs,
430 .get_maxrecs = xfs_allocbt_get_maxrecs,
431 .init_key_from_rec = xfs_allocbt_init_key_from_rec,
432 .init_rec_from_key = xfs_allocbt_init_rec_from_key,
433 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
434 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
435 .key_diff = xfs_allocbt_key_diff,
1901 436
1902/*
1903 * Get the data from the pointed-to record.
1904 */
1905int /* error */
1906xfs_alloc_get_rec(
1907 xfs_btree_cur_t *cur, /* btree cursor */
1908 xfs_agblock_t *bno, /* output: starting block of extent */
1909 xfs_extlen_t *len, /* output: length of extent */
1910 int *stat) /* output: success/failure */
1911{
1912 xfs_alloc_block_t *block; /* btree block */
1913#ifdef DEBUG 437#ifdef DEBUG
1914 int error; /* error return value */ 438 .keys_inorder = xfs_allocbt_keys_inorder,
439 .recs_inorder = xfs_allocbt_recs_inorder,
1915#endif 440#endif
1916 int ptr; /* record number */
1917 441
1918 ptr = cur->bc_ptrs[0]; 442#ifdef XFS_BTREE_TRACE
1919 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); 443 .trace_enter = xfs_allocbt_trace_enter,
1920#ifdef DEBUG 444 .trace_cursor = xfs_allocbt_trace_cursor,
1921 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) 445 .trace_key = xfs_allocbt_trace_key,
1922 return error; 446 .trace_record = xfs_allocbt_trace_record,
1923#endif 447#endif
1924 /* 448};
1925 * Off the right end or left end, return failure.
1926 */
1927 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1928 *stat = 0;
1929 return 0;
1930 }
1931 /*
1932 * Point to the record and extract its data.
1933 */
1934 {
1935 xfs_alloc_rec_t *rec; /* record data */
1936
1937 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1938 *bno = be32_to_cpu(rec->ar_startblock);
1939 *len = be32_to_cpu(rec->ar_blockcount);
1940 }
1941 *stat = 1;
1942 return 0;
1943}
1944 449
1945/* 450/*
1946 * Increment cursor by one record at the level. 451 * Allocate a new allocation btree cursor.
1947 * For nonzero levels the leaf-ward information is untouched.
1948 */ 452 */
1949int /* error */ 453struct xfs_btree_cur * /* new alloc btree cursor */
1950xfs_alloc_increment( 454xfs_allocbt_init_cursor(
1951 xfs_btree_cur_t *cur, /* btree cursor */ 455 struct xfs_mount *mp, /* file system mount point */
1952 int level, /* level in btree, 0 is leaf */ 456 struct xfs_trans *tp, /* transaction pointer */
1953 int *stat) /* success/failure */ 457 struct xfs_buf *agbp, /* buffer for agf structure */
458 xfs_agnumber_t agno, /* allocation group number */
459 xfs_btnum_t btnum) /* btree identifier */
1954{ 460{
1955 xfs_alloc_block_t *block; /* btree block */ 461 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
1956 xfs_buf_t *bp; /* tree block buffer */ 462 struct xfs_btree_cur *cur;
1957 int error; /* error return value */
1958 int lev; /* btree level */
1959
1960 ASSERT(level < cur->bc_nlevels);
1961 /*
1962 * Read-ahead to the right at this level.
1963 */
1964 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1965 /*
1966 * Get a pointer to the btree block.
1967 */
1968 bp = cur->bc_bufs[level];
1969 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1970#ifdef DEBUG
1971 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1972 return error;
1973#endif
1974 /*
1975 * Increment the ptr at this level. If we're still in the block
1976 * then we're done.
1977 */
1978 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1979 *stat = 1;
1980 return 0;
1981 }
1982 /*
1983 * If we just went off the right edge of the tree, return failure.
1984 */
1985 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1986 *stat = 0;
1987 return 0;
1988 }
1989 /*
1990 * March up the tree incrementing pointers.
1991 * Stop when we don't go off the right edge of a block.
1992 */
1993 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1994 bp = cur->bc_bufs[lev];
1995 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1996#ifdef DEBUG
1997 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1998 return error;
1999#endif
2000 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2001 break;
2002 /*
2003 * Read-ahead the right block, we're going to read it
2004 * in the next loop.
2005 */
2006 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2007 }
2008 /*
2009 * If we went off the root then we are seriously confused.
2010 */
2011 ASSERT(lev < cur->bc_nlevels);
2012 /*
2013 * Now walk back down the tree, fixing up the cursor's buffer
2014 * pointers and key numbers.
2015 */
2016 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2017 lev > level; ) {
2018 xfs_agblock_t agbno; /* block number of btree block */
2019 463
2020 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 464 ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
2021 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2022 cur->bc_private.a.agno, agbno, 0, &bp,
2023 XFS_ALLOC_BTREE_REF)))
2024 return error;
2025 lev--;
2026 xfs_btree_setbuf(cur, lev, bp);
2027 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2028 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2029 return error;
2030 cur->bc_ptrs[lev] = 1;
2031 }
2032 *stat = 1;
2033 return 0;
2034}
2035 465
2036/* 466 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
2037 * Insert the current record at the point referenced by cur.
2038 * The cursor may be inconsistent on return if splits have been done.
2039 */
2040int /* error */
2041xfs_alloc_insert(
2042 xfs_btree_cur_t *cur, /* btree cursor */
2043 int *stat) /* success/failure */
2044{
2045 int error; /* error return value */
2046 int i; /* result value, 0 for failure */
2047 int level; /* current level number in btree */
2048 xfs_agblock_t nbno; /* new block number (split result) */
2049 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2050 xfs_alloc_rec_t nrec; /* record being inserted this level */
2051 xfs_btree_cur_t *pcur; /* previous level's cursor */
2052 467
2053 level = 0; 468 cur->bc_tp = tp;
2054 nbno = NULLAGBLOCK; 469 cur->bc_mp = mp;
2055 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); 470 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
2056 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); 471 cur->bc_btnum = btnum;
2057 ncur = NULL; 472 cur->bc_blocklog = mp->m_sb.sb_blocklog;
2058 pcur = cur;
2059 /*
2060 * Loop going up the tree, starting at the leaf level.
2061 * Stop when we don't get a split block, that must mean that
2062 * the insert is finished with this level.
2063 */
2064 do {
2065 /*
2066 * Insert nrec/nbno into this level of the tree.
2067 * Note if we fail, nbno will be null.
2068 */
2069 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2070 &i))) {
2071 if (pcur != cur)
2072 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2073 return error;
2074 }
2075 /*
2076 * See if the cursor we just used is trash.
2077 * Can't trash the caller's cursor, but otherwise we should
2078 * if ncur is a new cursor or we're about to be done.
2079 */
2080 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2081 cur->bc_nlevels = pcur->bc_nlevels;
2082 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2083 }
2084 /*
2085 * If we got a new cursor, switch to it.
2086 */
2087 if (ncur) {
2088 pcur = ncur;
2089 ncur = NULL;
2090 }
2091 } while (nbno != NULLAGBLOCK);
2092 *stat = i;
2093 return 0;
2094}
2095 473
2096/* 474 cur->bc_ops = &xfs_allocbt_ops;
2097 * Lookup the record equal to [bno, len] in the btree given by cur. 475 if (btnum == XFS_BTNUM_CNT)
2098 */ 476 cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
2099int /* error */
2100xfs_alloc_lookup_eq(
2101 xfs_btree_cur_t *cur, /* btree cursor */
2102 xfs_agblock_t bno, /* starting block of extent */
2103 xfs_extlen_t len, /* length of extent */
2104 int *stat) /* success/failure */
2105{
2106 cur->bc_rec.a.ar_startblock = bno;
2107 cur->bc_rec.a.ar_blockcount = len;
2108 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2109}
2110 477
2111/* 478 cur->bc_private.a.agbp = agbp;
2112 * Lookup the first record greater than or equal to [bno, len] 479 cur->bc_private.a.agno = agno;
2113 * in the btree given by cur.
2114 */
2115int /* error */
2116xfs_alloc_lookup_ge(
2117 xfs_btree_cur_t *cur, /* btree cursor */
2118 xfs_agblock_t bno, /* starting block of extent */
2119 xfs_extlen_t len, /* length of extent */
2120 int *stat) /* success/failure */
2121{
2122 cur->bc_rec.a.ar_startblock = bno;
2123 cur->bc_rec.a.ar_blockcount = len;
2124 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2125}
2126 480
2127/* 481 return cur;
2128 * Lookup the first record less than or equal to [bno, len]
2129 * in the btree given by cur.
2130 */
2131int /* error */
2132xfs_alloc_lookup_le(
2133 xfs_btree_cur_t *cur, /* btree cursor */
2134 xfs_agblock_t bno, /* starting block of extent */
2135 xfs_extlen_t len, /* length of extent */
2136 int *stat) /* success/failure */
2137{
2138 cur->bc_rec.a.ar_startblock = bno;
2139 cur->bc_rec.a.ar_blockcount = len;
2140 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2141} 482}
2142 483
2143/* 484/*
2144 * Update the record referred to by cur, to the value given by [bno, len]. 485 * Calculate number of records in an alloc btree block.
2145 * This either works (return 0) or gets an EFSCORRUPTED error.
2146 */ 486 */
2147int /* error */ 487int
2148xfs_alloc_update( 488xfs_allocbt_maxrecs(
2149 xfs_btree_cur_t *cur, /* btree cursor */ 489 struct xfs_mount *mp,
2150 xfs_agblock_t bno, /* starting block of extent */ 490 int blocklen,
2151 xfs_extlen_t len) /* length of extent */ 491 int leaf)
2152{ 492{
2153 xfs_alloc_block_t *block; /* btree block to update */ 493 blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
2154 int error; /* error return value */
2155 int ptr; /* current record number (updating) */
2156 494
2157 ASSERT(len > 0); 495 if (leaf)
2158 /* 496 return blocklen / sizeof(xfs_alloc_rec_t);
2159 * Pick up the a.g. freelist struct and the current block. 497 return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
2160 */
2161 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2162#ifdef DEBUG
2163 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2164 return error;
2165#endif
2166 /*
2167 * Get the address of the rec to be updated.
2168 */
2169 ptr = cur->bc_ptrs[0];
2170 {
2171 xfs_alloc_rec_t *rp; /* pointer to updated record */
2172
2173 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2174 /*
2175 * Fill in the new contents and log them.
2176 */
2177 rp->ar_startblock = cpu_to_be32(bno);
2178 rp->ar_blockcount = cpu_to_be32(len);
2179 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2180 }
2181 /*
2182 * If it's the by-size btree and it's the last leaf block and
2183 * it's the last record... then update the size of the longest
2184 * extent in the a.g., which we cache in the a.g. freelist header.
2185 */
2186 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2187 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
2188 ptr == be16_to_cpu(block->bb_numrecs)) {
2189 xfs_agf_t *agf; /* a.g. freespace header */
2190 xfs_agnumber_t seqno;
2191
2192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2193 seqno = be32_to_cpu(agf->agf_seqno);
2194 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2195 agf->agf_longest = cpu_to_be32(len);
2196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2197 XFS_AGF_LONGEST);
2198 }
2199 /*
2200 * Updating first record in leaf. Pass new key value up to our parent.
2201 */
2202 if (ptr == 1) {
2203 xfs_alloc_key_t key; /* key containing [bno, len] */
2204
2205 key.ar_startblock = cpu_to_be32(bno);
2206 key.ar_blockcount = cpu_to_be32(len);
2207 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2208 return error;
2209 }
2210 return 0;
2211} 498}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 5bd1a2c8bd0..a6caa0022c9 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,7 +24,6 @@
24 24
25struct xfs_buf; 25struct xfs_buf;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27struct xfs_btree_sblock;
28struct xfs_mount; 27struct xfs_mount;
29 28
30/* 29/*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
50 49
51/* btree pointer type */ 50/* btree pointer type */
52typedef __be32 xfs_alloc_ptr_t; 51typedef __be32 xfs_alloc_ptr_t;
53/* btree block header type */
54typedef struct xfs_btree_sblock xfs_alloc_block_t;
55
56#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
57
58/*
59 * Real block structures have a size equal to the disk block size.
60 */
61#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
62#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
63 52
64/* 53/*
65 * Minimum and maximum blocksize and sectorsize. 54 * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t;
83#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) 72#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
84 73
85/* 74/*
86 * Record, key, and pointer address macros for btree blocks. 75 * Btree block header size depends on a superblock flag.
87 */ 76 *
88#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ 77 * (not quite yet, but soon)
89 XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
90
91#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
92 XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
93
94#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
95 XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
96
97/*
98 * Decrement cursor by one record at the level.
99 * For nonzero levels the leaf-ward information is untouched.
100 */
101extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
102
103/*
104 * Delete the record pointed to by cur.
105 * The cursor refers to the place where the record was (could be inserted)
106 * when the operation returns.
107 */
108extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
109
110/*
111 * Get the data from the pointed-to record.
112 */
113extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
114 xfs_extlen_t *len, int *stat);
115
116/*
117 * Increment cursor by one record at the level.
118 * For nonzero levels the leaf-ward information is untouched.
119 */
120extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
121
122/*
123 * Insert the current record at the point referenced by cur.
124 * The cursor may be inconsistent on return if splits have been done.
125 */
126extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
127
128/*
129 * Lookup the record equal to [bno, len] in the btree given by cur.
130 */
131extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
132 xfs_extlen_t len, int *stat);
133
134/*
135 * Lookup the first record greater than or equal to [bno, len]
136 * in the btree given by cur.
137 */
138extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
139 xfs_extlen_t len, int *stat);
140
141/*
142 * Lookup the first record less than or equal to [bno, len]
143 * in the btree given by cur.
144 */ 78 */
145extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, 79#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
146 xfs_extlen_t len, int *stat);
147 80
148/* 81/*
149 * Update the record referred to by cur, to the value given by [bno, len]. 82 * Record, key, and pointer address macros for btree blocks.
150 * This either works (return 0) or gets an EFSCORRUPTED error. 83 *
151 */ 84 * (note that some of these may appear unused, but they are used in userspace)
152extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno, 85 */
153 xfs_extlen_t len); 86#define XFS_ALLOC_REC_ADDR(mp, block, index) \
87 ((xfs_alloc_rec_t *) \
88 ((char *)(block) + \
89 XFS_ALLOC_BLOCK_LEN(mp) + \
90 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
91
92#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
93 ((xfs_alloc_key_t *) \
94 ((char *)(block) + \
95 XFS_ALLOC_BLOCK_LEN(mp) + \
96 ((index) - 1) * sizeof(xfs_alloc_key_t)))
97
98#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
99 ((xfs_alloc_ptr_t *) \
100 ((char *)(block) + \
101 XFS_ALLOC_BLOCK_LEN(mp) + \
102 (maxrecs) * sizeof(xfs_alloc_key_t) + \
103 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
104
105extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
106 struct xfs_trans *, struct xfs_buf *,
107 xfs_agnumber_t, xfs_btnum_t);
108extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
154 109
155#endif /* __XFS_ALLOC_BTREE_H__ */ 110#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 0b3b5efe848..53d5e70d136 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -41,21 +41,36 @@
41#endif 41#endif
42 42
43#ifdef XFS_NATIVE_HOST 43#ifdef XFS_NATIVE_HOST
44#define cpu_to_be16(val) ((__be16)(val)) 44#define cpu_to_be16(val) ((__force __be16)(__u16)(val))
45#define cpu_to_be32(val) ((__be32)(val)) 45#define cpu_to_be32(val) ((__force __be32)(__u32)(val))
46#define cpu_to_be64(val) ((__be64)(val)) 46#define cpu_to_be64(val) ((__force __be64)(__u64)(val))
47#define be16_to_cpu(val) ((__uint16_t)(val)) 47#define be16_to_cpu(val) ((__force __u16)(__be16)(val))
48#define be32_to_cpu(val) ((__uint32_t)(val)) 48#define be32_to_cpu(val) ((__force __u32)(__be32)(val))
49#define be64_to_cpu(val) ((__uint64_t)(val)) 49#define be64_to_cpu(val) ((__force __u64)(__be64)(val))
50#else 50#else
51#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) 51#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val)))
52#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) 52#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val)))
53#define cpu_to_be64(val) (__swab64((__uint64_t)(val))) 53#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val)))
54#define be16_to_cpu(val) (__swab16((__be16)(val))) 54#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val)))
55#define be32_to_cpu(val) (__swab32((__be32)(val))) 55#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val)))
56#define be64_to_cpu(val) (__swab64((__be64)(val))) 56#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val)))
57#endif 57#endif
58 58
59static inline void be16_add_cpu(__be16 *a, __s16 b)
60{
61 *a = cpu_to_be16(be16_to_cpu(*a) + b);
62}
63
64static inline void be32_add_cpu(__be32 *a, __s32 b)
65{
66 *a = cpu_to_be32(be32_to_cpu(*a) + b);
67}
68
69static inline void be64_add_cpu(__be64 *a, __s64 b)
70{
71 *a = cpu_to_be64(be64_to_cpu(*a) + b);
72}
73
59#endif /* __KERNEL__ */ 74#endif /* __KERNEL__ */
60 75
61/* do we need conversion? */ 76/* do we need conversion? */
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 8e0e463dae2..bca7b243c31 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
61/* Get low bit set out of 32-bit argument, -1 if none set */ 61/* Get low bit set out of 32-bit argument, -1 if none set */
62static inline int xfs_lowbit32(__uint32_t v) 62static inline int xfs_lowbit32(__uint32_t v)
63{ 63{
64 unsigned long t = v; 64 return ffs(v) - 1;
65 return (v) ? find_first_bit(&t, 32) : -1;
66} 65}
67 66
68/* Get low bit set out of 64-bit argument, -1 if none set */ 67/* Get low bit set out of 64-bit argument, -1 if none set */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a1aab9275d5..138308e70d1 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
393 393
394STATIC void 394STATIC void
395xfs_bmap_disk_count_leaves( 395xfs_bmap_disk_count_leaves(
396 xfs_extnum_t idx, 396 struct xfs_mount *mp,
397 xfs_bmbt_block_t *block, 397 struct xfs_btree_block *block,
398 int numrecs, 398 int numrecs,
399 int *count); 399 int *count);
400 400
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
402 * Bmap internal routines. 402 * Bmap internal routines.
403 */ 403 */
404 404
405STATIC int /* error */
406xfs_bmbt_lookup_eq(
407 struct xfs_btree_cur *cur,
408 xfs_fileoff_t off,
409 xfs_fsblock_t bno,
410 xfs_filblks_t len,
411 int *stat) /* success/failure */
412{
413 cur->bc_rec.b.br_startoff = off;
414 cur->bc_rec.b.br_startblock = bno;
415 cur->bc_rec.b.br_blockcount = len;
416 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
417}
418
419STATIC int /* error */
420xfs_bmbt_lookup_ge(
421 struct xfs_btree_cur *cur,
422 xfs_fileoff_t off,
423 xfs_fsblock_t bno,
424 xfs_filblks_t len,
425 int *stat) /* success/failure */
426{
427 cur->bc_rec.b.br_startoff = off;
428 cur->bc_rec.b.br_startblock = bno;
429 cur->bc_rec.b.br_blockcount = len;
430 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
431}
432
433/*
434* Update the record referred to by cur to the value given
435 * by [off, bno, len, state].
436 * This either works (return 0) or gets an EFSCORRUPTED error.
437 */
438STATIC int
439xfs_bmbt_update(
440 struct xfs_btree_cur *cur,
441 xfs_fileoff_t off,
442 xfs_fsblock_t bno,
443 xfs_filblks_t len,
444 xfs_exntst_t state)
445{
446 union xfs_btree_rec rec;
447
448 xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
449 return xfs_btree_update(cur, &rec);
450}
451
405/* 452/*
406 * Called from xfs_bmap_add_attrfork to handle btree format files. 453 * Called from xfs_bmap_add_attrfork to handle btree format files.
407 */ 454 */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
422 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) 469 if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
423 *flags |= XFS_ILOG_DBROOT; 470 *flags |= XFS_ILOG_DBROOT;
424 else { 471 else {
425 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 472 cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
426 XFS_DATA_FORK);
427 cur->bc_private.b.flist = flist; 473 cur->bc_private.b.flist = flist;
428 cur->bc_private.b.firstblock = *firstblock; 474 cur->bc_private.b.firstblock = *firstblock;
429 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) 475 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
430 goto error0; 476 goto error0;
431 /* must be at least one entry */ 477 /* must be at least one entry */
432 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); 478 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
433 if ((error = xfs_bmbt_newroot(cur, flags, &stat))) 479 if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
434 goto error0; 480 goto error0;
435 if (stat == 0) { 481 if (stat == 0) {
436 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 482 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
818 RIGHT.br_blockcount, &i))) 864 RIGHT.br_blockcount, &i)))
819 goto done; 865 goto done;
820 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 866 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
821 if ((error = xfs_bmbt_delete(cur, &i))) 867 if ((error = xfs_btree_delete(cur, &i)))
822 goto done; 868 goto done;
823 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 869 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
824 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 870 if ((error = xfs_btree_decrement(cur, 0, &i)))
825 goto done; 871 goto done;
826 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 872 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
827 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 873 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
931 goto done; 977 goto done;
932 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 978 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
933 cur->bc_rec.b.br_state = XFS_EXT_NORM; 979 cur->bc_rec.b.br_state = XFS_EXT_NORM;
934 if ((error = xfs_bmbt_insert(cur, &i))) 980 if ((error = xfs_btree_insert(cur, &i)))
935 goto done; 981 goto done;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 982 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
937 } 983 }
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
1007 goto done; 1053 goto done;
1008 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1054 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1009 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1055 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1010 if ((error = xfs_bmbt_insert(cur, &i))) 1056 if ((error = xfs_btree_insert(cur, &i)))
1011 goto done; 1057 goto done;
1012 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1058 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1013 } 1059 }
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
1097 goto done; 1143 goto done;
1098 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1144 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1099 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1145 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1100 if ((error = xfs_bmbt_insert(cur, &i))) 1146 if ((error = xfs_btree_insert(cur, &i)))
1101 goto done; 1147 goto done;
1102 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1148 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1103 } 1149 }
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
1152 goto done; 1198 goto done;
1153 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1199 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1154 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1200 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1155 if ((error = xfs_bmbt_insert(cur, &i))) 1201 if ((error = xfs_btree_insert(cur, &i)))
1156 goto done; 1202 goto done;
1157 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1203 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1158 } 1204 }
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
1379 RIGHT.br_blockcount, &i))) 1425 RIGHT.br_blockcount, &i)))
1380 goto done; 1426 goto done;
1381 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1427 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1382 if ((error = xfs_bmbt_delete(cur, &i))) 1428 if ((error = xfs_btree_delete(cur, &i)))
1383 goto done; 1429 goto done;
1384 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1430 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1385 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1431 if ((error = xfs_btree_decrement(cur, 0, &i)))
1386 goto done; 1432 goto done;
1387 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1433 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1388 if ((error = xfs_bmbt_delete(cur, &i))) 1434 if ((error = xfs_btree_delete(cur, &i)))
1389 goto done; 1435 goto done;
1390 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1436 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1391 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1437 if ((error = xfs_btree_decrement(cur, 0, &i)))
1392 goto done; 1438 goto done;
1393 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1439 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1394 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 1440 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
1428 &i))) 1474 &i)))
1429 goto done; 1475 goto done;
1430 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1476 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1431 if ((error = xfs_bmbt_delete(cur, &i))) 1477 if ((error = xfs_btree_delete(cur, &i)))
1432 goto done; 1478 goto done;
1433 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1479 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1434 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1480 if ((error = xfs_btree_decrement(cur, 0, &i)))
1435 goto done; 1481 goto done;
1436 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1482 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1437 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 1483 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
1471 RIGHT.br_blockcount, &i))) 1517 RIGHT.br_blockcount, &i)))
1472 goto done; 1518 goto done;
1473 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1519 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1474 if ((error = xfs_bmbt_delete(cur, &i))) 1520 if ((error = xfs_btree_delete(cur, &i)))
1475 goto done; 1521 goto done;
1476 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1522 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1477 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1523 if ((error = xfs_btree_decrement(cur, 0, &i)))
1478 goto done; 1524 goto done;
1479 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1525 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1480 if ((error = xfs_bmbt_update(cur, new->br_startoff, 1526 if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
1557 PREV.br_blockcount - new->br_blockcount, 1603 PREV.br_blockcount - new->br_blockcount,
1558 oldext))) 1604 oldext)))
1559 goto done; 1605 goto done;
1560 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 1606 if ((error = xfs_btree_decrement(cur, 0, &i)))
1561 goto done; 1607 goto done;
1562 if (xfs_bmbt_update(cur, LEFT.br_startoff, 1608 if (xfs_bmbt_update(cur, LEFT.br_startoff,
1563 LEFT.br_startblock, 1609 LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
1605 oldext))) 1651 oldext)))
1606 goto done; 1652 goto done;
1607 cur->bc_rec.b = *new; 1653 cur->bc_rec.b = *new;
1608 if ((error = xfs_bmbt_insert(cur, &i))) 1654 if ((error = xfs_btree_insert(cur, &i)))
1609 goto done; 1655 goto done;
1610 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1656 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1611 } 1657 }
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
1647 PREV.br_blockcount - new->br_blockcount, 1693 PREV.br_blockcount - new->br_blockcount,
1648 oldext))) 1694 oldext)))
1649 goto done; 1695 goto done;
1650 if ((error = xfs_bmbt_increment(cur, 0, &i))) 1696 if ((error = xfs_btree_increment(cur, 0, &i)))
1651 goto done; 1697 goto done;
1652 if ((error = xfs_bmbt_update(cur, new->br_startoff, 1698 if ((error = xfs_bmbt_update(cur, new->br_startoff,
1653 new->br_startblock, 1699 new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
1695 goto done; 1741 goto done;
1696 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1742 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1697 cur->bc_rec.b.br_state = XFS_EXT_NORM; 1743 cur->bc_rec.b.br_state = XFS_EXT_NORM;
1698 if ((error = xfs_bmbt_insert(cur, &i))) 1744 if ((error = xfs_btree_insert(cur, &i)))
1699 goto done; 1745 goto done;
1700 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1746 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1701 } 1747 }
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
1743 cur->bc_rec.b = PREV; 1789 cur->bc_rec.b = PREV;
1744 cur->bc_rec.b.br_blockcount = 1790 cur->bc_rec.b.br_blockcount =
1745 new->br_startoff - PREV.br_startoff; 1791 new->br_startoff - PREV.br_startoff;
1746 if ((error = xfs_bmbt_insert(cur, &i))) 1792 if ((error = xfs_btree_insert(cur, &i)))
1747 goto done; 1793 goto done;
1748 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1794 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1749 /* 1795 /*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
1758 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1804 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
1759 /* new middle extent - newext */ 1805 /* new middle extent - newext */
1760 cur->bc_rec.b.br_state = new->br_state; 1806 cur->bc_rec.b.br_state = new->br_state;
1761 if ((error = xfs_bmbt_insert(cur, &i))) 1807 if ((error = xfs_btree_insert(cur, &i)))
1762 goto done; 1808 goto done;
1763 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1809 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
1764 } 1810 }
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
2106 right.br_blockcount, &i))) 2152 right.br_blockcount, &i)))
2107 goto done; 2153 goto done;
2108 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2154 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2109 if ((error = xfs_bmbt_delete(cur, &i))) 2155 if ((error = xfs_btree_delete(cur, &i)))
2110 goto done; 2156 goto done;
2111 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2157 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2112 if ((error = xfs_bmbt_decrement(cur, 0, &i))) 2158 if ((error = xfs_btree_decrement(cur, 0, &i)))
2113 goto done; 2159 goto done;
2114 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2160 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2115 if ((error = xfs_bmbt_update(cur, left.br_startoff, 2161 if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
2218 goto done; 2264 goto done;
2219 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2265 XFS_WANT_CORRUPTED_GOTO(i == 0, done);
2220 cur->bc_rec.b.br_state = new->br_state; 2266 cur->bc_rec.b.br_state = new->br_state;
2221 if ((error = xfs_bmbt_insert(cur, &i))) 2267 if ((error = xfs_btree_insert(cur, &i)))
2222 goto done; 2268 goto done;
2223 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2269 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
2224 } 2270 }
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
2996 int whichfork) /* data or attr fork */ 3042 int whichfork) /* data or attr fork */
2997{ 3043{
2998 /* REFERENCED */ 3044 /* REFERENCED */
2999 xfs_bmbt_block_t *cblock;/* child btree block */ 3045 struct xfs_btree_block *cblock;/* child btree block */
3000 xfs_fsblock_t cbno; /* child block number */ 3046 xfs_fsblock_t cbno; /* child block number */
3001 xfs_buf_t *cbp; /* child block's buffer */ 3047 xfs_buf_t *cbp; /* child block's buffer */
3002 int error; /* error return value */ 3048 int error; /* error return value */
3003 xfs_ifork_t *ifp; /* inode fork data */ 3049 xfs_ifork_t *ifp; /* inode fork data */
3004 xfs_mount_t *mp; /* mount point structure */ 3050 xfs_mount_t *mp; /* mount point structure */
3005 __be64 *pp; /* ptr to block address */ 3051 __be64 *pp; /* ptr to block address */
3006 xfs_bmbt_block_t *rblock;/* root btree block */ 3052 struct xfs_btree_block *rblock;/* root btree block */
3007 3053
3054 mp = ip->i_mount;
3008 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3009 ASSERT(ifp->if_flags & XFS_IFEXTENTS); 3056 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3010 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); 3057 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
3011 rblock = ifp->if_broot; 3058 rblock = ifp->if_broot;
3012 ASSERT(be16_to_cpu(rblock->bb_level) == 1); 3059 ASSERT(be16_to_cpu(rblock->bb_level) == 1);
3013 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); 3060 ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
3014 ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); 3061 ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
3015 mp = ip->i_mount; 3062 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
3016 pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
3017 cbno = be64_to_cpu(*pp); 3063 cbno = be64_to_cpu(*pp);
3018 *logflagsp = 0; 3064 *logflagsp = 0;
3019#ifdef DEBUG 3065#ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
3023 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, 3069 if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
3024 XFS_BMAP_BTREE_REF))) 3070 XFS_BMAP_BTREE_REF)))
3025 return error; 3071 return error;
3026 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); 3072 cblock = XFS_BUF_TO_BLOCK(cbp);
3027 if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) 3073 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
3028 return error; 3074 return error;
3029 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); 3075 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
3030 ip->i_d.di_nblocks--; 3076 ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
3170 flags |= XFS_ILOG_FEXT(whichfork); 3216 flags |= XFS_ILOG_FEXT(whichfork);
3171 break; 3217 break;
3172 } 3218 }
3173 if ((error = xfs_bmbt_delete(cur, &i))) 3219 if ((error = xfs_btree_delete(cur, &i)))
3174 goto done; 3220 goto done;
3175 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3221 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
3176 break; 3222 break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
3254 got.br_startblock, temp, 3300 got.br_startblock, temp,
3255 got.br_state))) 3301 got.br_state)))
3256 goto done; 3302 goto done;
3257 if ((error = xfs_bmbt_increment(cur, 0, &i))) 3303 if ((error = xfs_btree_increment(cur, 0, &i)))
3258 goto done; 3304 goto done;
3259 cur->bc_rec.b = new; 3305 cur->bc_rec.b = new;
3260 error = xfs_bmbt_insert(cur, &i); 3306 error = xfs_btree_insert(cur, &i);
3261 if (error && error != ENOSPC) 3307 if (error && error != ENOSPC)
3262 goto done; 3308 goto done;
3263 /* 3309 /*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
3404 int *logflagsp, /* inode logging flags */ 3450 int *logflagsp, /* inode logging flags */
3405 int whichfork) /* data or attr fork */ 3451 int whichfork) /* data or attr fork */
3406{ 3452{
3407 xfs_bmbt_block_t *ablock; /* allocated (child) bt block */ 3453 struct xfs_btree_block *ablock; /* allocated (child) bt block */
3408 xfs_buf_t *abp; /* buffer for ablock */ 3454 xfs_buf_t *abp; /* buffer for ablock */
3409 xfs_alloc_arg_t args; /* allocation arguments */ 3455 xfs_alloc_arg_t args; /* allocation arguments */
3410 xfs_bmbt_rec_t *arp; /* child record pointer */ 3456 xfs_bmbt_rec_t *arp; /* child record pointer */
3411 xfs_bmbt_block_t *block; /* btree root block */ 3457 struct xfs_btree_block *block; /* btree root block */
3412 xfs_btree_cur_t *cur; /* bmap btree cursor */ 3458 xfs_btree_cur_t *cur; /* bmap btree cursor */
3413 xfs_bmbt_rec_host_t *ep; /* extent record pointer */ 3459 xfs_bmbt_rec_host_t *ep; /* extent record pointer */
3414 int error; /* error return value */ 3460 int error; /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
3428 */ 3474 */
3429 xfs_iroot_realloc(ip, 1, whichfork); 3475 xfs_iroot_realloc(ip, 1, whichfork);
3430 ifp->if_flags |= XFS_IFBROOT; 3476 ifp->if_flags |= XFS_IFBROOT;
3477
3431 /* 3478 /*
3432 * Fill in the root. 3479 * Fill in the root.
3433 */ 3480 */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
3435 block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3482 block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3436 block->bb_level = cpu_to_be16(1); 3483 block->bb_level = cpu_to_be16(1);
3437 block->bb_numrecs = cpu_to_be16(1); 3484 block->bb_numrecs = cpu_to_be16(1);
3438 block->bb_leftsib = cpu_to_be64(NULLDFSBNO); 3485 block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
3439 block->bb_rightsib = cpu_to_be64(NULLDFSBNO); 3486 block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
3487
3440 /* 3488 /*
3441 * Need a cursor. Can't allocate until bb_level is filled in. 3489 * Need a cursor. Can't allocate until bb_level is filled in.
3442 */ 3490 */
3443 mp = ip->i_mount; 3491 mp = ip->i_mount;
3444 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 3492 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
3445 whichfork);
3446 cur->bc_private.b.firstblock = *firstblock; 3493 cur->bc_private.b.firstblock = *firstblock;
3447 cur->bc_private.b.flist = flist; 3494 cur->bc_private.b.flist = flist;
3448 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; 3495 cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
3489 /* 3536 /*
3490 * Fill in the child block. 3537 * Fill in the child block.
3491 */ 3538 */
3492 ablock = XFS_BUF_TO_BMBT_BLOCK(abp); 3539 ablock = XFS_BUF_TO_BLOCK(abp);
3493 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); 3540 ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
3494 ablock->bb_level = 0; 3541 ablock->bb_level = 0;
3495 ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO); 3542 ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
3496 ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO); 3543 ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
3497 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); 3544 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
3498 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 3545 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3499 for (cnt = i = 0; i < nextents; i++) { 3546 for (cnt = i = 0; i < nextents; i++) {
3500 ep = xfs_iext_get_ext(ifp, i); 3547 ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
3505 } 3552 }
3506 } 3553 }
3507 ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); 3554 ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
3508 ablock->bb_numrecs = cpu_to_be16(cnt); 3555 xfs_btree_set_numrecs(ablock, cnt);
3556
3509 /* 3557 /*
3510 * Fill in the root key and pointer. 3558 * Fill in the root key and pointer.
3511 */ 3559 */
3512 kp = XFS_BMAP_KEY_IADDR(block, 1, cur); 3560 kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
3513 arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); 3561 arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
3514 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); 3562 kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
3515 pp = XFS_BMAP_PTR_IADDR(block, 1, cur); 3563 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
3564 be16_to_cpu(block->bb_level)));
3516 *pp = cpu_to_be64(args.fsbno); 3565 *pp = cpu_to_be64(args.fsbno);
3566
3517 /* 3567 /*
3518 * Do all this logging at the end so that 3568 * Do all this logging at the end so that
3519 * the root is at the right level. 3569 * the root is at the right level.
3520 */ 3570 */
3521 xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS); 3571 xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
3522 xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); 3572 xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
3523 ASSERT(*curp == NULL); 3573 ASSERT(*curp == NULL);
3524 *curp = cur; 3574 *curp = cur;
3525 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); 3575 *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
4176 maxleafents = MAXAEXTNUM; 4226 maxleafents = MAXAEXTNUM;
4177 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); 4227 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
4178 } 4228 }
4179 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); 4229 maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
4180 minleafrecs = mp->m_bmap_dmnr[0]; 4230 minleafrecs = mp->m_bmap_dmnr[0];
4181 minnoderecs = mp->m_bmap_dmnr[1]; 4231 minnoderecs = mp->m_bmap_dmnr[1];
4182 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; 4232 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
4242 * We have a new transaction, so we should return committed=1, 4292 * We have a new transaction, so we should return committed=1,
4243 * even though we're returning an error. 4293 * even though we're returning an error.
4244 */ 4294 */
4245 if (error) { 4295 if (error)
4246 return error; 4296 return error;
4247 } 4297
4298 /*
4299 * transaction commit worked ok so we can drop the extra ticket
4300 * reference that we gained in xfs_trans_dup()
4301 */
4302 xfs_log_ticket_put(ntp->t_ticket);
4303
4248 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, 4304 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
4249 logcount))) 4305 logcount)))
4250 return error; 4306 return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
4474 return rval; 4530 return rval;
4475} 4531}
4476 4532
4533STATIC int
4534xfs_bmap_sanity_check(
4535 struct xfs_mount *mp,
4536 struct xfs_buf *bp,
4537 int level)
4538{
4539 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
4540
4541 if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
4542 be16_to_cpu(block->bb_level) != level ||
4543 be16_to_cpu(block->bb_numrecs) == 0 ||
4544 be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
4545 return 0;
4546 return 1;
4547}
4548
4477/* 4549/*
4478 * Read in the extents to if_extents. 4550 * Read in the extents to if_extents.
4479 * All inode fields are set up by caller, we just traverse the btree 4551 * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
4486 xfs_inode_t *ip, /* incore inode */ 4558 xfs_inode_t *ip, /* incore inode */
4487 int whichfork) /* data or attr fork */ 4559 int whichfork) /* data or attr fork */
4488{ 4560{
4489 xfs_bmbt_block_t *block; /* current btree block */ 4561 struct xfs_btree_block *block; /* current btree block */
4490 xfs_fsblock_t bno; /* block # of "block" */ 4562 xfs_fsblock_t bno; /* block # of "block" */
4491 xfs_buf_t *bp; /* buffer for "block" */ 4563 xfs_buf_t *bp; /* buffer for "block" */
4492 int error; /* error return value */ 4564 int error; /* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
4510 */ 4582 */
4511 level = be16_to_cpu(block->bb_level); 4583 level = be16_to_cpu(block->bb_level);
4512 ASSERT(level > 0); 4584 ASSERT(level > 0);
4513 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 4585 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
4514 bno = be64_to_cpu(*pp); 4586 bno = be64_to_cpu(*pp);
4515 ASSERT(bno != NULLDFSBNO); 4587 ASSERT(bno != NULLDFSBNO);
4516 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 4588 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
4523 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4595 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4524 XFS_BMAP_BTREE_REF))) 4596 XFS_BMAP_BTREE_REF)))
4525 return error; 4597 return error;
4526 block = XFS_BUF_TO_BMBT_BLOCK(bp); 4598 block = XFS_BUF_TO_BLOCK(bp);
4527 XFS_WANT_CORRUPTED_GOTO( 4599 XFS_WANT_CORRUPTED_GOTO(
4528 XFS_BMAP_SANITY_CHECK(mp, block, level), 4600 xfs_bmap_sanity_check(mp, bp, level),
4529 error0); 4601 error0);
4530 if (level == 0) 4602 if (level == 0)
4531 break; 4603 break;
4532 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 4604 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
4533 bno = be64_to_cpu(*pp); 4605 bno = be64_to_cpu(*pp);
4534 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 4606 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
4535 xfs_trans_brelse(tp, bp); 4607 xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
4549 xfs_extnum_t start; 4621 xfs_extnum_t start;
4550 4622
4551 4623
4552 num_recs = be16_to_cpu(block->bb_numrecs); 4624 num_recs = xfs_btree_get_numrecs(block);
4553 if (unlikely(i + num_recs > room)) { 4625 if (unlikely(i + num_recs > room)) {
4554 ASSERT(i + num_recs <= room); 4626 ASSERT(i + num_recs <= room);
4555 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4627 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
4561 goto error0; 4633 goto error0;
4562 } 4634 }
4563 XFS_WANT_CORRUPTED_GOTO( 4635 XFS_WANT_CORRUPTED_GOTO(
4564 XFS_BMAP_SANITY_CHECK(mp, block, 0), 4636 xfs_bmap_sanity_check(mp, bp, 0),
4565 error0); 4637 error0);
4566 /* 4638 /*
4567 * Read-ahead the next leaf block, if any. 4639 * Read-ahead the next leaf block, if any.
4568 */ 4640 */
4569 nextbno = be64_to_cpu(block->bb_rightsib); 4641 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
4570 if (nextbno != NULLFSBLOCK) 4642 if (nextbno != NULLFSBLOCK)
4571 xfs_btree_reada_bufl(mp, nextbno, 1); 4643 xfs_btree_reada_bufl(mp, nextbno, 1);
4572 /* 4644 /*
4573 * Copy records into the extent records. 4645 * Copy records into the extent records.
4574 */ 4646 */
4575 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 4647 frp = XFS_BMBT_REC_ADDR(mp, block, 1);
4576 start = i; 4648 start = i;
4577 for (j = 0; j < num_recs; j++, i++, frp++) { 4649 for (j = 0; j < num_recs; j++, i++, frp++) {
4578 xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); 4650 xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
4603 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, 4675 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
4604 XFS_BMAP_BTREE_REF))) 4676 XFS_BMAP_BTREE_REF)))
4605 return error; 4677 return error;
4606 block = XFS_BUF_TO_BMBT_BLOCK(bp); 4678 block = XFS_BUF_TO_BLOCK(bp);
4607 } 4679 }
4608 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); 4680 ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
4609 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); 4681 ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
5029 if (abno == NULLFSBLOCK) 5101 if (abno == NULLFSBLOCK)
5030 break; 5102 break;
5031 if ((ifp->if_flags & XFS_IFBROOT) && !cur) { 5103 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
5032 cur = xfs_btree_init_cursor(mp, 5104 cur = xfs_bmbt_init_cursor(mp, tp,
5033 tp, NULL, 0, XFS_BTNUM_BMAP,
5034 ip, whichfork); 5105 ip, whichfork);
5035 cur->bc_private.b.firstblock = 5106 cur->bc_private.b.firstblock =
5036 *firstblock; 5107 *firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
5147 */ 5218 */
5148 ASSERT(mval->br_blockcount <= len); 5219 ASSERT(mval->br_blockcount <= len);
5149 if ((ifp->if_flags & XFS_IFBROOT) && !cur) { 5220 if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
5150 cur = xfs_btree_init_cursor(mp, 5221 cur = xfs_bmbt_init_cursor(mp,
5151 tp, NULL, 0, XFS_BTNUM_BMAP, 5222 tp, ip, whichfork);
5152 ip, whichfork);
5153 cur->bc_private.b.firstblock = 5223 cur->bc_private.b.firstblock =
5154 *firstblock; 5224 *firstblock;
5155 cur->bc_private.b.flist = flist; 5225 cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
5440 logflags = 0; 5510 logflags = 0;
5441 if (ifp->if_flags & XFS_IFBROOT) { 5511 if (ifp->if_flags & XFS_IFBROOT) {
5442 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); 5512 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
5443 cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, 5513 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5444 whichfork);
5445 cur->bc_private.b.firstblock = *firstblock; 5514 cur->bc_private.b.firstblock = *firstblock;
5446 cur->bc_private.b.flist = flist; 5515 cur->bc_private.b.flist = flist;
5447 cur->bc_private.b.flags = 0; 5516 cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
5742STATIC int 5811STATIC int
5743xfs_getbmapx_fix_eof_hole( 5812xfs_getbmapx_fix_eof_hole(
5744 xfs_inode_t *ip, /* xfs incore inode pointer */ 5813 xfs_inode_t *ip, /* xfs incore inode pointer */
5745 struct getbmap *out, /* output structure */ 5814 struct getbmapx *out, /* output structure */
5746 int prealloced, /* this is a file with 5815 int prealloced, /* this is a file with
5747 * preallocated data space */ 5816 * preallocated data space */
5748 __int64_t end, /* last block requested */ 5817 __int64_t end, /* last block requested */
5749 xfs_fsblock_t startblock) 5818 xfs_fsblock_t startblock)
5750{ 5819{
5751 __int64_t fixlen; 5820 __int64_t fixlen;
5752 xfs_mount_t *mp; /* file system mount point */ 5821 xfs_mount_t *mp; /* file system mount point */
5822 xfs_ifork_t *ifp; /* inode fork pointer */
5823 xfs_extnum_t lastx; /* last extent pointer */
5824 xfs_fileoff_t fileblock;
5753 5825
5754 if (startblock == HOLESTARTBLOCK) { 5826 if (startblock == HOLESTARTBLOCK) {
5755 mp = ip->i_mount; 5827 mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
5763 out->bmv_length = fixlen; 5835 out->bmv_length = fixlen;
5764 } 5836 }
5765 } else { 5837 } else {
5766 out->bmv_block = XFS_FSB_TO_DB(ip, startblock); 5838 if (startblock == DELAYSTARTBLOCK)
5839 out->bmv_block = -2;
5840 else
5841 out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
5842 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5843 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5844 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
5845 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
5846 out->bmv_oflags |= BMV_OF_LAST;
5767 } 5847 }
5768 5848
5769 return 1; 5849 return 1;
5770} 5850}
5771 5851
5772/* 5852/*
5773 * Fcntl interface to xfs_bmapi. 5853 * Get inode's extents as described in bmv, and format for output.
5854 * Calls formatter to fill the user's buffer until all extents
5855 * are mapped, until the passed-in bmv->bmv_count slots have
5856 * been filled, or until the formatter short-circuits the loop,
5857 * if it is tracking filled-in extents on its own.
5774 */ 5858 */
5775int /* error code */ 5859int /* error code */
5776xfs_getbmap( 5860xfs_getbmap(
5777 xfs_inode_t *ip, 5861 xfs_inode_t *ip,
5778 struct getbmap *bmv, /* user bmap structure */ 5862 struct getbmapx *bmv, /* user bmap structure */
5779 void __user *ap, /* pointer to user's array */ 5863 xfs_bmap_format_t formatter, /* format to user */
5780 int interface) /* interface flags */ 5864 void *arg) /* formatter arg */
5781{ 5865{
5782 __int64_t bmvend; /* last block requested */ 5866 __int64_t bmvend; /* last block requested */
5783 int error; /* return value */ 5867 int error; /* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
5790 int nexleft; /* # of user extents left */ 5874 int nexleft; /* # of user extents left */
5791 int subnex; /* # of bmapi's can do */ 5875 int subnex; /* # of bmapi's can do */
5792 int nmap; /* number of map entries */ 5876 int nmap; /* number of map entries */
5793 struct getbmap out; /* output structure */ 5877 struct getbmapx out; /* output structure */
5794 int whichfork; /* data or attr fork */ 5878 int whichfork; /* data or attr fork */
5795 int prealloced; /* this is a file with 5879 int prealloced; /* this is a file with
5796 * preallocated data space */ 5880 * preallocated data space */
5797 int sh_unwritten; /* true, if unwritten */ 5881 int iflags; /* interface flags */
5798 /* extents listed separately */
5799 int bmapi_flags; /* flags for xfs_bmapi */ 5882 int bmapi_flags; /* flags for xfs_bmapi */
5800 __int32_t oflags; /* getbmapx bmv_oflags field */
5801 5883
5802 mp = ip->i_mount; 5884 mp = ip->i_mount;
5885 iflags = bmv->bmv_iflags;
5803 5886
5804 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; 5887 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5805 sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
5806 5888
5807 /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not 5889 /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
5808 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ 5890 * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
5817 * could misinterpret holes in a DMAPI file as true holes, 5899 * could misinterpret holes in a DMAPI file as true holes,
5818 * when in fact they may represent offline user data. 5900 * when in fact they may represent offline user data.
5819 */ 5901 */
5820 if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && 5902 if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
5821 DM_EVENT_ENABLED(ip, DM_EVENT_READ) && 5903 DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
5822 whichfork == XFS_DATA_FORK) { 5904 whichfork == XFS_DATA_FORK) {
5823 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); 5905 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
5873 5955
5874 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5956 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5875 5957
5876 if (whichfork == XFS_DATA_FORK && 5958 if (((iflags & BMV_IF_DELALLOC) == 0) &&
5877 (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) { 5959 (whichfork == XFS_DATA_FORK) &&
5960 (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
5878 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ 5961 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5879 error = xfs_flush_pages(ip, (xfs_off_t)0, 5962 error = xfs_flush_pages(ip, (xfs_off_t)0,
5880 -1, 0, FI_REMAPF); 5963 -1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
5884 } 5967 }
5885 } 5968 }
5886 5969
5887 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); 5970 ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
5971 ip->i_delayed_blks == 0);
5888 5972
5889 lock = xfs_ilock_map_shared(ip); 5973 lock = xfs_ilock_map_shared(ip);
5890 5974
@@ -5896,7 +5980,7 @@ xfs_getbmap(
5896 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5897 5981
5898 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5899 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5900 5984
5901 /* 5985 /*
5902 * Allocate enough space to handle "subnex" maps at a time. 5986 * Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
5906 5990
5907 bmv->bmv_entries = 0; 5991 bmv->bmv_entries = 0;
5908 5992
5909 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { 5993 if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
5910 error = 0; 5994 if (((iflags & BMV_IF_DELALLOC) == 0) ||
5911 goto unlock_and_return; 5995 whichfork == XFS_ATTR_FORK) {
5996 error = 0;
5997 goto unlock_and_return;
5998 }
5912 } 5999 }
5913 6000
5914 nexleft = nex; 6001 nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
5924 ASSERT(nmap <= subnex); 6011 ASSERT(nmap <= subnex);
5925 6012
5926 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { 6013 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5927 nexleft--; 6014 out.bmv_oflags = 0;
5928 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? 6015 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5929 BMV_OF_PREALLOC : 0; 6016 out.bmv_oflags |= BMV_OF_PREALLOC;
6017 else if (map[i].br_startblock == DELAYSTARTBLOCK)
6018 out.bmv_oflags |= BMV_OF_DELALLOC;
5930 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); 6019 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5931 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 6020 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5932 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); 6021 out.bmv_unused1 = out.bmv_unused2 = 0;
6022 ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
6023 (map[i].br_startblock != DELAYSTARTBLOCK));
5933 if (map[i].br_startblock == HOLESTARTBLOCK && 6024 if (map[i].br_startblock == HOLESTARTBLOCK &&
5934 whichfork == XFS_ATTR_FORK) { 6025 whichfork == XFS_ATTR_FORK) {
5935 /* came to the end of attribute fork */ 6026 /* came to the end of attribute fork */
6027 out.bmv_oflags |= BMV_OF_LAST;
5936 goto unlock_and_return; 6028 goto unlock_and_return;
5937 } else { 6029 } else {
6030 int full = 0; /* user array is full */
6031
5938 if (!xfs_getbmapx_fix_eof_hole(ip, &out, 6032 if (!xfs_getbmapx_fix_eof_hole(ip, &out,
5939 prealloced, bmvend, 6033 prealloced, bmvend,
5940 map[i].br_startblock)) { 6034 map[i].br_startblock)) {
5941 goto unlock_and_return; 6035 goto unlock_and_return;
5942 } 6036 }
5943 6037
5944 /* return either getbmap/getbmapx structure. */ 6038 /* format results & advance arg */
5945 if (interface & BMV_IF_EXTENDED) { 6039 error = formatter(&arg, &out, &full);
5946 struct getbmapx outx; 6040 if (error || full)
5947 6041 goto unlock_and_return;
5948 GETBMAP_CONVERT(out,outx); 6042 nexleft--;
5949 outx.bmv_oflags = oflags;
5950 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5951 if (copy_to_user(ap, &outx,
5952 sizeof(outx))) {
5953 error = XFS_ERROR(EFAULT);
5954 goto unlock_and_return;
5955 }
5956 } else {
5957 if (copy_to_user(ap, &out,
5958 sizeof(out))) {
5959 error = XFS_ERROR(EFAULT);
5960 goto unlock_and_return;
5961 }
5962 }
5963 bmv->bmv_offset = 6043 bmv->bmv_offset =
5964 out.bmv_offset + out.bmv_length; 6044 out.bmv_offset + out.bmv_length;
5965 bmv->bmv_length = MAX((__int64_t)0, 6045 bmv->bmv_length = MAX((__int64_t)0,
5966 (__int64_t)(bmvend - bmv->bmv_offset)); 6046 (__int64_t)(bmvend - bmv->bmv_offset));
5967 bmv->bmv_entries++; 6047 bmv->bmv_entries++;
5968 ap = (interface & BMV_IF_EXTENDED) ?
5969 (void __user *)
5970 ((struct getbmapx __user *)ap + 1) :
5971 (void __user *)
5972 ((struct getbmap __user *)ap + 1);
5973 } 6048 }
5974 } 6049 }
5975 } while (nmap && nexleft && bmv->bmv_length); 6050 } while (nmap && nexleft && bmv->bmv_length);
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
6131 6206
6132void 6207void
6133xfs_check_block( 6208xfs_check_block(
6134 xfs_bmbt_block_t *block, 6209 struct xfs_btree_block *block,
6135 xfs_mount_t *mp, 6210 xfs_mount_t *mp,
6136 int root, 6211 int root,
6137 short sz) 6212 short sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
6143 ASSERT(be16_to_cpu(block->bb_level) > 0); 6218 ASSERT(be16_to_cpu(block->bb_level) > 0);
6144 6219
6145 prevp = NULL; 6220 prevp = NULL;
6146 for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { 6221 for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
6147 dmxr = mp->m_bmap_dmxr[0]; 6222 dmxr = mp->m_bmap_dmxr[0];
6148 6223 keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
6149 if (root) {
6150 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
6151 } else {
6152 keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
6153 }
6154 6224
6155 if (prevp) { 6225 if (prevp) {
6156 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); 6226 ASSERT(be64_to_cpu(prevp->br_startoff) <
6227 be64_to_cpu(keyp->br_startoff));
6157 } 6228 }
6158 prevp = keyp; 6229 prevp = keyp;
6159 6230
6160 /* 6231 /*
6161 * Compare the block numbers to see if there are dups. 6232 * Compare the block numbers to see if there are dups.
6162 */ 6233 */
6234 if (root)
6235 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
6236 else
6237 pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
6163 6238
6164 if (root) {
6165 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
6166 } else {
6167 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
6168 }
6169 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { 6239 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
6170 if (root) { 6240 if (root)
6171 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); 6241 thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
6172 } else { 6242 else
6173 thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, 6243 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
6174 dmxr);
6175 }
6176 if (*thispa == *pp) { 6244 if (*thispa == *pp) {
6177 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6245 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6178 __func__, j, i, 6246 __func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
6195 xfs_inode_t *ip, /* incore inode pointer */ 6263 xfs_inode_t *ip, /* incore inode pointer */
6196 int whichfork) /* data or attr fork */ 6264 int whichfork) /* data or attr fork */
6197{ 6265{
6198 xfs_bmbt_block_t *block; /* current btree block */ 6266 struct xfs_btree_block *block; /* current btree block */
6199 xfs_fsblock_t bno; /* block # of "block" */ 6267 xfs_fsblock_t bno; /* block # of "block" */
6200 xfs_buf_t *bp; /* buffer for "block" */ 6268 xfs_buf_t *bp; /* buffer for "block" */
6201 int error; /* error return value */ 6269 int error; /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
6223 level = be16_to_cpu(block->bb_level); 6291 level = be16_to_cpu(block->bb_level);
6224 ASSERT(level > 0); 6292 ASSERT(level > 0);
6225 xfs_check_block(block, mp, 1, ifp->if_broot_bytes); 6293 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
6226 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6294 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6227 bno = be64_to_cpu(*pp); 6295 bno = be64_to_cpu(*pp);
6228 6296
6229 ASSERT(bno != NULLDFSBNO); 6297 ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
6245 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6313 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6246 XFS_BMAP_BTREE_REF))) 6314 XFS_BMAP_BTREE_REF)))
6247 goto error_norelse; 6315 goto error_norelse;
6248 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6316 block = XFS_BUF_TO_BLOCK(bp);
6249 XFS_WANT_CORRUPTED_GOTO( 6317 XFS_WANT_CORRUPTED_GOTO(
6250 XFS_BMAP_SANITY_CHECK(mp, block, level), 6318 xfs_bmap_sanity_check(mp, bp, level),
6251 error0); 6319 error0);
6252 if (level == 0) 6320 if (level == 0)
6253 break; 6321 break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
6258 */ 6326 */
6259 6327
6260 xfs_check_block(block, mp, 0, 0); 6328 xfs_check_block(block, mp, 0, 0);
6261 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6329 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6262 bno = be64_to_cpu(*pp); 6330 bno = be64_to_cpu(*pp);
6263 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 6331 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
6264 if (bp_release) { 6332 if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
6280 xfs_extnum_t num_recs; 6348 xfs_extnum_t num_recs;
6281 6349
6282 6350
6283 num_recs = be16_to_cpu(block->bb_numrecs); 6351 num_recs = xfs_btree_get_numrecs(block);
6284 6352
6285 /* 6353 /*
6286 * Read-ahead the next leaf block, if any. 6354 * Read-ahead the next leaf block, if any.
6287 */ 6355 */
6288 6356
6289 nextbno = be64_to_cpu(block->bb_rightsib); 6357 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6290 6358
6291 /* 6359 /*
6292 * Check all the extents to make sure they are OK. 6360 * Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
6294 * conform with the first entry in this one. 6362 * conform with the first entry in this one.
6295 */ 6363 */
6296 6364
6297 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 6365 ep = XFS_BMBT_REC_ADDR(mp, block, 1);
6298 if (i) { 6366 if (i) {
6299 xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); 6367 ASSERT(xfs_bmbt_disk_get_startoff(&last) +
6368 xfs_bmbt_disk_get_blockcount(&last) <=
6369 xfs_bmbt_disk_get_startoff(ep));
6300 } 6370 }
6301 for (j = 1; j < num_recs; j++) { 6371 for (j = 1; j < num_recs; j++) {
6302 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); 6372 nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
6303 xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); 6373 ASSERT(xfs_bmbt_disk_get_startoff(ep) +
6374 xfs_bmbt_disk_get_blockcount(ep) <=
6375 xfs_bmbt_disk_get_startoff(nextp));
6304 ep = nextp; 6376 ep = nextp;
6305 } 6377 }
6306 6378
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
6326 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6398 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6327 XFS_BMAP_BTREE_REF))) 6399 XFS_BMAP_BTREE_REF)))
6328 goto error_norelse; 6400 goto error_norelse;
6329 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6401 block = XFS_BUF_TO_BLOCK(bp);
6330 } 6402 }
6331 if (bp_release) { 6403 if (bp_release) {
6332 bp_release = 0; 6404 bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
6356 int whichfork, /* data or attr fork */ 6428 int whichfork, /* data or attr fork */
6357 int *count) /* out: count of blocks */ 6429 int *count) /* out: count of blocks */
6358{ 6430{
6359 xfs_bmbt_block_t *block; /* current btree block */ 6431 struct xfs_btree_block *block; /* current btree block */
6360 xfs_fsblock_t bno; /* block # of "block" */ 6432 xfs_fsblock_t bno; /* block # of "block" */
6361 xfs_ifork_t *ifp; /* fork structure */ 6433 xfs_ifork_t *ifp; /* fork structure */
6362 int level; /* btree level, for checking */ 6434 int level; /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
6379 block = ifp->if_broot; 6451 block = ifp->if_broot;
6380 level = be16_to_cpu(block->bb_level); 6452 level = be16_to_cpu(block->bb_level);
6381 ASSERT(level > 0); 6453 ASSERT(level > 0);
6382 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6454 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6383 bno = be64_to_cpu(*pp); 6455 bno = be64_to_cpu(*pp);
6384 ASSERT(bno != NULLDFSBNO); 6456 ASSERT(bno != NULLDFSBNO);
6385 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 6457 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
6413 __be64 *pp; 6485 __be64 *pp;
6414 xfs_fsblock_t bno = blockno; 6486 xfs_fsblock_t bno = blockno;
6415 xfs_fsblock_t nextbno; 6487 xfs_fsblock_t nextbno;
6416 xfs_bmbt_block_t *block, *nextblock; 6488 struct xfs_btree_block *block, *nextblock;
6417 int numrecs; 6489 int numrecs;
6418 6490
6419 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6491 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6420 return error; 6492 return error;
6421 *count += 1; 6493 *count += 1;
6422 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6494 block = XFS_BUF_TO_BLOCK(bp);
6423 6495
6424 if (--level) { 6496 if (--level) {
6425 /* Not at node above leafs, count this level of nodes */ 6497 /* Not at node above leafs, count this level of nodes */
6426 nextbno = be64_to_cpu(block->bb_rightsib); 6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6427 while (nextbno != NULLFSBLOCK) { 6499 while (nextbno != NULLFSBLOCK) {
6428 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6429 0, &nbp, XFS_BMAP_BTREE_REF))) 6501 0, &nbp, XFS_BMAP_BTREE_REF)))
6430 return error; 6502 return error;
6431 *count += 1; 6503 *count += 1;
6432 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); 6504 nextblock = XFS_BUF_TO_BLOCK(nbp);
6433 nextbno = be64_to_cpu(nextblock->bb_rightsib); 6505 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
6434 xfs_trans_brelse(tp, nbp); 6506 xfs_trans_brelse(tp, nbp);
6435 } 6507 }
6436 6508
6437 /* Dive to the next level */ 6509 /* Dive to the next level */
6438 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6510 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6439 bno = be64_to_cpu(*pp); 6511 bno = be64_to_cpu(*pp);
6440 if (unlikely((error = 6512 if (unlikely((error =
6441 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { 6513 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
6448 } else { 6520 } else {
6449 /* count all level 1 nodes and their leaves */ 6521 /* count all level 1 nodes and their leaves */
6450 for (;;) { 6522 for (;;) {
6451 nextbno = be64_to_cpu(block->bb_rightsib); 6523 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6452 numrecs = be16_to_cpu(block->bb_numrecs); 6524 numrecs = be16_to_cpu(block->bb_numrecs);
6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count); 6525 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
6454 xfs_trans_brelse(tp, bp); 6526 xfs_trans_brelse(tp, bp);
6455 if (nextbno == NULLFSBLOCK) 6527 if (nextbno == NULLFSBLOCK)
6456 break; 6528 break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
6459 XFS_BMAP_BTREE_REF))) 6531 XFS_BMAP_BTREE_REF)))
6460 return error; 6532 return error;
6461 *count += 1; 6533 *count += 1;
6462 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6534 block = XFS_BUF_TO_BLOCK(bp);
6463 } 6535 }
6464 } 6536 }
6465 return 0; 6537 return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
6489 */ 6561 */
6490STATIC void 6562STATIC void
6491xfs_bmap_disk_count_leaves( 6563xfs_bmap_disk_count_leaves(
6492 xfs_extnum_t idx, 6564 struct xfs_mount *mp,
6493 xfs_bmbt_block_t *block, 6565 struct xfs_btree_block *block,
6494 int numrecs, 6566 int numrecs,
6495 int *count) 6567 int *count)
6496{ 6568{
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
6498 xfs_bmbt_rec_t *frp; 6570 xfs_bmbt_rec_t *frp;
6499 6571
6500 for (b = 1; b <= numrecs; b++) { 6572 for (b = 1; b <= numrecs; b++) {
6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6573 frp = XFS_BMBT_REC_ADDR(mp, block, b);
6502 *count += xfs_bmbt_disk_get_blockcount(frp); 6574 *count += xfs_bmbt_disk_get_blockcount(frp);
6503 } 6575 }
6504} 6576}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 9f3e3a836d1..284571c05ed 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
137 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138} xfs_bmalloca_t; 138} xfs_bmalloca_t;
139 139
140#ifdef __KERNEL__ 140#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
141
142#if defined(XFS_BMAP_TRACE)
143/* 141/*
144 * Trace operations for bmap extent tracing 142 * Trace operations for bmap extent tracing
145 */ 143 */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
163 int whichfork); /* data or attr fork */ 161 int whichfork); /* data or attr fork */
164#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 162#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
165 xfs_bmap_trace_exlist(__func__,ip,c,w) 163 xfs_bmap_trace_exlist(__func__,ip,c,w)
166#else 164
165#else /* __KERNEL__ && XFS_BMAP_TRACE */
166
167#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 167#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
168#endif 168
169#endif /* __KERNEL__ && XFS_BMAP_TRACE */
169 170
170/* 171/*
171 * Convert inode from non-attributed to attributed. 172 * Convert inode from non-attributed to attributed.
@@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels(
206 int whichfork); /* data or attr fork */ 207 int whichfork); /* data or attr fork */
207 208
208/* 209/*
209 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
210 * caller. Frees all the extents that need freeing, which must be done
211 * last due to locking considerations.
212 *
213 * Return 1 if the given transaction was committed and a new one allocated,
214 * and 0 otherwise.
215 */
216int /* error */
217xfs_bmap_finish(
218 struct xfs_trans **tp, /* transaction pointer addr */
219 xfs_bmap_free_t *flist, /* i/o: list extents to free */
220 int *committed); /* xact committed or not */
221
222/*
223 * Returns the file-relative block number of the first unused block in the file. 210 * Returns the file-relative block number of the first unused block in the file.
224 * This is the lowest-address hole if the file has holes, else the first block 211 * This is the lowest-address hole if the file has holes, else the first block
225 * past the end of file. 212 * past the end of file.
@@ -344,14 +331,43 @@ xfs_bunmapi(
344 int *done); /* set if not done yet */ 331 int *done); /* set if not done yet */
345 332
346/* 333/*
347 * Fcntl interface to xfs_bmapi. 334 * Check an extent list, which has just been read, for
335 * any bit in the extent flag field.
336 */
337int
338xfs_check_nostate_extents(
339 struct xfs_ifork *ifp,
340 xfs_extnum_t idx,
341 xfs_extnum_t num);
342
343#ifdef __KERNEL__
344
345/*
346 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
347 * caller. Frees all the extents that need freeing, which must be done
348 * last due to locking considerations.
349 *
350 * Return 1 if the given transaction was committed and a new one allocated,
351 * and 0 otherwise.
352 */
353int /* error */
354xfs_bmap_finish(
355 struct xfs_trans **tp, /* transaction pointer addr */
356 xfs_bmap_free_t *flist, /* i/o: list extents to free */
357 int *committed); /* xact committed or not */
358
359/* bmap to userspace formatter - copy to user & advance pointer */
360typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
361
362/*
363 * Get inode's extents as described in bmv, and format for output.
348 */ 364 */
349int /* error code */ 365int /* error code */
350xfs_getbmap( 366xfs_getbmap(
351 xfs_inode_t *ip, 367 xfs_inode_t *ip,
352 struct getbmap *bmv, /* user bmap structure */ 368 struct getbmapx *bmv, /* user bmap structure */
353 void __user *ap, /* pointer to user's array */ 369 xfs_bmap_format_t formatter, /* format to user */
354 int iflags); /* interface flags */ 370 void *arg); /* formatter arg */
355 371
356/* 372/*
357 * Check if the endoff is outside the last extent. If so the caller will grow 373 * Check if the endoff is outside the last extent. If so the caller will grow
@@ -375,16 +391,6 @@ xfs_bmap_count_blocks(
375 int *count); 391 int *count);
376 392
377/* 393/*
378 * Check an extent list, which has just been read, for
379 * any bit in the extent flag field.
380 */
381int
382xfs_check_nostate_extents(
383 struct xfs_ifork *ifp,
384 xfs_extnum_t idx,
385 xfs_extnum_t num);
386
387/*
388 * Search the extent records for the entry containing block bno. 394 * Search the extent records for the entry containing block bno.
389 * If bno lies in a hole, point to the next entry. If bno lies 395 * If bno lies in a hole, point to the next entry. If bno lies
390 * past eof, *eofp will be set, and *prevp will contain the last 396 * past eof, *eofp will be set, and *prevp will contain the last
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 23efad29a5c..8f1ec73725d 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,1406 +37,13 @@
37#include "xfs_inode_item.h" 37#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 38#include "xfs_alloc.h"
39#include "xfs_btree.h" 39#include "xfs_btree.h"
40#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
41#include "xfs_itable.h" 42#include "xfs_itable.h"
42#include "xfs_bmap.h" 43#include "xfs_bmap.h"
43#include "xfs_error.h" 44#include "xfs_error.h"
44#include "xfs_quota.h" 45#include "xfs_quota.h"
45 46
46#if defined(XFS_BMBT_TRACE)
47ktrace_t *xfs_bmbt_trace_buf;
48#endif
49
50/*
51 * Prototypes for internal btree functions.
52 */
53
54
55STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
56STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
57STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
61 __uint64_t *, xfs_btree_cur_t **, int *);
62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
63
64
65#if defined(XFS_BMBT_TRACE)
66
67static char ARGS[] = "args";
68static char ENTRY[] = "entry";
69static char ERROR[] = "error";
70#undef EXIT
71static char EXIT[] = "exit";
72
73/*
74 * Add a trace buffer entry for the arguments given to the routine,
75 * generic form.
76 */
77STATIC void
78xfs_bmbt_trace_enter(
79 const char *func,
80 xfs_btree_cur_t *cur,
81 char *s,
82 int type,
83 int line,
84 __psunsigned_t a0,
85 __psunsigned_t a1,
86 __psunsigned_t a2,
87 __psunsigned_t a3,
88 __psunsigned_t a4,
89 __psunsigned_t a5,
90 __psunsigned_t a6,
91 __psunsigned_t a7,
92 __psunsigned_t a8,
93 __psunsigned_t a9,
94 __psunsigned_t a10)
95{
96 xfs_inode_t *ip;
97 int whichfork;
98
99 ip = cur->bc_private.b.ip;
100 whichfork = cur->bc_private.b.whichfork;
101 ktrace_enter(xfs_bmbt_trace_buf,
102 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
103 (void *)func, (void *)s, (void *)ip, (void *)cur,
104 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
105 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
106 (void *)a8, (void *)a9, (void *)a10);
107 ASSERT(ip->i_btrace);
108 ktrace_enter(ip->i_btrace,
109 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
110 (void *)func, (void *)s, (void *)ip, (void *)cur,
111 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
112 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
113 (void *)a8, (void *)a9, (void *)a10);
114}
115/*
116 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
117 */
118STATIC void
119xfs_bmbt_trace_argbi(
120 const char *func,
121 xfs_btree_cur_t *cur,
122 xfs_buf_t *b,
123 int i,
124 int line)
125{
126 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
127 (__psunsigned_t)b, i, 0, 0,
128 0, 0, 0, 0,
129 0, 0, 0);
130}
131
132/*
133 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
134 */
135STATIC void
136xfs_bmbt_trace_argbii(
137 const char *func,
138 xfs_btree_cur_t *cur,
139 xfs_buf_t *b,
140 int i0,
141 int i1,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
145 (__psunsigned_t)b, i0, i1, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for 3 block-length args
152 * and an integer arg.
153 */
154STATIC void
155xfs_bmbt_trace_argfffi(
156 const char *func,
157 xfs_btree_cur_t *cur,
158 xfs_dfiloff_t o,
159 xfs_dfsbno_t b,
160 xfs_dfilblks_t i,
161 int j,
162 int line)
163{
164 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
165 o >> 32, (int)o, b >> 32, (int)b,
166 i >> 32, (int)i, (int)j, 0,
167 0, 0, 0);
168}
169
170/*
171 * Add a trace buffer entry for arguments, for one integer arg.
172 */
173STATIC void
174xfs_bmbt_trace_argi(
175 const char *func,
176 xfs_btree_cur_t *cur,
177 int i,
178 int line)
179{
180 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
181 i, 0, 0, 0,
182 0, 0, 0, 0,
183 0, 0, 0);
184}
185
186/*
187 * Add a trace buffer entry for arguments, for int, fsblock, key.
188 */
189STATIC void
190xfs_bmbt_trace_argifk(
191 const char *func,
192 xfs_btree_cur_t *cur,
193 int i,
194 xfs_fsblock_t f,
195 xfs_dfiloff_t o,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
199 i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
200 (int)o, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, rec.
206 */
207STATIC void
208xfs_bmbt_trace_argifr(
209 const char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_rec_t *r,
214 int line)
215{
216 xfs_dfsbno_t b;
217 xfs_dfilblks_t c;
218 xfs_dfsbno_t d;
219 xfs_dfiloff_t o;
220 xfs_bmbt_irec_t s;
221
222 d = (xfs_dfsbno_t)f;
223 xfs_bmbt_disk_get_all(r, &s);
224 o = (xfs_dfiloff_t)s.br_startoff;
225 b = (xfs_dfsbno_t)s.br_startblock;
226 c = s.br_blockcount;
227 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
228 i, d >> 32, (int)d, o >> 32,
229 (int)o, b >> 32, (int)b, c >> 32,
230 (int)c, 0, 0);
231}
232
233/*
234 * Add a trace buffer entry for arguments, for int, key.
235 */
236STATIC void
237xfs_bmbt_trace_argik(
238 const char *func,
239 xfs_btree_cur_t *cur,
240 int i,
241 xfs_bmbt_key_t *k,
242 int line)
243{
244 xfs_dfiloff_t o;
245
246 o = be64_to_cpu(k->br_startoff);
247 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
248 i, o >> 32, (int)o, 0,
249 0, 0, 0, 0,
250 0, 0, 0);
251}
252
253/*
254 * Add a trace buffer entry for the cursor/operation.
255 */
256STATIC void
257xfs_bmbt_trace_cursor(
258 const char *func,
259 xfs_btree_cur_t *cur,
260 char *s,
261 int line)
262{
263 xfs_bmbt_rec_host_t r;
264
265 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
266 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
267 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
268 cur->bc_private.b.allocated,
269 r.l0 >> 32, (int)r.l0,
270 r.l1 >> 32, (int)r.l1,
271 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
272 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
273 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
274 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
275}
276
277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
278 xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
280 xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
282 xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
283#define XFS_BMBT_TRACE_ARGI(c,i) \
284 xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
286 xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
288 xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
290 xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
291#define XFS_BMBT_TRACE_CURSOR(c,s) \
292 xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
293#else
294#define XFS_BMBT_TRACE_ARGBI(c,b,i)
295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
296#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
297#define XFS_BMBT_TRACE_ARGI(c,i)
298#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
299#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
300#define XFS_BMBT_TRACE_ARGIK(c,i,k)
301#define XFS_BMBT_TRACE_CURSOR(c,s)
302#endif /* XFS_BMBT_TRACE */
303
304
305/*
306 * Internal functions.
307 */
308
309/*
310 * Delete record pointed to by cur/level.
311 */
312STATIC int /* error */
313xfs_bmbt_delrec(
314 xfs_btree_cur_t *cur,
315 int level,
316 int *stat) /* success/failure */
317{
318 xfs_bmbt_block_t *block; /* bmap btree block */
319 xfs_fsblock_t bno; /* fs-relative block number */
320 xfs_buf_t *bp; /* buffer for block */
321 int error; /* error return value */
322 int i; /* loop counter */
323 int j; /* temp state */
324 xfs_bmbt_key_t key; /* bmap btree key */
325 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
326 xfs_fsblock_t lbno; /* left sibling block number */
327 xfs_buf_t *lbp; /* left buffer pointer */
328 xfs_bmbt_block_t *left; /* left btree block */
329 xfs_bmbt_key_t *lkp; /* left btree key */
330 xfs_bmbt_ptr_t *lpp; /* left address pointer */
331 int lrecs=0; /* left record count */
332 xfs_bmbt_rec_t *lrp; /* left record pointer */
333 xfs_mount_t *mp; /* file system mount point */
334 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
335 int ptr; /* key/record index */
336 xfs_fsblock_t rbno; /* right sibling block number */
337 xfs_buf_t *rbp; /* right buffer pointer */
338 xfs_bmbt_block_t *right; /* right btree block */
339 xfs_bmbt_key_t *rkp; /* right btree key */
340 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
341 xfs_bmbt_ptr_t *rpp; /* right address pointer */
342 xfs_bmbt_block_t *rrblock; /* right-right btree block */
343 xfs_buf_t *rrbp; /* right-right buffer pointer */
344 int rrecs=0; /* right record count */
345 xfs_bmbt_rec_t *rrp; /* right record pointer */
346 xfs_btree_cur_t *tcur; /* temporary btree cursor */
347 int numrecs; /* temporary numrec count */
348 int numlrecs, numrrecs;
349
350 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
351 XFS_BMBT_TRACE_ARGI(cur, level);
352 ptr = cur->bc_ptrs[level];
353 tcur = NULL;
354 if (ptr == 0) {
355 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
356 *stat = 0;
357 return 0;
358 }
359 block = xfs_bmbt_get_block(cur, level, &bp);
360 numrecs = be16_to_cpu(block->bb_numrecs);
361#ifdef DEBUG
362 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
364 goto error0;
365 }
366#endif
367 if (ptr > numrecs) {
368 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
369 *stat = 0;
370 return 0;
371 }
372 XFS_STATS_INC(xs_bmbt_delrec);
373 if (level > 0) {
374 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
375 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
376#ifdef DEBUG
377 for (i = ptr; i < numrecs; i++) {
378 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
379 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
380 goto error0;
381 }
382 }
383#endif
384 if (ptr < numrecs) {
385 memmove(&kp[ptr - 1], &kp[ptr],
386 (numrecs - ptr) * sizeof(*kp));
387 memmove(&pp[ptr - 1], &pp[ptr],
388 (numrecs - ptr) * sizeof(*pp));
389 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
390 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
391 }
392 } else {
393 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
394 if (ptr < numrecs) {
395 memmove(&rp[ptr - 1], &rp[ptr],
396 (numrecs - ptr) * sizeof(*rp));
397 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
398 }
399 if (ptr == 1) {
400 key.br_startoff =
401 cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
402 kp = &key;
403 }
404 }
405 numrecs--;
406 block->bb_numrecs = cpu_to_be16(numrecs);
407 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
408 /*
409 * We're at the root level.
410 * First, shrink the root block in-memory.
411 * Try to get rid of the next level down.
412 * If we can't then there's nothing left to do.
413 */
414 if (level == cur->bc_nlevels - 1) {
415 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
416 cur->bc_private.b.whichfork);
417 if ((error = xfs_bmbt_killroot(cur))) {
418 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
419 goto error0;
420 }
421 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
422 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
423 goto error0;
424 }
425 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
426 *stat = 1;
427 return 0;
428 }
429 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
430 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
431 goto error0;
432 }
433 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
434 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
435 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
436 goto error0;
437 }
438 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
439 *stat = 1;
440 return 0;
441 }
442 rbno = be64_to_cpu(block->bb_rightsib);
443 lbno = be64_to_cpu(block->bb_leftsib);
444 /*
445 * One child of root, need to get a chance to copy its contents
446 * into the root and delete it. Can't go up to next level,
447 * there's nothing to delete there.
448 */
449 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
450 level == cur->bc_nlevels - 2) {
451 if ((error = xfs_bmbt_killroot(cur))) {
452 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
453 goto error0;
454 }
455 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
456 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
457 goto error0;
458 }
459 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
460 *stat = 1;
461 return 0;
462 }
463 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
464 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
465 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
466 goto error0;
467 }
468 bno = NULLFSBLOCK;
469 if (rbno != NULLFSBLOCK) {
470 i = xfs_btree_lastrec(tcur, level);
471 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
472 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
473 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
474 goto error0;
475 }
476 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
477 i = xfs_btree_lastrec(tcur, level);
478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
479 rbp = tcur->bc_bufs[level];
480 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
481#ifdef DEBUG
482 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
483 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
484 goto error0;
485 }
486#endif
487 bno = be64_to_cpu(right->bb_leftsib);
488 if (be16_to_cpu(right->bb_numrecs) - 1 >=
489 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
490 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
491 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
492 goto error0;
493 }
494 if (i) {
495 ASSERT(be16_to_cpu(block->bb_numrecs) >=
496 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
497 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
498 tcur = NULL;
499 if (level > 0) {
500 if ((error = xfs_bmbt_decrement(cur,
501 level, &i))) {
502 XFS_BMBT_TRACE_CURSOR(cur,
503 ERROR);
504 goto error0;
505 }
506 }
507 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
508 *stat = 1;
509 return 0;
510 }
511 }
512 rrecs = be16_to_cpu(right->bb_numrecs);
513 if (lbno != NULLFSBLOCK) {
514 i = xfs_btree_firstrec(tcur, level);
515 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
516 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
517 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
518 goto error0;
519 }
520 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
521 }
522 }
523 if (lbno != NULLFSBLOCK) {
524 i = xfs_btree_firstrec(tcur, level);
525 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
526 /*
527 * decrement to last in block
528 */
529 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
530 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
531 goto error0;
532 }
533 i = xfs_btree_firstrec(tcur, level);
534 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
535 lbp = tcur->bc_bufs[level];
536 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
537#ifdef DEBUG
538 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
539 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
540 goto error0;
541 }
542#endif
543 bno = be64_to_cpu(left->bb_rightsib);
544 if (be16_to_cpu(left->bb_numrecs) - 1 >=
545 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
546 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
547 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
548 goto error0;
549 }
550 if (i) {
551 ASSERT(be16_to_cpu(block->bb_numrecs) >=
552 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
553 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
554 tcur = NULL;
555 if (level == 0)
556 cur->bc_ptrs[0]++;
557 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
558 *stat = 1;
559 return 0;
560 }
561 }
562 lrecs = be16_to_cpu(left->bb_numrecs);
563 }
564 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
565 tcur = NULL;
566 mp = cur->bc_mp;
567 ASSERT(bno != NULLFSBLOCK);
568 if (lbno != NULLFSBLOCK &&
569 lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
570 rbno = bno;
571 right = block;
572 rbp = bp;
573 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
574 XFS_BMAP_BTREE_REF))) {
575 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
576 goto error0;
577 }
578 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
579 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
581 goto error0;
582 }
583 } else if (rbno != NULLFSBLOCK &&
584 rrecs + be16_to_cpu(block->bb_numrecs) <=
585 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
586 lbno = bno;
587 left = block;
588 lbp = bp;
589 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
590 XFS_BMAP_BTREE_REF))) {
591 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
592 goto error0;
593 }
594 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
595 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
596 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
597 goto error0;
598 }
599 lrecs = be16_to_cpu(left->bb_numrecs);
600 } else {
601 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
602 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
603 goto error0;
604 }
605 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
606 *stat = 1;
607 return 0;
608 }
609 numlrecs = be16_to_cpu(left->bb_numrecs);
610 numrrecs = be16_to_cpu(right->bb_numrecs);
611 if (level > 0) {
612 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
613 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
614 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
615 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
616#ifdef DEBUG
617 for (i = 0; i < numrrecs; i++) {
618 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
619 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
620 goto error0;
621 }
622 }
623#endif
624 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
625 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
626 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
627 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
628 } else {
629 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
630 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
631 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
632 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
633 }
634 be16_add_cpu(&left->bb_numrecs, numrrecs);
635 left->bb_rightsib = right->bb_rightsib;
636 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
637 if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
638 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
639 be64_to_cpu(left->bb_rightsib),
640 0, &rrbp, XFS_BMAP_BTREE_REF))) {
641 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
642 goto error0;
643 }
644 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
645 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
646 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
647 goto error0;
648 }
649 rrblock->bb_leftsib = cpu_to_be64(lbno);
650 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
651 }
652 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
653 cur->bc_private.b.flist, mp);
654 cur->bc_private.b.ip->i_d.di_nblocks--;
655 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
656 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
657 XFS_TRANS_DQ_BCOUNT, -1L);
658 xfs_trans_binval(cur->bc_tp, rbp);
659 if (bp != lbp) {
660 cur->bc_bufs[level] = lbp;
661 cur->bc_ptrs[level] += lrecs;
662 cur->bc_ra[level] = 0;
663 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
664 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
665 goto error0;
666 }
667 if (level > 0)
668 cur->bc_ptrs[level]--;
669 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
670 *stat = 2;
671 return 0;
672
673error0:
674 if (tcur)
675 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
676 return error;
677}
678
679/*
680 * Insert one record/level. Return information to the caller
681 * allowing the next level up to proceed if necessary.
682 */
683STATIC int /* error */
684xfs_bmbt_insrec(
685 xfs_btree_cur_t *cur,
686 int level,
687 xfs_fsblock_t *bnop,
688 xfs_bmbt_rec_t *recp,
689 xfs_btree_cur_t **curp,
690 int *stat) /* no-go/done/continue */
691{
692 xfs_bmbt_block_t *block; /* bmap btree block */
693 xfs_buf_t *bp; /* buffer for block */
694 int error; /* error return value */
695 int i; /* loop index */
696 xfs_bmbt_key_t key; /* bmap btree key */
697 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
698 int logflags; /* inode logging flags */
699 xfs_fsblock_t nbno; /* new block number */
700 struct xfs_btree_cur *ncur; /* new btree cursor */
701 __uint64_t startoff; /* new btree key value */
702 xfs_bmbt_rec_t nrec; /* new record count */
703 int optr; /* old key/record index */
704 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
705 int ptr; /* key/record index */
706 xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
707 int numrecs;
708
709 ASSERT(level < cur->bc_nlevels);
710 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
711 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
712 ncur = NULL;
713 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
714 optr = ptr = cur->bc_ptrs[level];
715 if (ptr == 0) {
716 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
717 *stat = 0;
718 return 0;
719 }
720 XFS_STATS_INC(xs_bmbt_insrec);
721 block = xfs_bmbt_get_block(cur, level, &bp);
722 numrecs = be16_to_cpu(block->bb_numrecs);
723#ifdef DEBUG
724 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
725 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
726 return error;
727 }
728 if (ptr <= numrecs) {
729 if (level == 0) {
730 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
731 xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
732 } else {
733 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
734 xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
735 }
736 }
737#endif
738 nbno = NULLFSBLOCK;
739 if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
740 if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
741 /*
742 * A root block, that can be made bigger.
743 */
744 xfs_iroot_realloc(cur->bc_private.b.ip, 1,
745 cur->bc_private.b.whichfork);
746 block = xfs_bmbt_get_block(cur, level, &bp);
747 } else if (level == cur->bc_nlevels - 1) {
748 if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
749 *stat == 0) {
750 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
751 return error;
752 }
753 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
754 logflags);
755 block = xfs_bmbt_get_block(cur, level, &bp);
756 } else {
757 if ((error = xfs_bmbt_rshift(cur, level, &i))) {
758 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
759 return error;
760 }
761 if (i) {
762 /* nothing */
763 } else {
764 if ((error = xfs_bmbt_lshift(cur, level, &i))) {
765 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
766 return error;
767 }
768 if (i) {
769 optr = ptr = cur->bc_ptrs[level];
770 } else {
771 if ((error = xfs_bmbt_split(cur, level,
772 &nbno, &startoff, &ncur,
773 &i))) {
774 XFS_BMBT_TRACE_CURSOR(cur,
775 ERROR);
776 return error;
777 }
778 if (i) {
779 block = xfs_bmbt_get_block(
780 cur, level, &bp);
781#ifdef DEBUG
782 if ((error =
783 xfs_btree_check_lblock(cur,
784 block, level, bp))) {
785 XFS_BMBT_TRACE_CURSOR(
786 cur, ERROR);
787 return error;
788 }
789#endif
790 ptr = cur->bc_ptrs[level];
791 xfs_bmbt_disk_set_allf(&nrec,
792 startoff, 0, 0,
793 XFS_EXT_NORM);
794 } else {
795 XFS_BMBT_TRACE_CURSOR(cur,
796 EXIT);
797 *stat = 0;
798 return 0;
799 }
800 }
801 }
802 }
803 }
804 numrecs = be16_to_cpu(block->bb_numrecs);
805 if (level > 0) {
806 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
807 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
808#ifdef DEBUG
809 for (i = numrecs; i >= ptr; i--) {
810 if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
811 level))) {
812 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
813 return error;
814 }
815 }
816#endif
817 memmove(&kp[ptr], &kp[ptr - 1],
818 (numrecs - ptr + 1) * sizeof(*kp));
819 memmove(&pp[ptr], &pp[ptr - 1],
820 (numrecs - ptr + 1) * sizeof(*pp));
821#ifdef DEBUG
822 if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
823 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
824 return error;
825 }
826#endif
827 kp[ptr - 1] = key;
828 pp[ptr - 1] = cpu_to_be64(*bnop);
829 numrecs++;
830 block->bb_numrecs = cpu_to_be16(numrecs);
831 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
832 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
833 } else {
834 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
835 memmove(&rp[ptr], &rp[ptr - 1],
836 (numrecs - ptr + 1) * sizeof(*rp));
837 rp[ptr - 1] = *recp;
838 numrecs++;
839 block->bb_numrecs = cpu_to_be16(numrecs);
840 xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
841 }
842 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
843#ifdef DEBUG
844 if (ptr < numrecs) {
845 if (level == 0)
846 xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
847 rp + ptr);
848 else
849 xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
850 kp + ptr);
851 }
852#endif
853 if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
854 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
855 return error;
856 }
857 *bnop = nbno;
858 if (nbno != NULLFSBLOCK) {
859 *recp = nrec;
860 *curp = ncur;
861 }
862 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
863 *stat = 1;
864 return 0;
865}
866
867STATIC int
868xfs_bmbt_killroot(
869 xfs_btree_cur_t *cur)
870{
871 xfs_bmbt_block_t *block;
872 xfs_bmbt_block_t *cblock;
873 xfs_buf_t *cbp;
874 xfs_bmbt_key_t *ckp;
875 xfs_bmbt_ptr_t *cpp;
876#ifdef DEBUG
877 int error;
878#endif
879 int i;
880 xfs_bmbt_key_t *kp;
881 xfs_inode_t *ip;
882 xfs_ifork_t *ifp;
883 int level;
884 xfs_bmbt_ptr_t *pp;
885
886 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
887 level = cur->bc_nlevels - 1;
888 ASSERT(level >= 1);
889 /*
890 * Don't deal with the root block needs to be a leaf case.
891 * We're just going to turn the thing back into extents anyway.
892 */
893 if (level == 1) {
894 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
895 return 0;
896 }
897 block = xfs_bmbt_get_block(cur, level, &cbp);
898 /*
899 * Give up if the root has multiple children.
900 */
901 if (be16_to_cpu(block->bb_numrecs) != 1) {
902 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
903 return 0;
904 }
905 /*
906 * Only do this if the next level will fit.
907 * Then the data must be copied up to the inode,
908 * instead of freeing the root you free the next level.
909 */
910 cbp = cur->bc_bufs[level - 1];
911 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
912 if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
913 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
914 return 0;
915 }
916 ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
917 ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
918 ip = cur->bc_private.b.ip;
919 ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
920 ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
921 XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
922 i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
923 if (i) {
924 xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
925 block = ifp->if_broot;
926 }
927 be16_add_cpu(&block->bb_numrecs, i);
928 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
929 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
930 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
931 memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
932 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
933 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
934#ifdef DEBUG
935 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
936 if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
937 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
938 return error;
939 }
940 }
941#endif
942 memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
943 xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
944 cur->bc_private.b.flist, cur->bc_mp);
945 ip->i_d.di_nblocks--;
946 XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
947 XFS_TRANS_DQ_BCOUNT, -1L);
948 xfs_trans_binval(cur->bc_tp, cbp);
949 cur->bc_bufs[level - 1] = NULL;
950 be16_add_cpu(&block->bb_level, -1);
951 xfs_trans_log_inode(cur->bc_tp, ip,
952 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
953 cur->bc_nlevels--;
954 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
955 return 0;
956}
957
958/*
959 * Log key values from the btree block.
960 */
961STATIC void
962xfs_bmbt_log_keys(
963 xfs_btree_cur_t *cur,
964 xfs_buf_t *bp,
965 int kfirst,
966 int klast)
967{
968 xfs_trans_t *tp;
969
970 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
971 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
972 tp = cur->bc_tp;
973 if (bp) {
974 xfs_bmbt_block_t *block;
975 int first;
976 xfs_bmbt_key_t *kp;
977 int last;
978
979 block = XFS_BUF_TO_BMBT_BLOCK(bp);
980 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
981 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
982 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
983 xfs_trans_log_buf(tp, bp, first, last);
984 } else {
985 xfs_inode_t *ip;
986
987 ip = cur->bc_private.b.ip;
988 xfs_trans_log_inode(tp, ip,
989 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
990 }
991 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
992}
993
994/*
995 * Log pointer values from the btree block.
996 */
997STATIC void
998xfs_bmbt_log_ptrs(
999 xfs_btree_cur_t *cur,
1000 xfs_buf_t *bp,
1001 int pfirst,
1002 int plast)
1003{
1004 xfs_trans_t *tp;
1005
1006 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1007 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1008 tp = cur->bc_tp;
1009 if (bp) {
1010 xfs_bmbt_block_t *block;
1011 int first;
1012 int last;
1013 xfs_bmbt_ptr_t *pp;
1014
1015 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1016 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1017 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1018 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1019 xfs_trans_log_buf(tp, bp, first, last);
1020 } else {
1021 xfs_inode_t *ip;
1022
1023 ip = cur->bc_private.b.ip;
1024 xfs_trans_log_inode(tp, ip,
1025 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 }
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028}
1029
1030/*
1031 * Lookup the record. The cursor is made to point to it, based on dir.
1032 */
1033STATIC int /* error */
1034xfs_bmbt_lookup(
1035 xfs_btree_cur_t *cur,
1036 xfs_lookup_t dir,
1037 int *stat) /* success/failure */
1038{
1039 xfs_bmbt_block_t *block=NULL;
1040 xfs_buf_t *bp;
1041 xfs_daddr_t d;
1042 xfs_sfiloff_t diff;
1043 int error; /* error return value */
1044 xfs_fsblock_t fsbno=0;
1045 int high;
1046 int i;
1047 int keyno=0;
1048 xfs_bmbt_key_t *kkbase=NULL;
1049 xfs_bmbt_key_t *kkp;
1050 xfs_bmbt_rec_t *krbase=NULL;
1051 xfs_bmbt_rec_t *krp;
1052 int level;
1053 int low;
1054 xfs_mount_t *mp;
1055 xfs_bmbt_ptr_t *pp;
1056 xfs_bmbt_irec_t *rp;
1057 xfs_fileoff_t startoff;
1058 xfs_trans_t *tp;
1059
1060 XFS_STATS_INC(xs_bmbt_lookup);
1061 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1062 XFS_BMBT_TRACE_ARGI(cur, (int)dir);
1063 tp = cur->bc_tp;
1064 mp = cur->bc_mp;
1065 rp = &cur->bc_rec.b;
1066 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1067 if (level < cur->bc_nlevels - 1) {
1068 d = XFS_FSB_TO_DADDR(mp, fsbno);
1069 bp = cur->bc_bufs[level];
1070 if (bp && XFS_BUF_ADDR(bp) != d)
1071 bp = NULL;
1072 if (!bp) {
1073 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1074 0, &bp, XFS_BMAP_BTREE_REF))) {
1075 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1076 return error;
1077 }
1078 xfs_btree_setbuf(cur, level, bp);
1079 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1080 if ((error = xfs_btree_check_lblock(cur, block,
1081 level, bp))) {
1082 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1083 return error;
1084 }
1085 } else
1086 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1087 } else
1088 block = xfs_bmbt_get_block(cur, level, &bp);
1089 if (diff == 0)
1090 keyno = 1;
1091 else {
1092 if (level > 0)
1093 kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
1094 else
1095 krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
1096 low = 1;
1097 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1098 ASSERT(level == 0);
1099 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1100 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1101 *stat = 0;
1102 return 0;
1103 }
1104 while (low <= high) {
1105 XFS_STATS_INC(xs_bmbt_compare);
1106 keyno = (low + high) >> 1;
1107 if (level > 0) {
1108 kkp = kkbase + keyno - 1;
1109 startoff = be64_to_cpu(kkp->br_startoff);
1110 } else {
1111 krp = krbase + keyno - 1;
1112 startoff = xfs_bmbt_disk_get_startoff(krp);
1113 }
1114 diff = (xfs_sfiloff_t)
1115 (startoff - rp->br_startoff);
1116 if (diff < 0)
1117 low = keyno + 1;
1118 else if (diff > 0)
1119 high = keyno - 1;
1120 else
1121 break;
1122 }
1123 }
1124 if (level > 0) {
1125 if (diff > 0 && --keyno < 1)
1126 keyno = 1;
1127 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1128 fsbno = be64_to_cpu(*pp);
1129#ifdef DEBUG
1130 if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
1131 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1132 return error;
1133 }
1134#endif
1135 cur->bc_ptrs[level] = keyno;
1136 }
1137 }
1138 if (dir != XFS_LOOKUP_LE && diff < 0) {
1139 keyno++;
1140 /*
1141 * If ge search and we went off the end of the block, but it's
1142 * not the last block, we're in the wrong block.
1143 */
1144 if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
1145 be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
1146 cur->bc_ptrs[0] = keyno;
1147 if ((error = xfs_bmbt_increment(cur, 0, &i))) {
1148 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1149 return error;
1150 }
1151 XFS_WANT_CORRUPTED_RETURN(i == 1);
1152 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1153 *stat = 1;
1154 return 0;
1155 }
1156 }
1157 else if (dir == XFS_LOOKUP_LE && diff > 0)
1158 keyno--;
1159 cur->bc_ptrs[0] = keyno;
1160 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
1161 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1162 *stat = 0;
1163 } else {
1164 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1165 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1166 }
1167 return 0;
1168}
1169
1170/*
1171 * Move 1 record left from cur/level if possible.
1172 * Update cur to reflect the new path.
1173 */
STATIC int					/* error */
xfs_bmbt_lshift(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	int			error;		/* error return value */
#ifdef DEBUG
	int			i;		/* loop counter */
#endif
	xfs_bmbt_key_t		key;		/* bmap btree key */
	xfs_buf_t		*lbp;		/* left buffer pointer */
	xfs_bmbt_block_t	*left;		/* left btree block */
	xfs_bmbt_key_t		*lkp=NULL;	/* left btree key */
	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
	int			lrecs;		/* left record count */
	xfs_bmbt_rec_t		*lrp=NULL;	/* left record pointer */
	xfs_mount_t		*mp;		/* file system mount point */
	xfs_buf_t		*rbp;		/* right buffer pointer */
	xfs_bmbt_block_t	*right;		/* right btree block */
	xfs_bmbt_key_t		*rkp=NULL;	/* right btree key */
	xfs_bmbt_ptr_t		*rpp=NULL;	/* right address pointer */
	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
	int			rrecs;		/* right record count */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	/* The root level has no left sibling to shift into. */
	if (level == cur->bc_nlevels - 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* The block the cursor currently points at becomes the "right" block. */
	rbp = cur->bc_bufs[level];
	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* No left sibling block: nothing to shift into. */
	if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/*
	 * Cursor points at the first entry; shifting it left would move the
	 * entry the cursor refers to out of this block, so refuse.
	 */
	if (cur->bc_ptrs[level] <= 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	mp = cur->bc_mp;
	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
			&lbp, XFS_BMAP_BTREE_REF))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* Left sibling already full: no room for one more entry. */
	if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Index of the new last slot in the left block. */
	lrecs = be16_to_cpu(left->bb_numrecs) + 1;
	/*
	 * Copy the right block's first entry into the left block's new last
	 * slot: key+ptr pair at interior levels, a record at the leaf level.
	 * Each modified region is logged as it is written.
	 */
	if (level > 0) {
		lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
		*lkp = *rkp;
		xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
		lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		*lpp = *rpp;
		xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
	} else {
		lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
		*lrp = *rrp;
		xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
	}
	left->bb_numrecs = cpu_to_be16(lrecs);
	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
#ifdef DEBUG
	/* The moved entry must still sort after the left block's old last one. */
	if (level > 0)
		xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
	else
		xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
#endif
	rrecs = be16_to_cpu(right->bb_numrecs) - 1;
	right->bb_numrecs = cpu_to_be16(rrecs);
	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
	/* Close the hole left at slot 1 of the right block. */
	if (level > 0) {
#ifdef DEBUG
		for (i = 0; i < rrecs; i++) {
			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
					level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
		memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
		xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
		xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
	} else {
		memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
		xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
		/*
		 * Leaf level has no keys on disk; synthesize the right
		 * block's new first key for the parent update below.
		 */
		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
		rkp = &key;
	}
	/* The right block's first key changed; fix the parent's copy of it. */
	if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* The entry the cursor points at moved down one slot. */
	cur->bc_ptrs[level]--;
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
1302
1303/*
1304 * Move 1 record right from cur/level if possible.
1305 * Update cur to reflect the new path.
1306 */
STATIC int					/* error */
xfs_bmbt_rshift(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	int			error;		/* error return value */
	int			i;		/* loop counter */
	xfs_bmbt_key_t		key;		/* bmap btree key */
	xfs_buf_t		*lbp;		/* left buffer pointer */
	xfs_bmbt_block_t	*left;		/* left btree block */
	xfs_bmbt_key_t		*lkp;		/* left btree key */
	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
	xfs_mount_t		*mp;		/* file system mount point */
	xfs_buf_t		*rbp;		/* right buffer pointer */
	xfs_bmbt_block_t	*right;		/* right btree block */
	xfs_bmbt_key_t		*rkp;		/* right btree key */
	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	/* The root level has no right sibling to shift into. */
	if (level == cur->bc_nlevels - 1) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* The block the cursor currently points at becomes the "left" block. */
	lbp = cur->bc_bufs[level];
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* No right sibling block: nothing to shift into. */
	if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/*
	 * Cursor points at (or past) the last entry; shifting it right would
	 * move the entry the cursor refers to, so refuse.
	 */
	if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	mp = cur->bc_mp;
	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
			&rbp, XFS_BMAP_BTREE_REF))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	/* Right sibling already full: no room for one more entry. */
	if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/*
	 * Open up slot 1 in the right block and copy the left block's last
	 * entry into it: key+ptr pair at interior levels, a record at the
	 * leaf level.
	 */
	if (level > 0) {
		lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
#ifdef DEBUG
		for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
		memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		*rkp = *lkp;
		*rpp = *lpp;
		xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
		xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
	} else {
		lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
		memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
		*rrp = *lrp;
		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
		/*
		 * Leaf level has no keys on disk; synthesize the right
		 * block's new first key for the parent update below.
		 */
		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
		rkp = &key;
	}
	be16_add_cpu(&left->bb_numrecs, -1);
	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
	be16_add_cpu(&right->bb_numrecs, 1);
#ifdef DEBUG
	/* The moved entry must still sort before the right block's old first. */
	if (level > 0)
		xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
	else
		xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
#endif
	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
	/*
	 * The right block's first key changed, but the parent entry for it
	 * lives on a different path than cur's.  Use a duplicate cursor,
	 * step it to the right sibling, and update the key from there.
	 */
	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	i = xfs_btree_lastrec(tcur, level);
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
	if ((error = xfs_bmbt_increment(tcur, level, &i))) {
		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
		goto error1;
	}
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
	if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
		goto error1;
	}
	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
error0:
	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
	/* error0 falls through: tcur is torn down on both error paths. */
error1:
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}
1439
1440/* 47/*
1441 * Determine the extent state. 48 * Determine the extent state.
1442 */ 49 */
@@ -1453,229 +60,15 @@ xfs_extent_state(
1453 return XFS_EXT_NORM; 60 return XFS_EXT_NORM;
1454} 61}
1455 62
1456
1457/*
1458 * Split cur/level block in half.
1459 * Return new block number and its first record (to be inserted into parent).
1460 */
STATIC int					/* error */
xfs_bmbt_split(
	xfs_btree_cur_t		*cur,
	int			level,
	xfs_fsblock_t		*bnop,
	__uint64_t		*startoff,
	xfs_btree_cur_t		**curp,
	int			*stat)		/* success/failure */
{
	xfs_alloc_arg_t		args;		/* block allocation args */
	int			error;		/* error return value */
	int			i;		/* loop counter */
	xfs_fsblock_t		lbno;		/* left sibling block number */
	xfs_buf_t		*lbp;		/* left buffer pointer */
	xfs_bmbt_block_t	*left;		/* left btree block */
	xfs_bmbt_key_t		*lkp;		/* left btree key */
	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
	xfs_buf_t		*rbp;		/* right buffer pointer */
	xfs_bmbt_block_t	*right;		/* right btree block */
	xfs_bmbt_key_t		*rkp;		/* right btree key */
	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
	xfs_bmbt_block_t	*rrblock;	/* right-right btree block */
	xfs_buf_t		*rrbp;		/* right-right buffer pointer */
	xfs_bmbt_rec_t		*rrp;		/* right record pointer */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
	/* The existing block becomes the left half; allocate a new right half. */
	args.tp = cur->bc_tp;
	args.mp = cur->bc_mp;
	lbp = cur->bc_bufs[level];
	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
	args.fsbno = cur->bc_private.b.firstblock;
	args.firstblock = args.fsbno;
	args.minleft = 0;
	if (args.fsbno == NULLFSBLOCK) {
		args.fsbno = lbno;
		args.type = XFS_ALLOCTYPE_START_BNO;
		/*
		 * Make sure there is sufficient room left in the AG to
		 * complete a full tree split for an extent insert.  If
		 * we are converting the middle part of an extent then
		 * we may need space for two tree splits.
		 *
		 * We are relying on the caller to make the correct block
		 * reservation for this operation to succeed.  If the
		 * reservation amount is insufficient then we may fail a
		 * block allocation here and corrupt the filesystem.
		 */
		args.minleft = xfs_trans_get_block_res(args.tp);
	} else if (cur->bc_private.b.flist->xbf_low)
		args.type = XFS_ALLOCTYPE_START_BNO;
	else
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
	args.mod = args.alignment = args.total = args.isfl =
		args.userdata = args.minalignslop = 0;
	args.minlen = args.maxlen = args.prod = 1;
	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return XFS_ERROR(ENOSPC);
	}
	if ((error = xfs_alloc_vextent(&args))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	if (args.fsbno == NULLFSBLOCK && args.minleft) {
		/*
		 * Could not find an AG with enough free space to satisfy
		 * a full btree split.  Try again without minleft and if
		 * successful activate the lowspace algorithm.
		 */
		args.fsbno = 0;
		args.type = XFS_ALLOCTYPE_FIRST_AG;
		args.minleft = 0;
		if ((error = xfs_alloc_vextent(&args))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		cur->bc_private.b.flist->xbf_low = 1;
	}
	if (args.fsbno == NULLFSBLOCK) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	ASSERT(args.len == 1);
	/* Account the new btree block against the inode and its quota. */
	cur->bc_private.b.firstblock = args.fsbno;
	cur->bc_private.b.allocated++;
	cur->bc_private.b.ip->i_d.di_nblocks++;
	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
	XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
			XFS_TRANS_DQ_BCOUNT, 1L);
	rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
#ifdef DEBUG
	/*
	 * NOTE(review): this validates 'left' while passing the new right
	 * buffer 'rbp' — looks odd; confirm against xfs_btree_check_lblock's
	 * use of the bp argument before changing it.
	 */
	if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/*
	 * Initialize the right block header; it takes roughly half the
	 * entries, biased so the cursor's entry stays in a legal slot.
	 */
	right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
	right->bb_level = left->bb_level;
	right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
	if ((be16_to_cpu(left->bb_numrecs) & 1) &&
	    cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
		be16_add_cpu(&right->bb_numrecs, 1);
	/* i = first left-block slot that moves to the right block. */
	i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
	if (level > 0) {
		lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
		lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
#ifdef DEBUG
		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
			if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
				return error;
			}
		}
#endif
		memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
		memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
		xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
		xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
		/* First key of the new block, to be inserted in the parent. */
		*startoff = be64_to_cpu(rkp->br_startoff);
	} else {
		lrp = XFS_BMAP_REC_IADDR(left, i, cur);
		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
		memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
		*startoff = xfs_bmbt_disk_get_startoff(rrp);
	}
	/* Link the new block into the sibling chain: left <-> right <-> rr. */
	be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
	right->bb_rightsib = left->bb_rightsib;
	left->bb_rightsib = cpu_to_be64(args.fsbno);
	right->bb_leftsib = cpu_to_be64(lbno);
	xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
	if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
		if ((error = xfs_btree_read_bufl(args.mp, args.tp,
				be64_to_cpu(right->bb_rightsib), 0, &rrbp,
				XFS_BMAP_BTREE_REF))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
		if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
		xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}
	/* If the cursor's entry moved to the new block, follow it there. */
	if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
		xfs_btree_setbuf(cur, level, rbp);
		cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
	}
	/*
	 * Hand the caller a second cursor, positioned for inserting the
	 * new block's key into the parent level.
	 */
	if (level + 1 < cur->bc_nlevels) {
		if ((error = xfs_btree_dup_cursor(cur, curp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		(*curp)->bc_ptrs[level + 1]++;
	}
	*bnop = args.fsbno;
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
1632
1633
1634/*
1635 * Update keys for the record.
1636 */
STATIC int
xfs_bmbt_updkey(
	xfs_btree_cur_t		*cur,
	xfs_bmbt_key_t		*keyp,	/* on-disk format */
	int			level)
{
	xfs_bmbt_block_t	*block;
	xfs_buf_t		*bp;
#ifdef DEBUG
	int			error;
#endif
	xfs_bmbt_key_t		*kp;
	int			ptr;

	ASSERT(level >= 1);
	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
	/*
	 * Write the new key into the cursor's slot at each level, walking
	 * up the tree.  Propagation continues only while the updated entry
	 * is the first one (ptr == 1) in its block, since only then is the
	 * block's first key mirrored in its parent.
	 */
	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
		block = xfs_bmbt_get_block(cur, level, &bp);
#ifdef DEBUG
		if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		ptr = cur->bc_ptrs[level];
		kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
		*kp = *keyp;
		xfs_bmbt_log_keys(cur, bp, ptr, ptr);
	}
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	return 0;
}
1670
1671/* 63/*
1672 * Convert on-disk form of btree root to in-memory form. 64 * Convert on-disk form of btree root to in-memory form.
1673 */ 65 */
1674void 66void
1675xfs_bmdr_to_bmbt( 67xfs_bmdr_to_bmbt(
68 struct xfs_mount *mp,
1676 xfs_bmdr_block_t *dblock, 69 xfs_bmdr_block_t *dblock,
1677 int dblocklen, 70 int dblocklen,
1678 xfs_bmbt_block_t *rblock, 71 struct xfs_btree_block *rblock,
1679 int rblocklen) 72 int rblocklen)
1680{ 73{
1681 int dmxr; 74 int dmxr;
@@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt(
1688 rblock->bb_level = dblock->bb_level; 81 rblock->bb_level = dblock->bb_level;
1689 ASSERT(be16_to_cpu(rblock->bb_level) > 0); 82 ASSERT(be16_to_cpu(rblock->bb_level) > 0);
1690 rblock->bb_numrecs = dblock->bb_numrecs; 83 rblock->bb_numrecs = dblock->bb_numrecs;
1691 rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); 84 rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
1692 rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); 85 rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
1693 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); 86 dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
1694 fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); 87 fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
1695 tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); 88 tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
1696 fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); 89 fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
1697 tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); 90 tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
1698 dmxr = be16_to_cpu(dblock->bb_numrecs); 91 dmxr = be16_to_cpu(dblock->bb_numrecs);
1699 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 92 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1700 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); 93 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
1701} 94}
1702 95
1703/* 96/*
1704 * Decrement cursor by one record at the level.
1705 * For nonzero levels the leaf-ward information is untouched.
1706 */
int					/* error */
xfs_bmbt_decrement(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)	/* success/failure */
{
	xfs_bmbt_block_t	*block;
	xfs_buf_t		*bp;
	int			error;	/* error return value */
	xfs_fsblock_t		fsbno;
	int			lev;
	xfs_mount_t		*mp;
	xfs_trans_t		*tp;

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	ASSERT(level < cur->bc_nlevels);
	/* Prime readahead for the left sibling we may be about to visit. */
	if (level < cur->bc_nlevels - 1)
		xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
	/* Easy case: still within the current block after stepping back. */
	if (--cur->bc_ptrs[level] > 0) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 1;
		return 0;
	}
	block = xfs_bmbt_get_block(cur, level, &bp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* Fell off the front of the leftmost block: no previous record. */
	if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Walk up until some ancestor level can step back one entry. */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_ptrs[lev] > 0)
			break;
		if (lev < cur->bc_nlevels - 1)
			xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}
	/* Ran off the top of the tree: no previous record exists. */
	if (lev == cur->bc_nlevels) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	tp = cur->bc_tp;
	mp = cur->bc_mp;
	/*
	 * Walk back down, following the rightmost path: at each level read
	 * the child and position the cursor on its last entry.
	 */
	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
		fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
				XFS_BMAP_BTREE_REF))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		lev--;
		xfs_btree_setbuf(cur, lev, bp);
		block = XFS_BUF_TO_BMBT_BLOCK(bp);
		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
	}
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
1776
1777/*
1778 * Delete the record pointed to by cur.
1779 */
int					/* error */
xfs_bmbt_delete(
	xfs_btree_cur_t	*cur,
	int		*stat)		/* success/failure */
{
	int		error;		/* error return value */
	int		i;
	int		level;

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	/*
	 * Delete the record at the leaf, then keep calling delrec at each
	 * higher level for as long as it reports 2 ("join worked, continue
	 * one level up").  0/1 from delrec ends the loop.
	 */
	for (level = 0, i = 2; i == 2; level++) {
		if ((error = xfs_bmbt_delrec(cur, level, &i))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
	}
	/*
	 * Nothing was deleted (i == 0): delrec may have left an interior
	 * level pointing at slot 0; step that level back to make the
	 * cursor's path valid again.
	 */
	if (i == 0) {
		for (level = 1; level < cur->bc_nlevels; level++) {
			if (cur->bc_ptrs[level] == 0) {
				if ((error = xfs_bmbt_decrement(cur, level,
						&i))) {
					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
					return error;
				}
				break;
			}
		}
	}
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = i;
	return 0;
}
1812
1813/*
1814 * Convert a compressed bmap extent record to an uncompressed form. 97 * Convert a compressed bmap extent record to an uncompressed form.
1815 * This code must be in sync with the routines xfs_bmbt_get_startoff, 98 * This code must be in sync with the routines xfs_bmbt_get_startoff,
1816 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. 99 * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
@@ -1864,31 +147,6 @@ xfs_bmbt_get_all(
1864} 147}
1865 148
1866/* 149/*
1867 * Get the block pointer for the given level of the cursor.
1868 * Fill in the buffer pointer, if applicable.
1869 */
1870xfs_bmbt_block_t *
1871xfs_bmbt_get_block(
1872 xfs_btree_cur_t *cur,
1873 int level,
1874 xfs_buf_t **bpp)
1875{
1876 xfs_ifork_t *ifp;
1877 xfs_bmbt_block_t *rval;
1878
1879 if (level < cur->bc_nlevels - 1) {
1880 *bpp = cur->bc_bufs[level];
1881 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1882 } else {
1883 *bpp = NULL;
1884 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1885 cur->bc_private.b.whichfork);
1886 rval = ifp->if_broot;
1887 }
1888 return rval;
1889}
1890
1891/*
1892 * Extract the blockcount field from an in memory bmap extent record. 150 * Extract the blockcount field from an in memory bmap extent record.
1893 */ 151 */
1894xfs_filblks_t 152xfs_filblks_t
@@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all(
1950 xfs_bmbt_rec_t *r, 208 xfs_bmbt_rec_t *r,
1951 xfs_bmbt_irec_t *s) 209 xfs_bmbt_irec_t *s)
1952{ 210{
1953 __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s); 211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
1954} 213}
1955 214
1956/* 215/*
@@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff(
1974 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 233 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1975} 234}
1976 235
1977/*
1978 * Increment cursor by one record at the level.
1979 * For nonzero levels the leaf-ward information is untouched.
1980 */
int					/* error */
xfs_bmbt_increment(
	xfs_btree_cur_t		*cur,
	int			level,
	int			*stat)	/* success/failure */
{
	xfs_bmbt_block_t	*block;
	xfs_buf_t		*bp;
	int			error;	/* error return value */
	xfs_fsblock_t		fsbno;
	int			lev;
	xfs_mount_t		*mp;
	xfs_trans_t		*tp;

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGI(cur, level);
	ASSERT(level < cur->bc_nlevels);
	/* Prime readahead for the right sibling we may be about to visit. */
	if (level < cur->bc_nlevels - 1)
		xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
	block = xfs_bmbt_get_block(cur, level, &bp);
#ifdef DEBUG
	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* Easy case: still within the current block after stepping forward. */
	if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 1;
		return 0;
	}
	/* Fell off the end of the rightmost block: no next record. */
	if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	/* Walk up until some ancestor level can step forward one entry. */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_bmbt_get_block(cur, lev, &bp);
#ifdef DEBUG
		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
			break;
		if (lev < cur->bc_nlevels - 1)
			xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}
	/* Ran off the top of the tree: no next record exists. */
	if (lev == cur->bc_nlevels) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	tp = cur->bc_tp;
	mp = cur->bc_mp;
	/*
	 * Walk back down, following the leftmost path: at each level read
	 * the child and position the cursor on its first entry.
	 */
	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
		fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
				XFS_BMAP_BTREE_REF))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		lev--;
		xfs_btree_setbuf(cur, lev, bp);
		block = XFS_BUF_TO_BMBT_BLOCK(bp);
		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		cur->bc_ptrs[lev] = 1;
	}
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = 1;
	return 0;
}
2057
2058/*
2059 * Insert the current record at the point referenced by cur.
2060 *
2061 * A multi-level split of the tree on insert will invalidate the original
2062 * cursor. All callers of this function should assume that the cursor is
2063 * no longer valid and revalidate it.
2064 */
int					/* error */
xfs_bmbt_insert(
	xfs_btree_cur_t	*cur,
	int		*stat)		/* success/failure */
{
	int		error;		/* error return value */
	int		i;
	int		level;
	xfs_fsblock_t	nbno;
	xfs_btree_cur_t	*ncur;
	xfs_bmbt_rec_t	nrec;
	xfs_btree_cur_t	*pcur;

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	level = 0;
	nbno = NULLFSBLOCK;
	/* The record to insert was staged in the cursor by the caller. */
	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
	ncur = NULL;
	pcur = cur;
	/*
	 * Insert at the leaf; each insrec that splits a block hands back a
	 * new block number (nbno), a key record (nrec), and possibly a new
	 * cursor (ncur) positioned for inserting that key one level up.
	 * Loop until a level absorbs the insert without splitting.
	 */
	do {
		if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
				&i))) {
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
		/*
		 * Done with a temporary cursor: fold its allocation and
		 * firstblock state back into the caller's cursor before
		 * freeing it.
		 */
		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
			cur->bc_nlevels = pcur->bc_nlevels;
			cur->bc_private.b.allocated +=
				pcur->bc_private.b.allocated;
			pcur->bc_private.b.allocated = 0;
			ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
			       XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
			cur->bc_private.b.firstblock =
				pcur->bc_private.b.firstblock;
			ASSERT(cur->bc_private.b.flist ==
			       pcur->bc_private.b.flist);
			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
		}
		if (ncur) {
			pcur = ncur;
			ncur = NULL;
		}
	} while (nbno != NULLFSBLOCK);
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*stat = i;
	return 0;
error0:
	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
	return error;
}
2118
2119/*
2120 * Log fields from the btree block header.
2121 */
void
xfs_bmbt_log_block(
	xfs_btree_cur_t		*cur,
	xfs_buf_t		*bp,
	int			fields)
{
	int			first;
	int			last;
	xfs_trans_t		*tp;
	/* Byte offset of each loggable header field, terminated by the size. */
	static const short	offsets[] = {
		offsetof(xfs_bmbt_block_t, bb_magic),
		offsetof(xfs_bmbt_block_t, bb_level),
		offsetof(xfs_bmbt_block_t, bb_numrecs),
		offsetof(xfs_bmbt_block_t, bb_leftsib),
		offsetof(xfs_bmbt_block_t, bb_rightsib),
		sizeof(xfs_bmbt_block_t)
	};

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
	tp = cur->bc_tp;
	if (bp) {
		/* Map the field bitmask to a byte range and log that range. */
		xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
				  &last);
		xfs_trans_log_buf(tp, bp, first, last);
	} else
		/* No buffer: this is the root block, held in the inode fork. */
		xfs_trans_log_inode(tp, cur->bc_private.b.ip,
			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
}
2152
2153/*
2154 * Log record values from the btree block.
2155 */
2156void
2157xfs_bmbt_log_recs(
2158 xfs_btree_cur_t *cur,
2159 xfs_buf_t *bp,
2160 int rfirst,
2161 int rlast)
2162{
2163 xfs_bmbt_block_t *block;
2164 int first;
2165 int last;
2166 xfs_bmbt_rec_t *rp;
2167 xfs_trans_t *tp;
2168
2169 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2170 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2171 ASSERT(bp);
2172 tp = cur->bc_tp;
2173 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2174 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2175 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2176 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2177 xfs_trans_log_buf(tp, bp, first, last);
2178 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2179}
2180
2181int /* error */
2182xfs_bmbt_lookup_eq(
2183 xfs_btree_cur_t *cur,
2184 xfs_fileoff_t off,
2185 xfs_fsblock_t bno,
2186 xfs_filblks_t len,
2187 int *stat) /* success/failure */
2188{
2189 cur->bc_rec.b.br_startoff = off;
2190 cur->bc_rec.b.br_startblock = bno;
2191 cur->bc_rec.b.br_blockcount = len;
2192 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2193}
2194
2195int /* error */
2196xfs_bmbt_lookup_ge(
2197 xfs_btree_cur_t *cur,
2198 xfs_fileoff_t off,
2199 xfs_fsblock_t bno,
2200 xfs_filblks_t len,
2201 int *stat) /* success/failure */
2202{
2203 cur->bc_rec.b.br_startoff = off;
2204 cur->bc_rec.b.br_startblock = bno;
2205 cur->bc_rec.b.br_blockcount = len;
2206 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2207}
2208
2209/*
2210 * Give the bmap btree a new root block. Copy the old broot contents
2211 * down into a real block and make the broot point to it.
2212 */
int					/* error */
xfs_bmbt_newroot(
	xfs_btree_cur_t		*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	xfs_alloc_arg_t		args;		/* allocation arguments */
	xfs_bmbt_block_t	*block;		/* bmap btree block */
	xfs_buf_t		*bp;		/* buffer for block */
	xfs_bmbt_block_t	*cblock;	/* child btree block */
	xfs_bmbt_key_t		*ckp;		/* child key pointer */
	xfs_bmbt_ptr_t		*cpp;		/* child ptr pointer */
	int			error;		/* error return code */
#ifdef DEBUG
	int			i;		/* loop counter */
#endif
	xfs_bmbt_key_t		*kp;		/* pointer to bmap btree key */
	int			level;		/* btree level */
	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */

	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
	level = cur->bc_nlevels - 1;
	block = xfs_bmbt_get_block(cur, level, &bp);
	/*
	 * Copy the root into a real block.
	 */
	args.mp = cur->bc_mp;
	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
	args.tp = cur->bc_tp;
	args.fsbno = cur->bc_private.b.firstblock;
	args.mod = args.minleft = args.alignment = args.total = args.isfl =
		args.userdata = args.minalignslop = 0;
	args.minlen = args.maxlen = args.prod = 1;
	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
	args.firstblock = args.fsbno;
	/* Pick an allocation target near the root's first child block. */
	if (args.fsbno == NULLFSBLOCK) {
#ifdef DEBUG
		if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
#endif
		args.fsbno = be64_to_cpu(*pp);
		args.type = XFS_ALLOCTYPE_START_BNO;
	} else if (cur->bc_private.b.flist->xbf_low)
		args.type = XFS_ALLOCTYPE_START_BNO;
	else
		args.type = XFS_ALLOCTYPE_NEAR_BNO;
	if ((error = xfs_alloc_vextent(&args))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
	if (args.fsbno == NULLFSBLOCK) {
		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
		*stat = 0;
		return 0;
	}
	ASSERT(args.len == 1);
	/* Account the new btree block against the inode and its quota. */
	cur->bc_private.b.firstblock = args.fsbno;
	cur->bc_private.b.allocated++;
	cur->bc_private.b.ip->i_d.di_nblocks++;
	XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
			  XFS_TRANS_DQ_BCOUNT, 1L);
	bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
	cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
	/* The new child starts as a copy of the (old) root header. */
	*cblock = *block;
	/* The root gains a level and now holds exactly one entry. */
	be16_add_cpu(&block->bb_level, 1);
	block->bb_numrecs = cpu_to_be16(1);
	cur->bc_nlevels++;
	cur->bc_ptrs[level + 1] = 1;
	/* Move the old root's keys and ptrs down into the child block. */
	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
	memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
#ifdef DEBUG
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
			return error;
		}
	}
#endif
	memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
#ifdef DEBUG
	if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
		return error;
	}
#endif
	/* The root's single pointer now refers to the new child block. */
	*pp = cpu_to_be64(args.fsbno);
	/* Shrink the inode-fork root to its new one-entry size. */
	xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
		cur->bc_private.b.whichfork);
	xfs_btree_setbuf(cur, level, bp);
	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
	xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
	*logflags |=
		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
	*stat = 1;
	return 0;
}
2319 236
2320/* 237/*
2321 * Set all the fields in a bmap extent record from the arguments. 238 * Set all the fields in a bmap extent record from the arguments.
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
2512 */ 429 */
2513void 430void
2514xfs_bmbt_to_bmdr( 431xfs_bmbt_to_bmdr(
2515 xfs_bmbt_block_t *rblock, 432 struct xfs_mount *mp,
433 struct xfs_btree_block *rblock,
2516 int rblocklen, 434 int rblocklen,
2517 xfs_bmdr_block_t *dblock, 435 xfs_bmdr_block_t *dblock,
2518 int dblocklen) 436 int dblocklen)
@@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr(
2524 __be64 *tpp; 442 __be64 *tpp;
2525 443
2526 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); 444 ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
2527 ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); 445 ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
2528 ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO); 446 ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
2529 ASSERT(be16_to_cpu(rblock->bb_level) > 0); 447 ASSERT(be16_to_cpu(rblock->bb_level) > 0);
2530 dblock->bb_level = rblock->bb_level; 448 dblock->bb_level = rblock->bb_level;
2531 dblock->bb_numrecs = rblock->bb_numrecs; 449 dblock->bb_numrecs = rblock->bb_numrecs;
2532 dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); 450 dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
2533 fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); 451 fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
2534 tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1); 452 tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
2535 fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); 453 fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
2536 tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr); 454 tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
2537 dmxr = be16_to_cpu(dblock->bb_numrecs); 455 dmxr = be16_to_cpu(dblock->bb_numrecs);
2538 memcpy(tkp, fkp, sizeof(*fkp) * dmxr); 456 memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2539 memcpy(tpp, fpp, sizeof(*fpp) * dmxr); 457 memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
2540} 458}
2541 459
2542/* 460/*
2543 * Update the record to the passed values.
2544 */
2545int
2546xfs_bmbt_update(
2547 xfs_btree_cur_t *cur,
2548 xfs_fileoff_t off,
2549 xfs_fsblock_t bno,
2550 xfs_filblks_t len,
2551 xfs_exntst_t state)
2552{
2553 xfs_bmbt_block_t *block;
2554 xfs_buf_t *bp;
2555 int error;
2556 xfs_bmbt_key_t key;
2557 int ptr;
2558 xfs_bmbt_rec_t *rp;
2559
2560 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2561 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2562 (xfs_dfilblks_t)len, (int)state);
2563 block = xfs_bmbt_get_block(cur, 0, &bp);
2564#ifdef DEBUG
2565 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2566 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2567 return error;
2568 }
2569#endif
2570 ptr = cur->bc_ptrs[0];
2571 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2572 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2573 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2574 if (ptr > 1) {
2575 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2576 return 0;
2577 }
2578 key.br_startoff = cpu_to_be64(off);
2579 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2581 return error;
2582 }
2583 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2584 return 0;
2585}
2586
2587/*
2588 * Check extent records, which have just been read, for 461 * Check extent records, which have just been read, for
2589 * any bit in the extent flag field. ASSERT on debug 462 * any bit in the extent flag field. ASSERT on debug
2590 * kernels, as this condition should not occur. 463 * kernels, as this condition should not occur.
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
2608 } 481 }
2609 return 0; 482 return 0;
2610} 483}
484
485
486STATIC struct xfs_btree_cur *
487xfs_bmbt_dup_cursor(
488 struct xfs_btree_cur *cur)
489{
490 struct xfs_btree_cur *new;
491
492 new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
493 cur->bc_private.b.ip, cur->bc_private.b.whichfork);
494
495 /*
496 * Copy the firstblock, flist, and flags values,
497 * since init cursor doesn't get them.
498 */
499 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
500 new->bc_private.b.flist = cur->bc_private.b.flist;
501 new->bc_private.b.flags = cur->bc_private.b.flags;
502
503 return new;
504}
505
506STATIC void
507xfs_bmbt_update_cursor(
508 struct xfs_btree_cur *src,
509 struct xfs_btree_cur *dst)
510{
511 ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
512 (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
513 ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
514
515 dst->bc_private.b.allocated += src->bc_private.b.allocated;
516 dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
517
518 src->bc_private.b.allocated = 0;
519}
520
521STATIC int
522xfs_bmbt_alloc_block(
523 struct xfs_btree_cur *cur,
524 union xfs_btree_ptr *start,
525 union xfs_btree_ptr *new,
526 int length,
527 int *stat)
528{
529 xfs_alloc_arg_t args; /* block allocation args */
530 int error; /* error return value */
531
532 memset(&args, 0, sizeof(args));
533 args.tp = cur->bc_tp;
534 args.mp = cur->bc_mp;
535 args.fsbno = cur->bc_private.b.firstblock;
536 args.firstblock = args.fsbno;
537
538 if (args.fsbno == NULLFSBLOCK) {
539 args.fsbno = be64_to_cpu(start->l);
540 args.type = XFS_ALLOCTYPE_START_BNO;
541 /*
542 * Make sure there is sufficient room left in the AG to
543 * complete a full tree split for an extent insert. If
544 * we are converting the middle part of an extent then
545 * we may need space for two tree splits.
546 *
547 * We are relying on the caller to make the correct block
548 * reservation for this operation to succeed. If the
549 * reservation amount is insufficient then we may fail a
550 * block allocation here and corrupt the filesystem.
551 */
552 args.minleft = xfs_trans_get_block_res(args.tp);
553 } else if (cur->bc_private.b.flist->xbf_low) {
554 args.type = XFS_ALLOCTYPE_START_BNO;
555 } else {
556 args.type = XFS_ALLOCTYPE_NEAR_BNO;
557 }
558
559 args.minlen = args.maxlen = args.prod = 1;
560 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
561 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
562 error = XFS_ERROR(ENOSPC);
563 goto error0;
564 }
565 error = xfs_alloc_vextent(&args);
566 if (error)
567 goto error0;
568
569 if (args.fsbno == NULLFSBLOCK && args.minleft) {
570 /*
571 * Could not find an AG with enough free space to satisfy
572 * a full btree split. Try again without minleft and if
573 * successful activate the lowspace algorithm.
574 */
575 args.fsbno = 0;
576 args.type = XFS_ALLOCTYPE_FIRST_AG;
577 args.minleft = 0;
578 error = xfs_alloc_vextent(&args);
579 if (error)
580 goto error0;
581 cur->bc_private.b.flist->xbf_low = 1;
582 }
583 if (args.fsbno == NULLFSBLOCK) {
584 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
585 *stat = 0;
586 return 0;
587 }
588 ASSERT(args.len == 1);
589 cur->bc_private.b.firstblock = args.fsbno;
590 cur->bc_private.b.allocated++;
591 cur->bc_private.b.ip->i_d.di_nblocks++;
592 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
593 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
594 XFS_TRANS_DQ_BCOUNT, 1L);
595
596 new->l = cpu_to_be64(args.fsbno);
597
598 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
599 *stat = 1;
600 return 0;
601
602 error0:
603 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
604 return error;
605}
606
607STATIC int
608xfs_bmbt_free_block(
609 struct xfs_btree_cur *cur,
610 struct xfs_buf *bp)
611{
612 struct xfs_mount *mp = cur->bc_mp;
613 struct xfs_inode *ip = cur->bc_private.b.ip;
614 struct xfs_trans *tp = cur->bc_tp;
615 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
616
617 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
618 ip->i_d.di_nblocks--;
619
620 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
621 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
622 xfs_trans_binval(tp, bp);
623 return 0;
624}
625
626STATIC int
627xfs_bmbt_get_minrecs(
628 struct xfs_btree_cur *cur,
629 int level)
630{
631 if (level == cur->bc_nlevels - 1) {
632 struct xfs_ifork *ifp;
633
634 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
635 cur->bc_private.b.whichfork);
636
637 return xfs_bmbt_maxrecs(cur->bc_mp,
638 ifp->if_broot_bytes, level == 0) / 2;
639 }
640
641 return cur->bc_mp->m_bmap_dmnr[level != 0];
642}
643
644int
645xfs_bmbt_get_maxrecs(
646 struct xfs_btree_cur *cur,
647 int level)
648{
649 if (level == cur->bc_nlevels - 1) {
650 struct xfs_ifork *ifp;
651
652 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
653 cur->bc_private.b.whichfork);
654
655 return xfs_bmbt_maxrecs(cur->bc_mp,
656 ifp->if_broot_bytes, level == 0);
657 }
658
659 return cur->bc_mp->m_bmap_dmxr[level != 0];
660
661}
662
663/*
664 * Get the maximum records we could store in the on-disk format.
665 *
666 * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
667 * for the root node this checks the available space in the dinode fork
668 * so that we can resize the in-memory buffer to match it. After a
669 * resize to the maximum size this function returns the same value
670 * as xfs_bmbt_get_maxrecs for the root node, too.
671 */
672STATIC int
673xfs_bmbt_get_dmaxrecs(
674 struct xfs_btree_cur *cur,
675 int level)
676{
677 if (level != cur->bc_nlevels - 1)
678 return cur->bc_mp->m_bmap_dmxr[level != 0];
679 return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
680 level == 0);
681}
682
683STATIC void
684xfs_bmbt_init_key_from_rec(
685 union xfs_btree_key *key,
686 union xfs_btree_rec *rec)
687{
688 key->bmbt.br_startoff =
689 cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
690}
691
692STATIC void
693xfs_bmbt_init_rec_from_key(
694 union xfs_btree_key *key,
695 union xfs_btree_rec *rec)
696{
697 ASSERT(key->bmbt.br_startoff != 0);
698
699 xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
700 0, 0, XFS_EXT_NORM);
701}
702
703STATIC void
704xfs_bmbt_init_rec_from_cur(
705 struct xfs_btree_cur *cur,
706 union xfs_btree_rec *rec)
707{
708 xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
709}
710
711STATIC void
712xfs_bmbt_init_ptr_from_cur(
713 struct xfs_btree_cur *cur,
714 union xfs_btree_ptr *ptr)
715{
716 ptr->l = 0;
717}
718
719STATIC __int64_t
720xfs_bmbt_key_diff(
721 struct xfs_btree_cur *cur,
722 union xfs_btree_key *key)
723{
724 return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
725 cur->bc_rec.b.br_startoff;
726}
727
#ifdef DEBUG
/* Debug check: keys at the same level must be strictly increasing. */
STATIC int
xfs_bmbt_keys_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	xfs_fileoff_t		off1 = be64_to_cpu(k1->bmbt.br_startoff);
	xfs_fileoff_t		off2 = be64_to_cpu(k2->bmbt.br_startoff);

	return off1 < off2;
}

/* Debug check: consecutive leaf records must not overlap. */
STATIC int
xfs_bmbt_recs_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*r1,
	union xfs_btree_rec	*r2)
{
	xfs_fileoff_t		end1;

	end1 = xfs_bmbt_disk_get_startoff(&r1->bmbt) +
	       xfs_bmbt_disk_get_blockcount(&r1->bmbt);
	return end1 <= xfs_bmbt_disk_get_startoff(&r2->bmbt);
}
#endif	/* DEBUG */
750
#ifdef XFS_BTREE_TRACE
ktrace_t	*xfs_bmbt_trace_buf;

/*
 * Log one trace event into both the global bmbt trace buffer and the
 * owning inode's private trace buffer.
 */
STATIC void
xfs_bmbt_trace_enter(
	struct xfs_btree_cur	*cur,
	const char		*func,
	char			*s,
	int			type,
	int			line,
	__psunsigned_t		a0,
	__psunsigned_t		a1,
	__psunsigned_t		a2,
	__psunsigned_t		a3,
	__psunsigned_t		a4,
	__psunsigned_t		a5,
	__psunsigned_t		a6,
	__psunsigned_t		a7,
	__psunsigned_t		a8,
	__psunsigned_t		a9,
	__psunsigned_t		a10)
{
	struct xfs_inode	*ip = cur->bc_private.b.ip;
	int			whichfork = cur->bc_private.b.whichfork;
	void			*tag;

	/* First trace word packs the event type, fork and source line. */
	tag = (void *)((__psint_t)type | (whichfork << 8) | (line << 16));

	ktrace_enter(xfs_bmbt_trace_buf, tag,
		(void *)func, (void *)s, (void *)ip, (void *)cur,
		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
		(void *)a8, (void *)a9, (void *)a10);
	ktrace_enter(ip->i_btrace, tag,
		(void *)func, (void *)s, (void *)ip, (void *)cur,
		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
		(void *)a8, (void *)a9, (void *)a10);
}

/* Snapshot cursor state: levels, flags, allocated count, current record. */
STATIC void
xfs_bmbt_trace_cursor(
	struct xfs_btree_cur	*cur,
	__uint32_t		*s0,
	__uint64_t		*l0,
	__uint64_t		*l1)
{
	struct xfs_bmbt_rec_host rec;

	xfs_bmbt_set_all(&rec, &cur->bc_rec.b);

	*s0 = (cur->bc_nlevels << 24) |
	      (cur->bc_private.b.flags << 16) |
	      cur->bc_private.b.allocated;
	*l0 = rec.l0;
	*l1 = rec.l1;
}

/* Decode a btree key (startoff only) for tracing. */
STATIC void
xfs_bmbt_trace_key(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key,
	__uint64_t		*l0,
	__uint64_t		*l1)
{
	*l0 = be64_to_cpu(key->bmbt.br_startoff);
	*l1 = 0;
}

/* Decode a btree record (startoff/startblock/blockcount) for tracing. */
STATIC void
xfs_bmbt_trace_record(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec,
	__uint64_t		*l0,
	__uint64_t		*l1,
	__uint64_t		*l2)
{
	struct xfs_bmbt_irec	irec;

	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
	*l0 = irec.br_startoff;
	*l1 = irec.br_startblock;
	*l2 = irec.br_blockcount;
}
#endif	/* XFS_BTREE_TRACE */
835
836static const struct xfs_btree_ops xfs_bmbt_ops = {
837 .rec_len = sizeof(xfs_bmbt_rec_t),
838 .key_len = sizeof(xfs_bmbt_key_t),
839
840 .dup_cursor = xfs_bmbt_dup_cursor,
841 .update_cursor = xfs_bmbt_update_cursor,
842 .alloc_block = xfs_bmbt_alloc_block,
843 .free_block = xfs_bmbt_free_block,
844 .get_maxrecs = xfs_bmbt_get_maxrecs,
845 .get_minrecs = xfs_bmbt_get_minrecs,
846 .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
847 .init_key_from_rec = xfs_bmbt_init_key_from_rec,
848 .init_rec_from_key = xfs_bmbt_init_rec_from_key,
849 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
850 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
851 .key_diff = xfs_bmbt_key_diff,
852
853#ifdef DEBUG
854 .keys_inorder = xfs_bmbt_keys_inorder,
855 .recs_inorder = xfs_bmbt_recs_inorder,
856#endif
857
858#ifdef XFS_BTREE_TRACE
859 .trace_enter = xfs_bmbt_trace_enter,
860 .trace_cursor = xfs_bmbt_trace_cursor,
861 .trace_key = xfs_bmbt_trace_key,
862 .trace_record = xfs_bmbt_trace_record,
863#endif
864};
865
866/*
867 * Allocate a new bmap btree cursor.
868 */
869struct xfs_btree_cur * /* new bmap btree cursor */
870xfs_bmbt_init_cursor(
871 struct xfs_mount *mp, /* file system mount point */
872 struct xfs_trans *tp, /* transaction pointer */
873 struct xfs_inode *ip, /* inode owning the btree */
874 int whichfork) /* data or attr fork */
875{
876 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
877 struct xfs_btree_cur *cur;
878
879 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
880
881 cur->bc_tp = tp;
882 cur->bc_mp = mp;
883 cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
884 cur->bc_btnum = XFS_BTNUM_BMAP;
885 cur->bc_blocklog = mp->m_sb.sb_blocklog;
886
887 cur->bc_ops = &xfs_bmbt_ops;
888 cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
889
890 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
891 cur->bc_private.b.ip = ip;
892 cur->bc_private.b.firstblock = NULLFSBLOCK;
893 cur->bc_private.b.flist = NULL;
894 cur->bc_private.b.allocated = 0;
895 cur->bc_private.b.flags = 0;
896 cur->bc_private.b.whichfork = whichfork;
897
898 return cur;
899}
900
901/*
902 * Calculate number of records in a bmap btree block.
903 */
904int
905xfs_bmbt_maxrecs(
906 struct xfs_mount *mp,
907 int blocklen,
908 int leaf)
909{
910 blocklen -= XFS_BMBT_BLOCK_LEN(mp);
911
912 if (leaf)
913 return blocklen / sizeof(xfs_bmbt_rec_t);
914 return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
915}
916
917/*
918 * Calculate number of records in a bmap btree inode root.
919 */
920int
921xfs_bmdr_maxrecs(
922 struct xfs_mount *mp,
923 int blocklen,
924 int leaf)
925{
926 blocklen -= sizeof(xfs_bmdr_block_t);
927
928 if (leaf)
929 return blocklen / sizeof(xfs_bmdr_rec_t);
930 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
931}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cd0d4b4bb81..a4555abb662 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -21,9 +21,10 @@
21#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ 21#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
22 22
23struct xfs_btree_cur; 23struct xfs_btree_cur;
24struct xfs_btree_lblock; 24struct xfs_btree_block;
25struct xfs_mount; 25struct xfs_mount;
26struct xfs_inode; 26struct xfs_inode;
27struct xfs_trans;
27 28
28/* 29/*
29 * Bmap root header, on-disk form only. 30 * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
145/* btree pointer type */ 146/* btree pointer type */
146typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; 147typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
147 148
148/* btree block header type */ 149/*
149typedef struct xfs_btree_lblock xfs_bmbt_block_t; 150 * Btree block header size depends on a superblock flag.
150 151 *
151#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) 152 * (not quite yet, but soon)
152 153 */
153#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) 154#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
154#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ 155
155 ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ 156#define XFS_BMBT_REC_ADDR(mp, block, index) \
156 (cur)->bc_private.b.whichfork)->if_broot_bytes) 157 ((xfs_bmbt_rec_t *) \
157 158 ((char *)(block) + \
158#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ 159 XFS_BMBT_BLOCK_LEN(mp) + \
159 (((lev) == (cur)->bc_nlevels - 1 ? \ 160 ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
160 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ 161
161 xfs_bmdr, (lev) == 0) : \ 162#define XFS_BMBT_KEY_ADDR(mp, block, index) \
162 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) 163 ((xfs_bmbt_key_t *) \
163#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \ 164 ((char *)(block) + \
164 (((lev) == (cur)->bc_nlevels - 1 ? \ 165 XFS_BMBT_BLOCK_LEN(mp) + \
165 XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ 166 ((index) - 1) * sizeof(xfs_bmbt_key_t)))
166 xfs_bmbt, (lev) == 0) : \ 167
167 ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) 168#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
168 169 ((xfs_bmbt_ptr_t *) \
169#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \ 170 ((char *)(block) + \
170 (((lev) == (cur)->bc_nlevels - 1 ? \ 171 XFS_BMBT_BLOCK_LEN(mp) + \
171 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\ 172 (maxrecs) * sizeof(xfs_bmbt_key_t) + \
172 xfs_bmdr, (lev) == 0) : \ 173 ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
173 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) 174
174#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \ 175#define XFS_BMDR_REC_ADDR(block, index) \
175 (((lev) == (cur)->bc_nlevels - 1 ? \ 176 ((xfs_bmdr_rec_t *) \
176 XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ 177 ((char *)(block) + \
177 xfs_bmbt, (lev) == 0) : \ 178 sizeof(struct xfs_bmdr_block) + \
178 ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) 179 ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
179 180
180#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) 181#define XFS_BMDR_KEY_ADDR(block, index) \
181 182 ((xfs_bmdr_key_t *) \
182#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i)) 183 ((char *)(block) + \
183 184 sizeof(struct xfs_bmdr_block) + \
184#define XFS_BMAP_KEY_DADDR(bb,i,cur) \ 185 ((index) - 1) * sizeof(xfs_bmdr_key_t)))
185 (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) 186
186 187#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
187#define XFS_BMAP_KEY_IADDR(bb,i,cur) \ 188 ((xfs_bmdr_ptr_t *) \
188 (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i)) 189 ((char *)(block) + \
189 190 sizeof(struct xfs_bmdr_block) + \
190#define XFS_BMAP_PTR_DADDR(bb,i,cur) \ 191 (maxrecs) * sizeof(xfs_bmdr_key_t) + \
191 (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ 192 ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
192 be16_to_cpu((bb)->bb_level), cur)))
193#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
194 (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
195 be16_to_cpu((bb)->bb_level), cur)))
196 193
197/* 194/*
198 * These are to be used when we know the size of the block and 195 * These are to be used when we know the size of the block and
199 * we don't have a cursor. 196 * we don't have a cursor.
200 */ 197 */
201#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ 198#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
202 (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i)) 199 XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
203#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
204 (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
205#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
206 (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
207
208#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
209#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
210 200
211#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ 201#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
212 (int)(sizeof(xfs_bmbt_block_t) + \ 202 (int)(XFS_BTREE_LBLOCK_LEN + \
213 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) 203 ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
214 204
215#define XFS_BMAP_BROOT_SPACE(bb) \ 205#define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
223 */ 213 */
224#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) 214#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
225 215
226#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
227 (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
228 be16_to_cpu((bb)->bb_level) == level && \
229 be16_to_cpu((bb)->bb_numrecs) > 0 && \
230 be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
231
232
233#ifdef __KERNEL__
234
235#if defined(XFS_BMBT_TRACE)
236/*
237 * Trace buffer entry types.
238 */
239#define XFS_BMBT_KTRACE_ARGBI 1
240#define XFS_BMBT_KTRACE_ARGBII 2
241#define XFS_BMBT_KTRACE_ARGFFFI 3
242#define XFS_BMBT_KTRACE_ARGI 4
243#define XFS_BMBT_KTRACE_ARGIFK 5
244#define XFS_BMBT_KTRACE_ARGIFR 6
245#define XFS_BMBT_KTRACE_ARGIK 7
246#define XFS_BMBT_KTRACE_CUR 8
247
248#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
249#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
250extern ktrace_t *xfs_bmbt_trace_buf;
251#endif
252
253/* 216/*
254 * Prototypes for xfs_bmap.c to call. 217 * Prototypes for xfs_bmap.c to call.
255 */ 218 */
256extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int); 219extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
257extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *); 220 struct xfs_btree_block *, int);
258extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
259extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); 221extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
260extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
261 int, struct xfs_buf **bpp);
262extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); 222extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
263extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); 223extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
264extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 224extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
268extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 228extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
269extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 229extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
270 230
271extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
272extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
273extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
274extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
275 int);
276extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
277 xfs_fsblock_t, xfs_filblks_t, int *);
278extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
279 xfs_fsblock_t, xfs_filblks_t, int *);
280
281/*
282 * Give the bmap btree a new root block. Copy the old broot contents
283 * down into a real block and make the broot point to it.
284 */
285extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
286
287extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); 231extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
288extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, 232extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
289 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 233 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
296extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, 240extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
297 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); 241 xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
298 242
299extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); 243extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
300extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, 244 xfs_bmdr_block_t *, int);
301 xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); 245
246extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
247extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
248extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
249
250extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
251 struct xfs_trans *, struct xfs_inode *, int);
302 252
303#endif /* __KERNEL__ */
304 253
305#endif /* __XFS_BMAP_BTREE_H__ */ 254#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cc593a84c34..7ed59267420 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -34,7 +34,9 @@
34#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
39#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
39#include "xfs_error.h" 41#include "xfs_error.h"
40 42
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
50 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC 52 XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
51}; 53};
52 54
53/*
54 * Checking routine: return maxrecs for the block.
55 */
56STATIC int /* number of records fitting in block */
57xfs_btree_maxrecs(
58 xfs_btree_cur_t *cur, /* btree cursor */
59 xfs_btree_block_t *block) /* generic btree block pointer */
60{
61 switch (cur->bc_btnum) {
62 case XFS_BTNUM_BNO:
63 case XFS_BTNUM_CNT:
64 return (int)XFS_ALLOC_BLOCK_MAXRECS(
65 be16_to_cpu(block->bb_h.bb_level), cur);
66 case XFS_BTNUM_BMAP:
67 return (int)XFS_BMAP_BLOCK_IMAXRECS(
68 be16_to_cpu(block->bb_h.bb_level), cur);
69 case XFS_BTNUM_INO:
70 return (int)XFS_INOBT_BLOCK_MAXRECS(
71 be16_to_cpu(block->bb_h.bb_level), cur);
72 default:
73 ASSERT(0);
74 return 0;
75 }
76}
77
78/*
79 * External routines.
80 */
81
82#ifdef DEBUG
83/*
84 * Debug routine: check that block header is ok.
85 */
86void
87xfs_btree_check_block(
88 xfs_btree_cur_t *cur, /* btree cursor */
89 xfs_btree_block_t *block, /* generic btree block pointer */
90 int level, /* level of the btree block */
91 xfs_buf_t *bp) /* buffer containing block, if any */
92{
93 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
94 xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
95 bp);
96 else
97 xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
98 bp);
99}
100
101/*
102 * Debug routine: check that keys are in the right order.
103 */
104void
105xfs_btree_check_key(
106 xfs_btnum_t btnum, /* btree identifier */
107 void *ak1, /* pointer to left (lower) key */
108 void *ak2) /* pointer to right (higher) key */
109{
110 switch (btnum) {
111 case XFS_BTNUM_BNO: {
112 xfs_alloc_key_t *k1;
113 xfs_alloc_key_t *k2;
114
115 k1 = ak1;
116 k2 = ak2;
117 ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
118 break;
119 }
120 case XFS_BTNUM_CNT: {
121 xfs_alloc_key_t *k1;
122 xfs_alloc_key_t *k2;
123
124 k1 = ak1;
125 k2 = ak2;
126 ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
127 (k1->ar_blockcount == k2->ar_blockcount &&
128 be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
129 break;
130 }
131 case XFS_BTNUM_BMAP: {
132 xfs_bmbt_key_t *k1;
133 xfs_bmbt_key_t *k2;
134
135 k1 = ak1;
136 k2 = ak2;
137 ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
138 break;
139 }
140 case XFS_BTNUM_INO: {
141 xfs_inobt_key_t *k1;
142 xfs_inobt_key_t *k2;
143
144 k1 = ak1;
145 k2 = ak2;
146 ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
147 break;
148 }
149 default:
150 ASSERT(0);
151 }
152}
153#endif /* DEBUG */
154 55
155/* 56STATIC int /* error (0 or EFSCORRUPTED) */
156 * Checking routine: check that long form block header is ok.
157 */
158/* ARGSUSED */
159int /* error (0 or EFSCORRUPTED) */
160xfs_btree_check_lblock( 57xfs_btree_check_lblock(
161 xfs_btree_cur_t *cur, /* btree cursor */ 58 struct xfs_btree_cur *cur, /* btree cursor */
162 xfs_btree_lblock_t *block, /* btree long form block pointer */ 59 struct xfs_btree_block *block, /* btree long form block pointer */
163 int level, /* level of the btree block */ 60 int level, /* level of the btree block */
164 xfs_buf_t *bp) /* buffer for block, if any */ 61 struct xfs_buf *bp) /* buffer for block, if any */
165{ 62{
166 int lblock_ok; /* block passes checks */ 63 int lblock_ok; /* block passes checks */
167 xfs_mount_t *mp; /* file system mount point */ 64 struct xfs_mount *mp; /* file system mount point */
168 65
169 mp = cur->bc_mp; 66 mp = cur->bc_mp;
170 lblock_ok = 67 lblock_ok =
171 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && 68 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
172 be16_to_cpu(block->bb_level) == level && 69 be16_to_cpu(block->bb_level) == level &&
173 be16_to_cpu(block->bb_numrecs) <= 70 be16_to_cpu(block->bb_numrecs) <=
174 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && 71 cur->bc_ops->get_maxrecs(cur, level) &&
175 block->bb_leftsib && 72 block->bb_u.l.bb_leftsib &&
176 (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO || 73 (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
177 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) && 74 XFS_FSB_SANITY_CHECK(mp,
178 block->bb_rightsib && 75 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
179 (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO || 76 block->bb_u.l.bb_rightsib &&
180 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib))); 77 (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
181 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, 78 XFS_FSB_SANITY_CHECK(mp,
79 be64_to_cpu(block->bb_u.l.bb_rightsib)));
80 if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
81 XFS_ERRTAG_BTREE_CHECK_LBLOCK,
182 XFS_RANDOM_BTREE_CHECK_LBLOCK))) { 82 XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
183 if (bp) 83 if (bp)
184 xfs_buftrace("LBTREE ERROR", bp); 84 xfs_buftrace("LBTREE ERROR", bp);
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
189 return 0; 89 return 0;
190} 90}
191 91
192/* 92STATIC int /* error (0 or EFSCORRUPTED) */
193 * Checking routine: check that (long) pointer is ok.
194 */
195int /* error (0 or EFSCORRUPTED) */
196xfs_btree_check_lptr(
197 xfs_btree_cur_t *cur, /* btree cursor */
198 xfs_dfsbno_t ptr, /* btree block disk address */
199 int level) /* btree block level */
200{
201 xfs_mount_t *mp; /* file system mount point */
202
203 mp = cur->bc_mp;
204 XFS_WANT_CORRUPTED_RETURN(
205 level > 0 &&
206 ptr != NULLDFSBNO &&
207 XFS_FSB_SANITY_CHECK(mp, ptr));
208 return 0;
209}
210
211#ifdef DEBUG
212/*
213 * Debug routine: check that records are in the right order.
214 */
215void
216xfs_btree_check_rec(
217 xfs_btnum_t btnum, /* btree identifier */
218 void *ar1, /* pointer to left (lower) record */
219 void *ar2) /* pointer to right (higher) record */
220{
221 switch (btnum) {
222 case XFS_BTNUM_BNO: {
223 xfs_alloc_rec_t *r1;
224 xfs_alloc_rec_t *r2;
225
226 r1 = ar1;
227 r2 = ar2;
228 ASSERT(be32_to_cpu(r1->ar_startblock) +
229 be32_to_cpu(r1->ar_blockcount) <=
230 be32_to_cpu(r2->ar_startblock));
231 break;
232 }
233 case XFS_BTNUM_CNT: {
234 xfs_alloc_rec_t *r1;
235 xfs_alloc_rec_t *r2;
236
237 r1 = ar1;
238 r2 = ar2;
239 ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
240 (r1->ar_blockcount == r2->ar_blockcount &&
241 be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
242 break;
243 }
244 case XFS_BTNUM_BMAP: {
245 xfs_bmbt_rec_t *r1;
246 xfs_bmbt_rec_t *r2;
247
248 r1 = ar1;
249 r2 = ar2;
250 ASSERT(xfs_bmbt_disk_get_startoff(r1) +
251 xfs_bmbt_disk_get_blockcount(r1) <=
252 xfs_bmbt_disk_get_startoff(r2));
253 break;
254 }
255 case XFS_BTNUM_INO: {
256 xfs_inobt_rec_t *r1;
257 xfs_inobt_rec_t *r2;
258
259 r1 = ar1;
260 r2 = ar2;
261 ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
262 be32_to_cpu(r2->ir_startino));
263 break;
264 }
265 default:
266 ASSERT(0);
267 }
268}
269#endif /* DEBUG */
270
271/*
272 * Checking routine: check that block header is ok.
273 */
274/* ARGSUSED */
275int /* error (0 or EFSCORRUPTED) */
276xfs_btree_check_sblock( 93xfs_btree_check_sblock(
277 xfs_btree_cur_t *cur, /* btree cursor */ 94 struct xfs_btree_cur *cur, /* btree cursor */
278 xfs_btree_sblock_t *block, /* btree short form block pointer */ 95 struct xfs_btree_block *block, /* btree short form block pointer */
279 int level, /* level of the btree block */ 96 int level, /* level of the btree block */
280 xfs_buf_t *bp) /* buffer containing block */ 97 struct xfs_buf *bp) /* buffer containing block */
281{ 98{
282 xfs_buf_t *agbp; /* buffer for ag. freespace struct */ 99 struct xfs_buf *agbp; /* buffer for ag. freespace struct */
283 xfs_agf_t *agf; /* ag. freespace structure */ 100 struct xfs_agf *agf; /* ag. freespace structure */
284 xfs_agblock_t agflen; /* native ag. freespace length */ 101 xfs_agblock_t agflen; /* native ag. freespace length */
285 int sblock_ok; /* block passes checks */ 102 int sblock_ok; /* block passes checks */
286 103
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
291 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && 108 be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
292 be16_to_cpu(block->bb_level) == level && 109 be16_to_cpu(block->bb_level) == level &&
293 be16_to_cpu(block->bb_numrecs) <= 110 be16_to_cpu(block->bb_numrecs) <=
294 xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && 111 cur->bc_ops->get_maxrecs(cur, level) &&
295 (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK || 112 (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
296 be32_to_cpu(block->bb_leftsib) < agflen) && 113 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
297 block->bb_leftsib && 114 block->bb_u.s.bb_leftsib &&
298 (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK || 115 (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
299 be32_to_cpu(block->bb_rightsib) < agflen) && 116 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
300 block->bb_rightsib; 117 block->bb_u.s.bb_rightsib;
301 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, 118 if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
302 XFS_ERRTAG_BTREE_CHECK_SBLOCK, 119 XFS_ERRTAG_BTREE_CHECK_SBLOCK,
303 XFS_RANDOM_BTREE_CHECK_SBLOCK))) { 120 XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,27 +128,78 @@ xfs_btree_check_sblock(
311} 128}
312 129
313/* 130/*
314 * Checking routine: check that (short) pointer is ok. 131 * Debug routine: check that block header is ok.
132 */
133int
134xfs_btree_check_block(
135 struct xfs_btree_cur *cur, /* btree cursor */
136 struct xfs_btree_block *block, /* generic btree block pointer */
137 int level, /* level of the btree block */
138 struct xfs_buf *bp) /* buffer containing block, if any */
139{
140 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
141 return xfs_btree_check_lblock(cur, block, level, bp);
142 else
143 return xfs_btree_check_sblock(cur, block, level, bp);
144}
145
146/*
147 * Check that (long) pointer is ok.
315 */ 148 */
316int /* error (0 or EFSCORRUPTED) */ 149int /* error (0 or EFSCORRUPTED) */
150xfs_btree_check_lptr(
151 struct xfs_btree_cur *cur, /* btree cursor */
152 xfs_dfsbno_t bno, /* btree block disk address */
153 int level) /* btree block level */
154{
155 XFS_WANT_CORRUPTED_RETURN(
156 level > 0 &&
157 bno != NULLDFSBNO &&
158 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
159 return 0;
160}
161
162#ifdef DEBUG
163/*
164 * Check that (short) pointer is ok.
165 */
166STATIC int /* error (0 or EFSCORRUPTED) */
317xfs_btree_check_sptr( 167xfs_btree_check_sptr(
318 xfs_btree_cur_t *cur, /* btree cursor */ 168 struct xfs_btree_cur *cur, /* btree cursor */
319 xfs_agblock_t ptr, /* btree block disk address */ 169 xfs_agblock_t bno, /* btree block disk address */
320 int level) /* btree block level */ 170 int level) /* btree block level */
321{ 171{
322 xfs_buf_t *agbp; /* buffer for ag. freespace struct */ 172 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
323 xfs_agf_t *agf; /* ag. freespace structure */
324 173
325 agbp = cur->bc_private.a.agbp;
326 agf = XFS_BUF_TO_AGF(agbp);
327 XFS_WANT_CORRUPTED_RETURN( 174 XFS_WANT_CORRUPTED_RETURN(
328 level > 0 && 175 level > 0 &&
329 ptr != NULLAGBLOCK && ptr != 0 && 176 bno != NULLAGBLOCK &&
330 ptr < be32_to_cpu(agf->agf_length)); 177 bno != 0 &&
178 bno < agblocks);
331 return 0; 179 return 0;
332} 180}
333 181
334/* 182/*
183 * Check that block ptr is ok.
184 */
185STATIC int /* error (0 or EFSCORRUPTED) */
186xfs_btree_check_ptr(
187 struct xfs_btree_cur *cur, /* btree cursor */
188 union xfs_btree_ptr *ptr, /* btree block disk address */
189 int index, /* offset from ptr to check */
190 int level) /* btree block level */
191{
192 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
193 return xfs_btree_check_lptr(cur,
194 be64_to_cpu((&ptr->l)[index]), level);
195 } else {
196 return xfs_btree_check_sptr(cur,
197 be32_to_cpu((&ptr->s)[index]), level);
198 }
199}
200#endif
201
202/*
335 * Delete the btree cursor. 203 * Delete the btree cursor.
336 */ 204 */
337void 205void
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
387 255
388 tp = cur->bc_tp; 256 tp = cur->bc_tp;
389 mp = cur->bc_mp; 257 mp = cur->bc_mp;
258
390 /* 259 /*
391 * Allocate a new cursor like the old one. 260 * Allocate a new cursor like the old one.
392 */ 261 */
393 new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp, 262 new = cur->bc_ops->dup_cursor(cur);
394 cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip, 263
395 cur->bc_private.b.whichfork);
396 /* 264 /*
397 * Copy the record currently in the cursor. 265 * Copy the record currently in the cursor.
398 */ 266 */
399 new->bc_rec = cur->bc_rec; 267 new->bc_rec = cur->bc_rec;
268
400 /* 269 /*
401 * For each level current, re-get the buffer and copy the ptr value. 270 * For each level current, re-get the buffer and copy the ptr value.
402 */ 271 */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
416 } else 285 } else
417 new->bc_bufs[i] = NULL; 286 new->bc_bufs[i] = NULL;
418 } 287 }
419 /*
420 * For bmap btrees, copy the firstblock, flist, and flags values,
421 * since init cursor doesn't get them.
422 */
423 if (new->bc_btnum == XFS_BTNUM_BMAP) {
424 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
425 new->bc_private.b.flist = cur->bc_private.b.flist;
426 new->bc_private.b.flags = cur->bc_private.b.flags;
427 }
428 *ncur = new; 288 *ncur = new;
429 return 0; 289 return 0;
430} 290}
431 291
432/* 292/*
293 * XFS btree block layout and addressing:
294 *
295 * There are two types of blocks in the btree: leaf and non-leaf blocks.
296 *
297 * The leaf record start with a header then followed by records containing
298 * the values. A non-leaf block also starts with the same header, and
299 * then first contains lookup keys followed by an equal number of pointers
300 * to the btree blocks at the previous level.
301 *
302 * +--------+-------+-------+-------+-------+-------+-------+
303 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
304 * +--------+-------+-------+-------+-------+-------+-------+
305 *
306 * +--------+-------+-------+-------+-------+-------+-------+
307 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
308 * +--------+-------+-------+-------+-------+-------+-------+
309 *
310 * The header is called struct xfs_btree_block for reasons better left unknown
311 * and comes in different versions for short (32bit) and long (64bit) block
312 * pointers. The record and key structures are defined by the btree instances
313 * and opaque to the btree core. The block pointers are simple disk endian
314 * integers, available in a short (32bit) and long (64bit) variant.
315 *
316 * The helpers below calculate the offset of a given record, key or pointer
317 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
318 * record, key or pointer (xfs_btree_*_addr). Note that all addressing
319 * inside the btree block is done using indices starting at one, not zero!
320 */
321
322/*
323 * Return size of the btree block header for this btree instance.
324 */
325static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
326{
327 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
328 XFS_BTREE_LBLOCK_LEN :
329 XFS_BTREE_SBLOCK_LEN;
330}
331
332/*
333 * Return size of btree block pointers for this btree instance.
334 */
335static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
336{
337 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
338 sizeof(__be64) : sizeof(__be32);
339}
340
341/*
342 * Calculate offset of the n-th record in a btree block.
343 */
344STATIC size_t
345xfs_btree_rec_offset(
346 struct xfs_btree_cur *cur,
347 int n)
348{
349 return xfs_btree_block_len(cur) +
350 (n - 1) * cur->bc_ops->rec_len;
351}
352
353/*
354 * Calculate offset of the n-th key in a btree block.
355 */
356STATIC size_t
357xfs_btree_key_offset(
358 struct xfs_btree_cur *cur,
359 int n)
360{
361 return xfs_btree_block_len(cur) +
362 (n - 1) * cur->bc_ops->key_len;
363}
364
365/*
366 * Calculate offset of the n-th block pointer in a btree block.
367 */
368STATIC size_t
369xfs_btree_ptr_offset(
370 struct xfs_btree_cur *cur,
371 int n,
372 int level)
373{
374 return xfs_btree_block_len(cur) +
375 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
376 (n - 1) * xfs_btree_ptr_len(cur);
377}
378
379/*
380 * Return a pointer to the n-th record in the btree block.
381 */
382STATIC union xfs_btree_rec *
383xfs_btree_rec_addr(
384 struct xfs_btree_cur *cur,
385 int n,
386 struct xfs_btree_block *block)
387{
388 return (union xfs_btree_rec *)
389 ((char *)block + xfs_btree_rec_offset(cur, n));
390}
391
392/*
393 * Return a pointer to the n-th key in the btree block.
394 */
395STATIC union xfs_btree_key *
396xfs_btree_key_addr(
397 struct xfs_btree_cur *cur,
398 int n,
399 struct xfs_btree_block *block)
400{
401 return (union xfs_btree_key *)
402 ((char *)block + xfs_btree_key_offset(cur, n));
403}
404
405/*
406 * Return a pointer to the n-th block pointer in the btree block.
407 */
408STATIC union xfs_btree_ptr *
409xfs_btree_ptr_addr(
410 struct xfs_btree_cur *cur,
411 int n,
412 struct xfs_btree_block *block)
413{
414 int level = xfs_btree_get_level(block);
415
416 ASSERT(block->bb_level != 0);
417
418 return (union xfs_btree_ptr *)
419 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
420}
421
422/*
423 * Get a the root block which is stored in the inode.
424 *
425 * For now this btree implementation assumes the btree root is always
426 * stored in the if_broot field of an inode fork.
427 */
428STATIC struct xfs_btree_block *
429xfs_btree_get_iroot(
430 struct xfs_btree_cur *cur)
431{
432 struct xfs_ifork *ifp;
433
434 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
435 return (struct xfs_btree_block *)ifp->if_broot;
436}
437
438/*
433 * Retrieve the block pointer from the cursor at the given level. 439 * Retrieve the block pointer from the cursor at the given level.
434 * This may be a bmap btree root or from a buffer. 440 * This may be an inode btree root or from a buffer.
435 */ 441 */
436STATIC xfs_btree_block_t * /* generic btree block pointer */ 442STATIC struct xfs_btree_block * /* generic btree block pointer */
437xfs_btree_get_block( 443xfs_btree_get_block(
438 xfs_btree_cur_t *cur, /* btree cursor */ 444 struct xfs_btree_cur *cur, /* btree cursor */
439 int level, /* level in btree */ 445 int level, /* level in btree */
440 xfs_buf_t **bpp) /* buffer containing the block */ 446 struct xfs_buf **bpp) /* buffer containing the block */
441{ 447{
442 xfs_btree_block_t *block; /* return value */ 448 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
443 xfs_buf_t *bp; /* return buffer */ 449 (level == cur->bc_nlevels - 1)) {
444 xfs_ifork_t *ifp; /* inode fork pointer */ 450 *bpp = NULL;
445 int whichfork; /* data or attr fork */ 451 return xfs_btree_get_iroot(cur);
446
447 if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
448 whichfork = cur->bc_private.b.whichfork;
449 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
450 block = (xfs_btree_block_t *)ifp->if_broot;
451 bp = NULL;
452 } else {
453 bp = cur->bc_bufs[level];
454 block = XFS_BUF_TO_BLOCK(bp);
455 } 452 }
456 ASSERT(block != NULL); 453
457 *bpp = bp; 454 *bpp = cur->bc_bufs[level];
458 return block; 455 return XFS_BUF_TO_BLOCK(*bpp);
459} 456}
460 457
461/* 458/*
@@ -505,97 +502,6 @@ xfs_btree_get_bufs(
505} 502}
506 503
507/* 504/*
508 * Allocate a new btree cursor.
509 * The cursor is either for allocation (A) or bmap (B) or inodes (I).
510 */
511xfs_btree_cur_t * /* new btree cursor */
512xfs_btree_init_cursor(
513 xfs_mount_t *mp, /* file system mount point */
514 xfs_trans_t *tp, /* transaction pointer */
515 xfs_buf_t *agbp, /* (A only) buffer for agf structure */
516 /* (I only) buffer for agi structure */
517 xfs_agnumber_t agno, /* (AI only) allocation group number */
518 xfs_btnum_t btnum, /* btree identifier */
519 xfs_inode_t *ip, /* (B only) inode owning the btree */
520 int whichfork) /* (B only) data or attr fork */
521{
522 xfs_agf_t *agf; /* (A) allocation group freespace */
523 xfs_agi_t *agi; /* (I) allocation group inodespace */
524 xfs_btree_cur_t *cur; /* return value */
525 xfs_ifork_t *ifp; /* (I) inode fork pointer */
526 int nlevels=0; /* number of levels in the btree */
527
528 ASSERT(xfs_btree_cur_zone != NULL);
529 /*
530 * Allocate a new cursor.
531 */
532 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
533 /*
534 * Deduce the number of btree levels from the arguments.
535 */
536 switch (btnum) {
537 case XFS_BTNUM_BNO:
538 case XFS_BTNUM_CNT:
539 agf = XFS_BUF_TO_AGF(agbp);
540 nlevels = be32_to_cpu(agf->agf_levels[btnum]);
541 break;
542 case XFS_BTNUM_BMAP:
543 ifp = XFS_IFORK_PTR(ip, whichfork);
544 nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
545 break;
546 case XFS_BTNUM_INO:
547 agi = XFS_BUF_TO_AGI(agbp);
548 nlevels = be32_to_cpu(agi->agi_level);
549 break;
550 default:
551 ASSERT(0);
552 }
553 /*
554 * Fill in the common fields.
555 */
556 cur->bc_tp = tp;
557 cur->bc_mp = mp;
558 cur->bc_nlevels = nlevels;
559 cur->bc_btnum = btnum;
560 cur->bc_blocklog = mp->m_sb.sb_blocklog;
561 /*
562 * Fill in private fields.
563 */
564 switch (btnum) {
565 case XFS_BTNUM_BNO:
566 case XFS_BTNUM_CNT:
567 /*
568 * Allocation btree fields.
569 */
570 cur->bc_private.a.agbp = agbp;
571 cur->bc_private.a.agno = agno;
572 break;
573 case XFS_BTNUM_INO:
574 /*
575 * Inode allocation btree fields.
576 */
577 cur->bc_private.a.agbp = agbp;
578 cur->bc_private.a.agno = agno;
579 break;
580 case XFS_BTNUM_BMAP:
581 /*
582 * Bmap btree fields.
583 */
584 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
585 cur->bc_private.b.ip = ip;
586 cur->bc_private.b.firstblock = NULLFSBLOCK;
587 cur->bc_private.b.flist = NULL;
588 cur->bc_private.b.allocated = 0;
589 cur->bc_private.b.flags = 0;
590 cur->bc_private.b.whichfork = whichfork;
591 break;
592 default:
593 ASSERT(0);
594 }
595 return cur;
596}
597
598/*
599 * Check for the cursor referring to the last block at the given level. 505 * Check for the cursor referring to the last block at the given level.
600 */ 506 */
601int /* 1=is last block, 0=not last block */ 507int /* 1=is last block, 0=not last block */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
603 xfs_btree_cur_t *cur, /* btree cursor */ 509 xfs_btree_cur_t *cur, /* btree cursor */
604 int level) /* level to check */ 510 int level) /* level to check */
605{ 511{
606 xfs_btree_block_t *block; /* generic btree block pointer */ 512 struct xfs_btree_block *block; /* generic btree block pointer */
607 xfs_buf_t *bp; /* buffer containing block */ 513 xfs_buf_t *bp; /* buffer containing block */
608 514
609 block = xfs_btree_get_block(cur, level, &bp); 515 block = xfs_btree_get_block(cur, level, &bp);
610 xfs_btree_check_block(cur, block, level, bp); 516 xfs_btree_check_block(cur, block, level, bp);
611 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) 517 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
612 return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; 518 return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
613 else 519 else
614 return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; 520 return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
618 * Change the cursor to point to the first record at the given level. 524 * Change the cursor to point to the first record at the given level.
619 * Other levels are unaffected. 525 * Other levels are unaffected.
620 */ 526 */
621int /* success=1, failure=0 */ 527STATIC int /* success=1, failure=0 */
622xfs_btree_firstrec( 528xfs_btree_firstrec(
623 xfs_btree_cur_t *cur, /* btree cursor */ 529 xfs_btree_cur_t *cur, /* btree cursor */
624 int level) /* level to change */ 530 int level) /* level to change */
625{ 531{
626 xfs_btree_block_t *block; /* generic btree block pointer */ 532 struct xfs_btree_block *block; /* generic btree block pointer */
627 xfs_buf_t *bp; /* buffer containing block */ 533 xfs_buf_t *bp; /* buffer containing block */
628 534
629 /* 535 /*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
634 /* 540 /*
635 * It's empty, there is no such record. 541 * It's empty, there is no such record.
636 */ 542 */
637 if (!block->bb_h.bb_numrecs) 543 if (!block->bb_numrecs)
638 return 0; 544 return 0;
639 /* 545 /*
640 * Set the ptr value to 1, that's the first record/key. 546 * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
647 * Change the cursor to point to the last record in the current block 553 * Change the cursor to point to the last record in the current block
648 * at the given level. Other levels are unaffected. 554 * at the given level. Other levels are unaffected.
649 */ 555 */
650int /* success=1, failure=0 */ 556STATIC int /* success=1, failure=0 */
651xfs_btree_lastrec( 557xfs_btree_lastrec(
652 xfs_btree_cur_t *cur, /* btree cursor */ 558 xfs_btree_cur_t *cur, /* btree cursor */
653 int level) /* level to change */ 559 int level) /* level to change */
654{ 560{
655 xfs_btree_block_t *block; /* generic btree block pointer */ 561 struct xfs_btree_block *block; /* generic btree block pointer */
656 xfs_buf_t *bp; /* buffer containing block */ 562 xfs_buf_t *bp; /* buffer containing block */
657 563
658 /* 564 /*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
663 /* 569 /*
664 * It's empty, there is no such record. 570 * It's empty, there is no such record.
665 */ 571 */
666 if (!block->bb_h.bb_numrecs) 572 if (!block->bb_numrecs)
667 return 0; 573 return 0;
668 /* 574 /*
669 * Set the ptr value to numrecs, that's the last record/key. 575 * Set the ptr value to numrecs, that's the last record/key.
670 */ 576 */
671 cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs); 577 cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
672 return 1; 578 return 1;
673} 579}
674 580
@@ -817,66 +723,84 @@ xfs_btree_reada_bufs(
817 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); 723 xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
818} 724}
819 725
726STATIC int
727xfs_btree_readahead_lblock(
728 struct xfs_btree_cur *cur,
729 int lr,
730 struct xfs_btree_block *block)
731{
732 int rval = 0;
733 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
734 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
735
736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
737 xfs_btree_reada_bufl(cur->bc_mp, left, 1);
738 rval++;
739 }
740
741 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
742 xfs_btree_reada_bufl(cur->bc_mp, right, 1);
743 rval++;
744 }
745
746 return rval;
747}
748
749STATIC int
750xfs_btree_readahead_sblock(
751 struct xfs_btree_cur *cur,
752 int lr,
753 struct xfs_btree_block *block)
754{
755 int rval = 0;
756 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
757 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
758
759
760 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
761 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
762 left, 1);
763 rval++;
764 }
765
766 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
767 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
768 right, 1);
769 rval++;
770 }
771
772 return rval;
773}
774
820/* 775/*
821 * Read-ahead btree blocks, at the given level. 776 * Read-ahead btree blocks, at the given level.
822 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 777 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
823 */ 778 */
824int 779STATIC int
825xfs_btree_readahead_core( 780xfs_btree_readahead(
826 xfs_btree_cur_t *cur, /* btree cursor */ 781 struct xfs_btree_cur *cur, /* btree cursor */
827 int lev, /* level in btree */ 782 int lev, /* level in btree */
828 int lr) /* left/right bits */ 783 int lr) /* left/right bits */
829{ 784{
830 xfs_alloc_block_t *a; 785 struct xfs_btree_block *block;
831 xfs_bmbt_block_t *b; 786
832 xfs_inobt_block_t *i; 787 /*
833 int rval = 0; 788 * No readahead needed if we are at the root level and the
789 * btree root is stored in the inode.
790 */
791 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
792 (lev == cur->bc_nlevels - 1))
793 return 0;
794
795 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
796 return 0;
834 797
835 ASSERT(cur->bc_bufs[lev] != NULL);
836 cur->bc_ra[lev] |= lr; 798 cur->bc_ra[lev] |= lr;
837 switch (cur->bc_btnum) { 799 block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
838 case XFS_BTNUM_BNO: 800
839 case XFS_BTNUM_CNT: 801 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
840 a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); 802 return xfs_btree_readahead_lblock(cur, lr, block);
841 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) { 803 return xfs_btree_readahead_sblock(cur, lr, block);
842 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
843 be32_to_cpu(a->bb_leftsib), 1);
844 rval++;
845 }
846 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
847 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
848 be32_to_cpu(a->bb_rightsib), 1);
849 rval++;
850 }
851 break;
852 case XFS_BTNUM_BMAP:
853 b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
854 if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
855 xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
856 rval++;
857 }
858 if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
859 xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
860 rval++;
861 }
862 break;
863 case XFS_BTNUM_INO:
864 i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
865 if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
866 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
867 be32_to_cpu(i->bb_leftsib), 1);
868 rval++;
869 }
870 if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
871 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
872 be32_to_cpu(i->bb_rightsib), 1);
873 rval++;
874 }
875 break;
876 default:
877 ASSERT(0);
878 }
879 return rval;
880} 804}
881 805
882/* 806/*
@@ -889,7 +813,7 @@ xfs_btree_setbuf(
889 int lev, /* level in btree */ 813 int lev, /* level in btree */
890 xfs_buf_t *bp) /* new buffer to set */ 814 xfs_buf_t *bp) /* new buffer to set */
891{ 815{
892 xfs_btree_block_t *b; /* btree block */ 816 struct xfs_btree_block *b; /* btree block */
893 xfs_buf_t *obp; /* old buffer pointer */ 817 xfs_buf_t *obp; /* old buffer pointer */
894 818
895 obp = cur->bc_bufs[lev]; 819 obp = cur->bc_bufs[lev];
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
900 if (!bp) 824 if (!bp)
901 return; 825 return;
902 b = XFS_BUF_TO_BLOCK(bp); 826 b = XFS_BUF_TO_BLOCK(bp);
903 if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { 827 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
904 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) 828 if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
905 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 829 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
906 if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) 830 if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
912 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 836 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
913 } 837 }
914} 838}
839
840STATIC int
841xfs_btree_ptr_is_null(
842 struct xfs_btree_cur *cur,
843 union xfs_btree_ptr *ptr)
844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK;
847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849}
850
851STATIC void
852xfs_btree_set_ptr_null(
853 struct xfs_btree_cur *cur,
854 union xfs_btree_ptr *ptr)
855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK);
858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860}
861
862/*
863 * Get/set/init sibling pointers
864 */
865STATIC void
866xfs_btree_get_sibling(
867 struct xfs_btree_cur *cur,
868 struct xfs_btree_block *block,
869 union xfs_btree_ptr *ptr,
870 int lr)
871{
872 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
873
874 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
875 if (lr == XFS_BB_RIGHTSIB)
876 ptr->l = block->bb_u.l.bb_rightsib;
877 else
878 ptr->l = block->bb_u.l.bb_leftsib;
879 } else {
880 if (lr == XFS_BB_RIGHTSIB)
881 ptr->s = block->bb_u.s.bb_rightsib;
882 else
883 ptr->s = block->bb_u.s.bb_leftsib;
884 }
885}
886
887STATIC void
888xfs_btree_set_sibling(
889 struct xfs_btree_cur *cur,
890 struct xfs_btree_block *block,
891 union xfs_btree_ptr *ptr,
892 int lr)
893{
894 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
895
896 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
897 if (lr == XFS_BB_RIGHTSIB)
898 block->bb_u.l.bb_rightsib = ptr->l;
899 else
900 block->bb_u.l.bb_leftsib = ptr->l;
901 } else {
902 if (lr == XFS_BB_RIGHTSIB)
903 block->bb_u.s.bb_rightsib = ptr->s;
904 else
905 block->bb_u.s.bb_leftsib = ptr->s;
906 }
907}
908
909STATIC void
910xfs_btree_init_block(
911 struct xfs_btree_cur *cur,
912 int level,
913 int numrecs,
914 struct xfs_btree_block *new) /* new block */
915{
916 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
917 new->bb_level = cpu_to_be16(level);
918 new->bb_numrecs = cpu_to_be16(numrecs);
919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
926 }
927}
928
929/*
930 * Return true if ptr is the last record in the btree and
931 * we need to track updateѕ to this record. The decision
932 * will be further refined in the update_lastrec method.
933 */
934STATIC int
935xfs_btree_is_lastrec(
936 struct xfs_btree_cur *cur,
937 struct xfs_btree_block *block,
938 int level)
939{
940 union xfs_btree_ptr ptr;
941
942 if (level > 0)
943 return 0;
944 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
945 return 0;
946
947 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
948 if (!xfs_btree_ptr_is_null(cur, &ptr))
949 return 0;
950 return 1;
951}
952
953STATIC void
954xfs_btree_buf_to_ptr(
955 struct xfs_btree_cur *cur,
956 struct xfs_buf *bp,
957 union xfs_btree_ptr *ptr)
958{
959 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp)));
962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
964 XFS_BUF_ADDR(bp)));
965 }
966}
967
968STATIC xfs_daddr_t
969xfs_btree_ptr_to_daddr(
970 struct xfs_btree_cur *cur,
971 union xfs_btree_ptr *ptr)
972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else {
978 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
979 ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
980
981 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
982 be32_to_cpu(ptr->s));
983 }
984}
985
986STATIC void
987xfs_btree_set_refs(
988 struct xfs_btree_cur *cur,
989 struct xfs_buf *bp)
990{
991 switch (cur->bc_btnum) {
992 case XFS_BTNUM_BNO:
993 case XFS_BTNUM_CNT:
994 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
995 break;
996 case XFS_BTNUM_INO:
997 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
998 break;
999 case XFS_BTNUM_BMAP:
1000 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
1001 break;
1002 default:
1003 ASSERT(0);
1004 }
1005}
1006
1007STATIC int
1008xfs_btree_get_buf_block(
1009 struct xfs_btree_cur *cur,
1010 union xfs_btree_ptr *ptr,
1011 int flags,
1012 struct xfs_btree_block **block,
1013 struct xfs_buf **bpp)
1014{
1015 struct xfs_mount *mp = cur->bc_mp;
1016 xfs_daddr_t d;
1017
1018 /* need to sort out how callers deal with failures first */
1019 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1020
1021 d = xfs_btree_ptr_to_daddr(cur, ptr);
1022 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1023 mp->m_bsize, flags);
1024
1025 ASSERT(*bpp);
1026 ASSERT(!XFS_BUF_GETERROR(*bpp));
1027
1028 *block = XFS_BUF_TO_BLOCK(*bpp);
1029 return 0;
1030}
1031
1032/*
1033 * Read in the buffer at the given ptr and return the buffer and
1034 * the block pointer within the buffer.
1035 */
1036STATIC int
1037xfs_btree_read_buf_block(
1038 struct xfs_btree_cur *cur,
1039 union xfs_btree_ptr *ptr,
1040 int level,
1041 int flags,
1042 struct xfs_btree_block **block,
1043 struct xfs_buf **bpp)
1044{
1045 struct xfs_mount *mp = cur->bc_mp;
1046 xfs_daddr_t d;
1047 int error;
1048
1049 /* need to sort out how callers deal with failures first */
1050 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1051
1052 d = xfs_btree_ptr_to_daddr(cur, ptr);
1053 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1054 mp->m_bsize, flags, bpp);
1055 if (error)
1056 return error;
1057
1058 ASSERT(*bpp != NULL);
1059 ASSERT(!XFS_BUF_GETERROR(*bpp));
1060
1061 xfs_btree_set_refs(cur, *bpp);
1062 *block = XFS_BUF_TO_BLOCK(*bpp);
1063
1064 error = xfs_btree_check_block(cur, *block, level, *bpp);
1065 if (error)
1066 xfs_trans_brelse(cur->bc_tp, *bpp);
1067 return error;
1068}
1069
1070/*
1071 * Copy keys from one btree block to another.
1072 */
1073STATIC void
1074xfs_btree_copy_keys(
1075 struct xfs_btree_cur *cur,
1076 union xfs_btree_key *dst_key,
1077 union xfs_btree_key *src_key,
1078 int numkeys)
1079{
1080 ASSERT(numkeys >= 0);
1081 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1082}
1083
1084/*
1085 * Copy records from one btree block to another.
1086 */
1087STATIC void
1088xfs_btree_copy_recs(
1089 struct xfs_btree_cur *cur,
1090 union xfs_btree_rec *dst_rec,
1091 union xfs_btree_rec *src_rec,
1092 int numrecs)
1093{
1094 ASSERT(numrecs >= 0);
1095 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1096}
1097
1098/*
1099 * Copy block pointers from one btree block to another.
1100 */
1101STATIC void
1102xfs_btree_copy_ptrs(
1103 struct xfs_btree_cur *cur,
1104 union xfs_btree_ptr *dst_ptr,
1105 union xfs_btree_ptr *src_ptr,
1106 int numptrs)
1107{
1108 ASSERT(numptrs >= 0);
1109 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1110}
1111
1112/*
1113 * Shift keys one index left/right inside a single btree block.
1114 */
1115STATIC void
1116xfs_btree_shift_keys(
1117 struct xfs_btree_cur *cur,
1118 union xfs_btree_key *key,
1119 int dir,
1120 int numkeys)
1121{
1122 char *dst_key;
1123
1124 ASSERT(numkeys >= 0);
1125 ASSERT(dir == 1 || dir == -1);
1126
1127 dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1128 memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1129}
1130
1131/*
1132 * Shift records one index left/right inside a single btree block.
1133 */
1134STATIC void
1135xfs_btree_shift_recs(
1136 struct xfs_btree_cur *cur,
1137 union xfs_btree_rec *rec,
1138 int dir,
1139 int numrecs)
1140{
1141 char *dst_rec;
1142
1143 ASSERT(numrecs >= 0);
1144 ASSERT(dir == 1 || dir == -1);
1145
1146 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1147 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1148}
1149
1150/*
1151 * Shift block pointers one index left/right inside a single btree block.
1152 */
1153STATIC void
1154xfs_btree_shift_ptrs(
1155 struct xfs_btree_cur *cur,
1156 union xfs_btree_ptr *ptr,
1157 int dir,
1158 int numptrs)
1159{
1160 char *dst_ptr;
1161
1162 ASSERT(numptrs >= 0);
1163 ASSERT(dir == 1 || dir == -1);
1164
1165 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1166 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1167}
1168
1169/*
1170 * Log key values from the btree block.
1171 */
1172STATIC void
1173xfs_btree_log_keys(
1174 struct xfs_btree_cur *cur,
1175 struct xfs_buf *bp,
1176 int first,
1177 int last)
1178{
1179 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1180 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1181
1182 if (bp) {
1183 xfs_trans_log_buf(cur->bc_tp, bp,
1184 xfs_btree_key_offset(cur, first),
1185 xfs_btree_key_offset(cur, last + 1) - 1);
1186 } else {
1187 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1188 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1189 }
1190
1191 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1192}
1193
/*
 * Log record values from the btree block.
 *
 * Marks the byte range covering records @first through @last
 * (inclusive, 1-based) in buffer @bp dirty in the current transaction.
 * Note there is no NULL-bp (inode-root) case here, unlike
 * xfs_btree_log_keys/xfs_btree_log_ptrs: a buffer must be supplied.
 */
void
xfs_btree_log_recs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			first,
	int			last)
{
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);

	/* From the start of record "first" to the last byte of "last". */
	xfs_trans_log_buf(cur->bc_tp, bp,
			  xfs_btree_rec_offset(cur, first),
			  xfs_btree_rec_offset(cur, last + 1) - 1);

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1213
1214/*
1215 * Log block pointer fields from a btree block (nonleaf).
1216 */
1217STATIC void
1218xfs_btree_log_ptrs(
1219 struct xfs_btree_cur *cur, /* btree cursor */
1220 struct xfs_buf *bp, /* buffer containing btree block */
1221 int first, /* index of first pointer to log */
1222 int last) /* index of last pointer to log */
1223{
1224 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1225 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1226
1227 if (bp) {
1228 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
1229 int level = xfs_btree_get_level(block);
1230
1231 xfs_trans_log_buf(cur->bc_tp, bp,
1232 xfs_btree_ptr_offset(cur, first, level),
1233 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1234 } else {
1235 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1236 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1237 }
1238
1239 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1240}
1241
/*
 * Log fields from a btree block header.
 *
 * @fields is a mask of XFS_BB_... bits selecting which header fields
 * to log.  For a buffer-backed block the mask is converted to a byte
 * range via the offset table matching the block's pointer format and
 * logged against the buffer; for an inode root (@bp == NULL) the
 * whole fork root is logged against the inode instead.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	int			fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		XFS_BTREE_SBLOCK_LEN	/* sentinel: total header length */
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		XFS_BTREE_LBLOCK_LEN	/* sentinel: total header length */
	};

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGBI(cur, bp, fields);

	if (bp) {
		/* Map the field mask to a contiguous byte range and log it. */
		xfs_btree_offsets(fields,
				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
					loffsets : soffsets,
				  XFS_BB_NUM_BITS, &first, &last);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		/* Root lives in the inode fork; log the fork root. */
		xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
				xfs_ilog_fbroot(cur->bc_private.b.whichfork));
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
}
1286
/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 *
 * On return *stat is 1 if the cursor now points at the next record,
 * or 0 if the cursor was already at the last record of the tree at
 * this level.  Returns 0 on success or a positive error code.
 */
int						/* error */
xfs_btree_increment(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	union xfs_btree_ptr	ptr;
	struct xfs_buf		*bp;
	int			error;		/* error return value */
	int			lev;		/* level we walked up to */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the right at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* We're done if we remain in the block after the increment. */
	if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
		goto out1;

	/* Fail if we just went off the right edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, increment);

	/*
	 * March up the tree incrementing pointers.
	 * Stop when we don't go off the right edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, lev, bp);
		if (error)
			goto error0;
#endif

		if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
			break;

		/* Read-ahead the right block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or have the tree root in an inode.
	 * (The right-sibling check above should have caught a genuine
	 * right edge for disk-rooted trees, hence the assert.)
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			goto out0;
		ASSERT(0);
		error = EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.  Each level's cursor slot is reset
	 * to the first entry of the newly-read block.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
		error = xfs_btree_read_buf_block(cur, ptrp, --lev,
							0, &block, &bp);
		if (error)
			goto error0;

		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_ptrs[lev] = 1;
	}
out1:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1394
/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 *
 * On return *stat is 1 if the cursor now points at the previous
 * record, or 0 if the cursor was already at the first record of the
 * tree at this level.  Returns 0 on success or a positive error code.
 */
int						/* error */
xfs_btree_decrement(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	struct xfs_btree_block	*block;
	xfs_buf_t		*bp;
	int			error;		/* error return value */
	int			lev;		/* level we walked up to */
	union xfs_btree_ptr	ptr;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	ASSERT(level < cur->bc_nlevels);

	/* Read-ahead to the left at this level. */
	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

	/* We're done if we remain in the block after the decrement. */
	if (--cur->bc_ptrs[level] > 0)
		goto out1;

	/* Get a pointer to the btree block. */
	block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;
#endif

	/* Fail if we just went off the left edge of the tree. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &ptr))
		goto out0;

	XFS_BTREE_STATS_INC(cur, decrement);

	/*
	 * March up the tree decrementing pointers.
	 * Stop when we don't go off the left edge of a block.
	 */
	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
		if (--cur->bc_ptrs[lev] > 0)
			break;
		/* Read-ahead the left block for the next loop. */
		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
	}

	/*
	 * If we went off the root then we are either seriously
	 * confused or the root of the tree is in an inode.
	 */
	if (lev == cur->bc_nlevels) {
		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
			goto out0;
		ASSERT(0);
		error = EFSCORRUPTED;
		goto error0;
	}
	ASSERT(lev < cur->bc_nlevels);

	/*
	 * Now walk back down the tree, fixing up the cursor's buffer
	 * pointers and key numbers.  Each level's cursor slot is reset
	 * to the last entry of the newly-read block.
	 */
	for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
		union xfs_btree_ptr	*ptrp;

		ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
		error = xfs_btree_read_buf_block(cur, ptrp, --lev,
							0, &block, &bp);
		if (error)
			goto error0;
		xfs_btree_setbuf(cur, lev, bp);
		cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
	}
out1:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1492
1493STATIC int
1494xfs_btree_lookup_get_block(
1495 struct xfs_btree_cur *cur, /* btree cursor */
1496 int level, /* level in the btree */
1497 union xfs_btree_ptr *pp, /* ptr to btree block */
1498 struct xfs_btree_block **blkp) /* return btree block */
1499{
1500 struct xfs_buf *bp; /* buffer pointer for btree block */
1501 int error = 0;
1502
1503 /* special case the root block if in an inode */
1504 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1505 (level == cur->bc_nlevels - 1)) {
1506 *blkp = xfs_btree_get_iroot(cur);
1507 return 0;
1508 }
1509
1510 /*
1511 * If the old buffer at this level for the disk address we are
1512 * looking for re-use it.
1513 *
1514 * Otherwise throw it away and get a new one.
1515 */
1516 bp = cur->bc_bufs[level];
1517 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1518 *blkp = XFS_BUF_TO_BLOCK(bp);
1519 return 0;
1520 }
1521
1522 error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1523 if (error)
1524 return error;
1525
1526 xfs_btree_setbuf(cur, level, bp);
1527 return 0;
1528}
1529
1530/*
1531 * Get current search key. For level 0 we don't actually have a key
1532 * structure so we make one up from the record. For all other levels
1533 * we just return the right key.
1534 */
1535STATIC union xfs_btree_key *
1536xfs_lookup_get_search_key(
1537 struct xfs_btree_cur *cur,
1538 int level,
1539 int keyno,
1540 struct xfs_btree_block *block,
1541 union xfs_btree_key *kp)
1542{
1543 if (level == 0) {
1544 cur->bc_ops->init_key_from_rec(kp,
1545 xfs_btree_rec_addr(cur, keyno, block));
1546 return kp;
1547 }
1548
1549 return xfs_btree_key_addr(cur, keyno, block);
1550}
1551
/*
 * Lookup the record.  The cursor is made to point to it, based on dir.
 * Return 0 if can't find any such record, 1 for success.
 *
 * @dir selects which record satisfies the lookup: the last one <= the
 * search key (XFS_LOOKUP_LE), the first one >= it (XFS_LOOKUP_GE), or
 * an exact match (XFS_LOOKUP_EQ).  The search key itself is implicit
 * in the cursor (compared via bc_ops->key_diff).  *stat reports
 * found/not-found; the function's return value is 0 or an error code.
 */
int					/* error */
xfs_btree_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_lookup_t		dir,	/* <=, ==, or >= */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* current btree block */
	__int64_t		diff;	/* difference for the current key */
	int			error;	/* error return value */
	int			keyno;	/* current key number */
	int			level;	/* level in the btree */
	union xfs_btree_ptr	*pp;	/* ptr to btree block */
	union xfs_btree_ptr	ptr;	/* ptr to btree block */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, dir);

	XFS_BTREE_STATS_INC(cur, lookup);

	block = NULL;
	keyno = 0;

	/* initialise start pointer from cursor */
	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
	pp = &ptr;

	/*
	 * Iterate over each level in the btree, starting at the root.
	 * For each level above the leaves, find the key we need, based
	 * on the lookup record, then follow the corresponding block
	 * pointer down to the next level.
	 */
	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
		/* Get the block we need to do the lookup on. */
		error = xfs_btree_lookup_get_block(cur, level, pp, &block);
		if (error)
			goto error0;

		if (diff == 0) {
			/*
			 * If we already had a key match at a higher level, we
			 * know we need to use the first entry in this block.
			 */
			keyno = 1;
		} else {
			/* Otherwise search this block. Do a binary search. */

			int	high;	/* high entry number */
			int	low;	/* low entry number */

			/* Set low and high entry numbers, 1-based. */
			low = 1;
			high = xfs_btree_get_numrecs(block);
			if (!high) {
				/* Block is empty, must be an empty leaf. */
				ASSERT(level == 0 && cur->bc_nlevels == 1);

				/* LE lookup fails (0); GE/EQ point at slot 1. */
				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
				XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
				*stat = 0;
				return 0;
			}

			/* Binary search the block. */
			while (low <= high) {
				union xfs_btree_key	key;
				union xfs_btree_key	*kp;

				XFS_BTREE_STATS_INC(cur, compare);

				/* keyno is average of low and high. */
				keyno = (low + high) >> 1;

				/* Get current search key */
				kp = xfs_lookup_get_search_key(cur, level,
						keyno, block, &key);

				/*
				 * Compute difference to get next direction:
				 *  - less than, move right
				 *  - greater than, move left
				 *  - equal, we're done
				 */
				diff = cur->bc_ops->key_diff(cur, kp);
				if (diff < 0)
					low = keyno + 1;
				else if (diff > 0)
					high = keyno - 1;
				else
					break;
			}
		}

		/*
		 * If there are more levels, set up for the next level
		 * by getting the block number and filling in the cursor.
		 */
		if (level > 0) {
			/*
			 * If we moved left, need the previous key number,
			 * unless there isn't one.
			 */
			if (diff > 0 && --keyno < 1)
				keyno = 1;
			pp = xfs_btree_ptr_addr(cur, keyno, block);

#ifdef DEBUG
			error = xfs_btree_check_ptr(cur, pp, 0, level);
			if (error)
				goto error0;
#endif
			cur->bc_ptrs[level] = keyno;
		}
	}

	/* Done with the search. See if we need to adjust the results. */
	if (dir != XFS_LOOKUP_LE && diff < 0) {
		keyno++;
		/*
		 * If ge search and we went off the end of the block, but it's
		 * not the last block, we're in the wrong block.
		 */
		xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
		if (dir == XFS_LOOKUP_GE &&
		    keyno > xfs_btree_get_numrecs(block) &&
		    !xfs_btree_ptr_is_null(cur, &ptr)) {
			int	i;

			/* Move to the first record of the next block. */
			cur->bc_ptrs[0] = keyno;
			error = xfs_btree_increment(cur, 0, &i);
			if (error)
				goto error0;
			XFS_WANT_CORRUPTED_RETURN(i == 1);
			XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
			*stat = 1;
			return 0;
		}
	} else if (dir == XFS_LOOKUP_LE && diff > 0)
		keyno--;
	cur->bc_ptrs[0] = keyno;

	/* Return if we succeeded or not. */
	if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
		*stat = 0;
	else if (dir != XFS_LOOKUP_EQ || diff == 0)
		*stat = 1;
	else
		*stat = 0;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1711
/*
 * Update keys at all levels from here to the root along the cursor's path.
 *
 * Starting at @level, copies @keyp into the key slot the cursor points
 * at and logs it, then repeats one level up, stopping as soon as the
 * cursor is not at the first entry of a block (higher-level keys only
 * depend on a block's first entry).
 */
STATIC int
xfs_btree_updkey(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*keyp,
	int			level)
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	union xfs_btree_key	*kp;
	int			ptr;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGIK(cur, level, keyp);

	/* Inode-rooted trees have no keys at level 0. */
	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);

	/*
	 * Go up the tree from this level toward the root.
	 * At each level, update the key value to the value input.
	 * Stop when we reach a level where the cursor isn't pointing
	 * at the first entry in the block.
	 */
	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
#ifdef DEBUG
		int		error;
#endif
		block = xfs_btree_get_block(cur, level, &bp);
#ifdef DEBUG
		error = xfs_btree_check_block(cur, block, level, bp);
		if (error) {
			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
			return error;
		}
#endif
		/* Re-read ptr: only continue while we sit on entry 1. */
		ptr = cur->bc_ptrs[level];
		kp = xfs_btree_key_addr(cur, ptr, block);
		xfs_btree_copy_keys(cur, kp, keyp, 1);
		xfs_btree_log_keys(cur, bp, ptr, ptr);
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;
}
1758
/*
 * Update the record referred to by cur to the value in the
 * given record. This either works (return 0) or gets an
 * EFSCORRUPTED error.
 *
 * The cursor must already point at the target leaf record (e.g. from
 * a prior lookup).  If the updated record is the first in its block,
 * the new key is propagated to the parent levels; if it is the last
 * record at the tree's right edge and the btree tracks that, the
 * owner's last-record bookkeeping is updated too.
 */
int
xfs_btree_update(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	struct xfs_btree_block	*block;
	struct xfs_buf		*bp;
	int			error;
	int			ptr;
	union xfs_btree_rec	*rp;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGR(cur, rec);

	/* Pick up the current block. */
	block = xfs_btree_get_block(cur, 0, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, 0, bp);
	if (error)
		goto error0;
#endif
	/* Get the address of the rec to be updated. */
	ptr = cur->bc_ptrs[0];
	rp = xfs_btree_rec_addr(cur, ptr, block);

	/* Fill in the new contents and log them. */
	xfs_btree_copy_recs(cur, rp, rec, 1);
	xfs_btree_log_recs(cur, bp, ptr, ptr);

	/*
	 * If we are tracking the last record in the tree and
	 * we are at the far right edge of the tree, update it.
	 */
	if (xfs_btree_is_lastrec(cur, block, 0)) {
		cur->bc_ops->update_lastrec(cur, block, rec,
					    ptr, LASTREC_UPDATE);
	}

	/* Updating first rec in leaf. Pass new key value up to our parent. */
	if (ptr == 1) {
		union xfs_btree_key	key;

		cur->bc_ops->init_key_from_rec(&key, rec);
		error = xfs_btree_updkey(cur, &key, 1);
		if (error)
			goto error0;
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
1820
/*
 * Move 1 record left from cur/level if possible.
 * Update cur to reflect the new path.
 *
 * Shifts the first entry of the cursor's block at @level into the end
 * of its left sibling, making room in this block.  *stat is 1 if an
 * entry was moved, 0 if not possible (no left sibling, the sibling is
 * full, the block is an inode root, or the cursor sits on the entry
 * that would move).  Returns 0 or a positive error code.
 */
STATIC int					/* error */
xfs_btree_lshift(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	union xfs_btree_key	key;		/* btree key */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	int			lrecs;		/* left record count */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	int			rrecs;		/* right record count */
	union xfs_btree_ptr	lptr;		/* left btree pointer */
	union xfs_btree_key	*rkp = NULL;	/* right btree key */
	union xfs_btree_ptr	*rpp = NULL;	/* right address pointer */
	union xfs_btree_rec	*rrp = NULL;	/* right record pointer */
	int			error;		/* error return value */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	/* An inode root has no siblings to shift into. */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1)
		goto out0;

	/* Set up variables for this block as "right". */
	right = xfs_btree_get_block(cur, level, &rbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, right, level, rbp);
	if (error)
		goto error0;
#endif

	/* If we've got no left sibling then we can't shift an entry left. */
	xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	if (xfs_btree_ptr_is_null(cur, &lptr))
		goto out0;

	/*
	 * If the cursor entry is the one that would be moved, don't
	 * do it... it's too complicated.
	 */
	if (cur->bc_ptrs[level] <= 1)
		goto out0;

	/* Set up the left neighbor as "left". */
	error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
	if (error)
		goto error0;

	/* If it's full, it can't take another entry. */
	lrecs = xfs_btree_get_numrecs(left);
	if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
		goto out0;

	rrecs = xfs_btree_get_numrecs(right);

	/*
	 * We add one entry to the left side and remove one for the right side.
	 * Account for it here, the changes will be updated on disk and logged
	 * later.
	 */
	lrecs++;
	rrecs--;

	XFS_BTREE_STATS_INC(cur, lshift);
	XFS_BTREE_STATS_ADD(cur, moves, 1);

	/*
	 * If non-leaf, copy a key and a ptr to the left block.
	 * Log the changes to the left block.
	 */
	if (level > 0) {
		/* It's a non-leaf. Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */

		lkp = xfs_btree_key_addr(cur, lrecs, left);
		rkp = xfs_btree_key_addr(cur, 1, right);

		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
		rpp = xfs_btree_ptr_addr(cur, 1, right);
#ifdef DEBUG
		error = xfs_btree_check_ptr(cur, rpp, 0, level);
		if (error)
			goto error0;
#endif
		xfs_btree_copy_keys(cur, lkp, rkp, 1);
		xfs_btree_copy_ptrs(cur, lpp, rpp, 1);

		xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
		xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->keys_inorder(cur,
			xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
	} else {
		/* It's a leaf. Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */

		lrp = xfs_btree_rec_addr(cur, lrecs, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, lrp, rrp, 1);
		xfs_btree_log_recs(cur, lbp, lrecs, lrecs);

		ASSERT(cur->bc_ops->recs_inorder(cur,
			xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
	}

	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Slide the contents of right down one entry.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
	if (level > 0) {
		/* It's a nonleaf. operate on keys and ptrs */
#ifdef DEBUG
		int			i;	/* loop index */

		for (i = 0; i < rrecs; i++) {
			error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
			if (error)
				goto error0;
		}
#endif
		xfs_btree_shift_keys(cur,
				xfs_btree_key_addr(cur, 2, right),
				-1, rrecs);
		xfs_btree_shift_ptrs(cur,
				xfs_btree_ptr_addr(cur, 2, right),
				-1, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
	} else {
		/* It's a leaf. operate on records */
		xfs_btree_shift_recs(cur,
			xfs_btree_rec_addr(cur, 2, right),
			-1, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);

		/*
		 * If it's the first record in the block, we'll need a key
		 * structure to pass up to the next level (updkey).
		 */
		cur->bc_ops->init_key_from_rec(&key,
			xfs_btree_rec_addr(cur, 1, right));
		rkp = &key;
	}

	/* Update the parent key values of right. */
	error = xfs_btree_updkey(cur, rkp, level + 1);
	if (error)
		goto error0;

	/* Slide the cursor value left one. */
	cur->bc_ptrs[level]--;

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
2003
/*
 * Move 1 record right from cur/level if possible.
 * Update cur to reflect the new path.
 *
 * Shifts the last entry of the cursor's block at @level into the front
 * of its right sibling, making room in this block.  *stat is 1 if an
 * entry was moved, 0 if not possible (no right sibling, the sibling is
 * full, the block is an inode root, or the cursor sits on the entry
 * that would move).  A temporary duplicate cursor is used to update
 * the right sibling's parent key.  Returns 0 or a positive error code.
 */
STATIC int					/* error */
xfs_btree_rshift(
	struct xfs_btree_cur	*cur,
	int			level,
	int			*stat)		/* success/failure */
{
	union xfs_btree_key	key;		/* btree key */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
	union xfs_btree_ptr	rptr;		/* right block pointer */
	union xfs_btree_key	*rkp;		/* right btree key */
	int			rrecs;		/* right record count */
	int			lrecs;		/* left record count */
	int			error;		/* error return value */
	int			i;		/* loop counter */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGI(cur, level);

	/* An inode root has no siblings to shift into. */
	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level == cur->bc_nlevels - 1))
		goto out0;

	/* Set up variables for this block as "left". */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	/* If we've got no right sibling then we can't shift an entry right. */
	xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
	if (xfs_btree_ptr_is_null(cur, &rptr))
		goto out0;

	/*
	 * If the cursor entry is the one that would be moved, don't
	 * do it... it's too complicated.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	if (cur->bc_ptrs[level] >= lrecs)
		goto out0;

	/* Set up the right neighbor as "right". */
	error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
	if (error)
		goto error0;

	/* If it's full, it can't take another entry. */
	rrecs = xfs_btree_get_numrecs(right);
	if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, rshift);
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/*
	 * Make a hole at the start of the right neighbor block, then
	 * copy the last left block entry to the hole.
	 */
	if (level > 0) {
		/* It's a nonleaf. make a hole in the keys and ptrs */
		union xfs_btree_key	*lkp;
		union xfs_btree_ptr	*lpp;
		union xfs_btree_ptr	*rpp;

		lkp = xfs_btree_key_addr(cur, lrecs, left);
		lpp = xfs_btree_ptr_addr(cur, lrecs, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

#ifdef DEBUG
		for (i = rrecs - 1; i >= 0; i--) {
			error = xfs_btree_check_ptr(cur, rpp, i, level);
			if (error)
				goto error0;
		}
#endif

		xfs_btree_shift_keys(cur, rkp, 1, rrecs);
		xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);

#ifdef DEBUG
		error = xfs_btree_check_ptr(cur, lpp, 0, level);
		if (error)
			goto error0;
#endif

		/* Now put the new data in, and log it. */
		xfs_btree_copy_keys(cur, rkp, lkp, 1);
		xfs_btree_copy_ptrs(cur, rpp, lpp, 1);

		xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);

		ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
			xfs_btree_key_addr(cur, 2, right)));
	} else {
		/* It's a leaf. make a hole in the records */
		union xfs_btree_rec	*lrp;
		union xfs_btree_rec	*rrp;

		lrp = xfs_btree_rec_addr(cur, lrecs, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_shift_recs(cur, rrp, 1, rrecs);

		/* Now put the new data in, and log it. */
		xfs_btree_copy_recs(cur, rrp, lrp, 1);
		xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);

		/* Build the key for updkey from the new first record. */
		cur->bc_ops->init_key_from_rec(&key, rrp);
		rkp = &key;

		ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
			xfs_btree_rec_addr(cur, 2, right)));
	}

	/*
	 * Decrement and log left's numrecs, bump and log right's numrecs.
	 */
	xfs_btree_set_numrecs(left, --lrecs);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);

	xfs_btree_set_numrecs(right, ++rrecs);
	xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);

	/*
	 * Using a temporary cursor, update the parent key values of the
	 * block on the right.
	 */
	error = xfs_btree_dup_cursor(cur, &tcur);
	if (error)
		goto error0;
	i = xfs_btree_lastrec(tcur, level);
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);

	error = xfs_btree_increment(tcur, level, &i);
	if (error)
		goto error1;

	error = xfs_btree_updkey(tcur, rkp, level + 1);
	if (error)
		goto error1;

	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;

error1:
	/* tcur was duplicated successfully; tear it down on its errors. */
	XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
	return error;
}
2178
/*
 * Split cur/level block in half.
 * Return new block number and the key to its first
 * record (to be inserted into parent).
 */
STATIC int					/* error */
xfs_btree_split(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			level,		/* level of the block to split */
	union xfs_btree_ptr	*ptrp,		/* out: ptr to the new right block */
	union xfs_btree_key	*key,		/* out: key of right block's 1st entry */
	struct xfs_btree_cur	**curp,		/* out: new cursor, if more levels */
	int			*stat)		/* success/failure */
{
	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
	struct xfs_btree_block	*rrblock;	/* right-right btree block */
	int			lrecs;		/* records kept in the left block */
	int			rrecs;		/* records moved to the right block */
	int			src_index;	/* index of first entry to move */
	int			error;		/* error return value */
#ifdef DEBUG
	int			i;
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);

	XFS_BTREE_STATS_INC(cur, split);

	/* Set up left block (current one). */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	xfs_btree_buf_to_ptr(cur, lbp, &lptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block as "right". */
	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
	if (error)
		goto error0;

	/* Fill in the btree header for the new right block. */
	xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);

	/*
	 * Split the entries between the old and the new block evenly.
	 * Make sure that if there's an odd number of entries now, that
	 * each new block will have the same number of entries.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	rrecs = lrecs / 2;
	/*
	 * If the count is odd, put the extra entry in whichever half the
	 * cursor's insertion point falls into, so both ends keep room.
	 */
	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
		rrecs++;
	src_index = (lrecs - rrecs + 1);

	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/*
	 * Copy btree block entries from the left block over to the
	 * new block, the right. Update the right block and log the
	 * changes.
	 */
	if (level > 0) {
		/* It's a non-leaf. Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, src_index, left);
		lpp = xfs_btree_ptr_addr(cur, src_index, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

#ifdef DEBUG
		/* Sanity-check the child pointers before moving them. */
		for (i = src_index; i < rrecs; i++) {
			error = xfs_btree_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}
#endif

		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);

		/* Grab the keys to the entries moved to the right block */
		xfs_btree_copy_keys(cur, key, rkp, 1);
	} else {
		/* It's a leaf. Move records. */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, src_index, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);

		/* Leaves have no keys; derive one from the first record. */
		cur->bc_ops->init_key_from_rec(key,
			xfs_btree_rec_addr(cur, 1, right));
	}


	/*
	 * Find the left block number by looking in the buffer.
	 * Adjust numrecs, sibling pointers.
	 */
	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);

	lrecs -= rrecs;
	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);

	/* The right block is brand new; log everything in it. */
	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/*
	 * If there's a block to the new block's right, make that block
	 * point back to right instead of to left.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
		error = xfs_btree_read_buf_block(cur, &rrptr, level,
							0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}
	/*
	 * If the cursor is really in the right block, move it there.
	 * If it's just pointing past the last entry in left, then we'll
	 * insert there, so don't change anything in that case.
	 */
	if (cur->bc_ptrs[level] > lrecs + 1) {
		xfs_btree_setbuf(cur, level, rbp);
		cur->bc_ptrs[level] -= lrecs;
	}
	/*
	 * If there are more levels, we'll need another cursor which refers
	 * the right block, no matter where this cursor was.
	 */
	if (level + 1 < cur->bc_nlevels) {
		error = xfs_btree_dup_cursor(cur, curp);
		if (error)
			goto error0;
		/* Point the new cursor past the split point in the parent. */
		(*curp)->bc_ptrs[level + 1]++;
	}
	*ptrp = rptr;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;
out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
2364
/*
 * Copy the old inode root contents into a real block and make the
 * broot point to it.
 */
int						/* error */
xfs_btree_new_iroot(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	struct xfs_buf		*cbp;		/* buffer for cblock */
	struct xfs_btree_block	*block;		/* btree block */
	struct xfs_btree_block	*cblock;	/* child btree block */
	union xfs_btree_key	*ckp;		/* child key pointer */
	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
	union xfs_btree_key	*kp;		/* pointer to btree key */
	union xfs_btree_ptr	*pp;		/* pointer to block addr */
	union xfs_btree_ptr	nptr;		/* new block addr */
	int			level;		/* btree level */
	int			error;		/* error return code */
#ifdef DEBUG
	int			i;		/* loop counter */
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_STATS_INC(cur, newroot);

	/* Only trees rooted in an inode fork can take this path. */
	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);

	level = cur->bc_nlevels - 1;

	block = xfs_btree_get_iroot(cur);
	pp = xfs_btree_ptr_addr(cur, 1, block);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
	if (error)
		goto error0;
	if (*stat == 0) {
		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		return 0;
	}
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Copy the root into a real block. */
	error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
	if (error)
		goto error0;

	memcpy(cblock, block, xfs_btree_block_len(cur));

	/* The tree grows one level; the root keeps a single entry. */
	be16_add_cpu(&block->bb_level, 1);
	xfs_btree_set_numrecs(block, 1);
	cur->bc_nlevels++;
	cur->bc_ptrs[level + 1] = 1;

	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));

	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
	/* Sanity-check each child pointer before it goes into cblock. */
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		error = xfs_btree_check_ptr(cur, pp, i, level);
		if (error)
			goto error0;
	}
#endif
	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));

#ifdef DEBUG
	error = xfs_btree_check_ptr(cur, &nptr, 0, level);
	if (error)
		goto error0;
#endif
	/* The in-inode root now holds one pointer: the new child block. */
	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);

	/* Shrink the root fork allocation down to its single entry. */
	xfs_iroot_realloc(cur->bc_private.b.ip,
			  1 - xfs_btree_get_numrecs(cblock),
			  cur->bc_private.b.whichfork);

	xfs_btree_setbuf(cur, level, cbp);

	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));

	*logflags |=
		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
	*stat = 1;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
2465
/*
 * Allocate a new root block, fill it in.
 */
STATIC int				/* error */
xfs_btree_new_root(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* one half of the old root block */
	struct xfs_buf		*bp;	/* buffer containing block */
	int			error;	/* error return value */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	struct xfs_buf		*nbp;	/* new (root) buffer */
	struct xfs_btree_block	*new;	/* new (root) btree block */
	int			nptr;	/* new value for key index, 1 or 2 */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rptr;	/* right block ptr */
	union xfs_btree_ptr	lptr;	/* left block ptr */

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_STATS_INC(cur, newroot);

	/* initialise our start point from the cursor */
	cur->bc_ops->init_ptr_from_cur(cur, &rptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block. */
	error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
	if (error)
		goto error0;

	/* Set the root in the holding structure increasing the level by 1. */
	cur->bc_ops->set_root(cur, &lptr, 1);

	/*
	 * At the previous root level there are now two blocks: the old root,
	 * and the new block generated when it was split. We don't know which
	 * one the cursor is pointing at, so we set up variables "left" and
	 * "right" for each case.
	 */
	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
	if (error)
		goto error0;
#endif

	/* A non-null right sibling means our block is the left half. */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/* Our block is left, pick up the right block. */
		lbp = bp;
		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
		left = block;
		error = xfs_btree_read_buf_block(cur, &rptr,
					cur->bc_nlevels - 1, 0, &right, &rbp);
		if (error)
			goto error0;
		bp = rbp;
		nptr = 1;
	} else {
		/* Our block is right, pick up the left block. */
		rbp = bp;
		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
		right = block;
		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
		error = xfs_btree_read_buf_block(cur, &lptr,
					cur->bc_nlevels - 1, 0, &left, &lbp);
		if (error)
			goto error0;
		bp = lbp;
		nptr = 2;
	}
	/* Fill in the new block's btree header and log it. */
	xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
			!xfs_btree_ptr_is_null(cur, &rptr));

	/* Fill in the key data in the new root. */
	if (xfs_btree_get_level(left) > 0) {
		/* Children are non-leaves: copy their first keys up. */
		xfs_btree_copy_keys(cur,
				xfs_btree_key_addr(cur, 1, new),
				xfs_btree_key_addr(cur, 1, left), 1);
		xfs_btree_copy_keys(cur,
				xfs_btree_key_addr(cur, 2, new),
				xfs_btree_key_addr(cur, 1, right), 1);
	} else {
		/* Children are leaves: derive keys from their first records. */
		cur->bc_ops->init_key_from_rec(
				xfs_btree_key_addr(cur, 1, new),
				xfs_btree_rec_addr(cur, 1, left));
		cur->bc_ops->init_key_from_rec(
				xfs_btree_key_addr(cur, 2, new),
				xfs_btree_rec_addr(cur, 1, right));
	}
	xfs_btree_log_keys(cur, nbp, 1, 2);

	/* Fill in the pointer data in the new root. */
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
	xfs_btree_log_ptrs(cur, nbp, 1, 2);

	/* Fix up the cursor. */
	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
	cur->bc_ptrs[cur->bc_nlevels] = nptr;
	cur->bc_nlevels++;
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 0;
	return 0;
}
2594
2595STATIC int
2596xfs_btree_make_block_unfull(
2597 struct xfs_btree_cur *cur, /* btree cursor */
2598 int level, /* btree level */
2599 int numrecs,/* # of recs in block */
2600 int *oindex,/* old tree index */
2601 int *index, /* new tree index */
2602 union xfs_btree_ptr *nptr, /* new btree ptr */
2603 struct xfs_btree_cur **ncur, /* new btree cursor */
2604 union xfs_btree_rec *nrec, /* new record */
2605 int *stat)
2606{
2607 union xfs_btree_key key; /* new btree key value */
2608 int error = 0;
2609
2610 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2611 level == cur->bc_nlevels - 1) {
2612 struct xfs_inode *ip = cur->bc_private.b.ip;
2613
2614 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2615 /* A root block that can be made bigger. */
2616
2617 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2618 } else {
2619 /* A root block that needs replacing */
2620 int logflags = 0;
2621
2622 error = xfs_btree_new_iroot(cur, &logflags, stat);
2623 if (error || *stat == 0)
2624 return error;
2625
2626 xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2627 }
2628
2629 return 0;
2630 }
2631
2632 /* First, try shifting an entry to the right neighbor. */
2633 error = xfs_btree_rshift(cur, level, stat);
2634 if (error || *stat)
2635 return error;
2636
2637 /* Next, try shifting an entry to the left neighbor. */
2638 error = xfs_btree_lshift(cur, level, stat);
2639 if (error)
2640 return error;
2641
2642 if (*stat) {
2643 *oindex = *index = cur->bc_ptrs[level];
2644 return 0;
2645 }
2646
2647 /*
2648 * Next, try splitting the current block in half.
2649 *
2650 * If this works we have to re-set our variables because we
2651 * could be in a different block now.
2652 */
2653 error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2654 if (error || *stat == 0)
2655 return error;
2656
2657
2658 *index = cur->bc_ptrs[level];
2659 cur->bc_ops->init_rec_from_key(&key, nrec);
2660 return 0;
2661}
2662
/*
 * Insert one record/level. Return information to the caller
 * allowing the next level up to proceed if necessary.
 */
STATIC int
xfs_btree_insrec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* level to insert record at */
	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* btree block */
	struct xfs_buf		*bp;	/* buffer for block */
	union xfs_btree_key	key;	/* btree key */
	union xfs_btree_ptr	nptr;	/* new block ptr */
	struct xfs_btree_cur	*ncur;	/* new btree cursor */
	union xfs_btree_rec	nrec;	/* record for a block split off here */
	int			optr;	/* old key/record index */
	int			ptr;	/* key/record index */
	int			numrecs;/* number of records */
	int			error;	/* error return value */
#ifdef DEBUG
	int			i;
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);

	ncur = NULL;

	/*
	 * If we have an external root pointer, and we've made it to the
	 * root level, allocate a new root block and we're done.
	 */
	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    (level >= cur->bc_nlevels)) {
		error = xfs_btree_new_root(cur, stat);
		xfs_btree_set_ptr_null(cur, ptrp);

		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		return error;
	}

	/* If we're off the left edge, return failure. */
	ptr = cur->bc_ptrs[level];
	if (ptr == 0) {
		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
		*stat = 0;
		return 0;
	}

	/* Make a key out of the record data to be inserted, and save it. */
	cur->bc_ops->init_key_from_rec(&key, recp);

	/* Remember the original index; needed for the updkey check below. */
	optr = ptr;

	XFS_BTREE_STATS_INC(cur, insrec);

	/* Get pointers to the btree buffer and block. */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		goto error0;

	/* Check that the new entry is being inserted in the right place. */
	if (ptr <= numrecs) {
		if (level == 0) {
			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
				xfs_btree_rec_addr(cur, ptr, block)));
		} else {
			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
				xfs_btree_key_addr(cur, ptr, block)));
		}
	}
#endif

	/*
	 * If the block is full, we can't insert the new entry until we
	 * make the block un-full.
	 */
	xfs_btree_set_ptr_null(cur, &nptr);
	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
		error = xfs_btree_make_block_unfull(cur, level, numrecs,
					&optr, &ptr, &nptr, &ncur, &nrec, stat);
		if (error || *stat == 0)
			goto error0;
	}

	/*
	 * The current block may have changed if the block was
	 * previously full and we have just made space in it.
	 */
	block = xfs_btree_get_block(cur, level, &bp);
	numrecs = xfs_btree_get_numrecs(block);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, level, bp);
	if (error)
		return error;
#endif

	/*
	 * At this point we know there's room for our new entry in the block
	 * we're pointing at.
	 */
	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);

	if (level > 0) {
		/* It's a nonleaf. make a hole in the keys and ptrs */
		union xfs_btree_key	*kp;
		union xfs_btree_ptr	*pp;

		kp = xfs_btree_key_addr(cur, ptr, block);
		pp = xfs_btree_ptr_addr(cur, ptr, block);

#ifdef DEBUG
		/* Validate the child pointers that are about to be shifted. */
		for (i = numrecs - ptr; i >= 0; i--) {
			error = xfs_btree_check_ptr(cur, pp, i, level);
			if (error)
				return error;
		}
#endif

		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);

#ifdef DEBUG
		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
		if (error)
			goto error0;
#endif

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_keys(cur, kp, &key, 1);
		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
		numrecs++;
		xfs_btree_set_numrecs(block, numrecs);
		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
		xfs_btree_log_keys(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
				xfs_btree_key_addr(cur, ptr + 1, block)));
		}
#endif
	} else {
		/* It's a leaf. make a hole in the records */
		union xfs_btree_rec             *rp;

		rp = xfs_btree_rec_addr(cur, ptr, block);

		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);

		/* Now put the new data in, bump numrecs and log it. */
		xfs_btree_copy_recs(cur, rp, recp, 1);
		xfs_btree_set_numrecs(block, ++numrecs);
		xfs_btree_log_recs(cur, bp, ptr, numrecs);
#ifdef DEBUG
		if (ptr < numrecs) {
			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
				xfs_btree_rec_addr(cur, ptr + 1, block)));
		}
#endif
	}

	/* Log the new number of records in the btree header. */
	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);

	/* If we inserted at the start of a block, update the parents' keys. */
	if (optr == 1) {
		error = xfs_btree_updkey(cur, &key, level + 1);
		if (error)
			goto error0;
	}

	/*
	 * If we are tracking the last record in the tree and
	 * we are at the far right edge of the tree, update it.
	 */
	if (xfs_btree_is_lastrec(cur, block, level)) {
		cur->bc_ops->update_lastrec(cur, block, recp,
					    ptr, LASTREC_INSREC);
	}

	/*
	 * Return the new block number, if any.
	 * If there is one, give back a record value and a cursor too.
	 */
	*ptrp = nptr;
	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
		*recp = nrec;
		*curp = ncur;
	}

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = 1;
	return 0;

error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
2870
/*
 * Insert the record at the point referenced by cur.
 *
 * A multi-level split of the tree on insert will invalidate the original
 * cursor.  All callers of this function should assume that the cursor is
 * no longer valid and revalidate it.
 */
int
xfs_btree_insert(
	struct xfs_btree_cur	*cur,
	int			*stat)	/* success/failure */
{
	int			error;	/* error return value */
	int			i;	/* result value, 0 for failure */
	int			level;	/* current level number in btree */
	union xfs_btree_ptr	nptr;	/* new block number (split result) */
	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
	union xfs_btree_rec	rec;	/* record to insert */

	level = 0;
	ncur = NULL;
	pcur = cur;

	xfs_btree_set_ptr_null(cur, &nptr);
	/* Build the record to insert from the cursor's current state. */
	cur->bc_ops->init_rec_from_cur(cur, &rec);

	/*
	 * Loop going up the tree, starting at the leaf level.
	 * Stop when we don't get a split block, that must mean that
	 * the insert is finished with this level.
	 */
	do {
		/*
		 * Insert nrec/nptr into this level of the tree.
		 * Note if we fail, nptr will be null.
		 */
		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
		if (error) {
			/* Never free the caller's cursor, only our copies. */
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			goto error0;
		}

		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
		level++;

		/*
		 * See if the cursor we just used is trash.
		 * Can't trash the caller's cursor, but otherwise we should
		 * if ncur is a new cursor or we're about to be done.
		 */
		if (pcur != cur &&
		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
			/* Save the state from the cursor before we trash it */
			if (cur->bc_ops->update_cursor)
				cur->bc_ops->update_cursor(pcur, cur);
			cur->bc_nlevels = pcur->bc_nlevels;
			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
		}
		/* If we got a new cursor, switch to it. */
		if (ncur) {
			pcur = ncur;
			ncur = NULL;
		}
	} while (!xfs_btree_ptr_is_null(cur, &nptr));

	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	*stat = i;
	return 0;
error0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
	return error;
}
2945
/*
 * Try to merge a non-leaf block back into the inode root.
 *
 * Note: the killroot name comes from the fact that we're effectively
 * killing the old root block.  But because we can't just delete the
 * inode we have to copy the single block it was pointing to into the
 * inode.
 */
int
xfs_btree_kill_iroot(
	struct xfs_btree_cur	*cur)
{
	int			whichfork = cur->bc_private.b.whichfork;
	struct xfs_inode	*ip = cur->bc_private.b.ip;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_btree_block	*block;		/* in-inode root block */
	struct xfs_btree_block	*cblock;	/* only child of the root */
	union xfs_btree_key	*kp;
	union xfs_btree_key	*ckp;
	union xfs_btree_ptr	*pp;
	union xfs_btree_ptr	*cpp;
	struct xfs_buf		*cbp;
	int			level;
	int			index;
	int			numrecs;
#ifdef DEBUG
	union xfs_btree_ptr	ptr;
	int			i;
#endif

	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);

	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
	ASSERT(cur->bc_nlevels > 1);

	/*
	 * Don't deal with the root block needs to be a leaf case.
	 * We're just going to turn the thing back into extents anyway.
	 */
	level = cur->bc_nlevels - 1;
	if (level == 1)
		goto out0;

	/*
	 * Give up if the root has multiple children.
	 */
	block = xfs_btree_get_iroot(cur);
	if (xfs_btree_get_numrecs(block) != 1)
		goto out0;

	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
	numrecs = xfs_btree_get_numrecs(cblock);

	/*
	 * Only do this if the next level will fit.
	 * Then the data must be copied up to the inode,
	 * instead of freeing the root you free the next level.
	 */
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	/* An in-inode root must not have siblings. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	/* Resize the root fork so the child's entries fit into it. */
	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
	if (index) {
		xfs_iroot_realloc(cur->bc_private.b.ip, index,
				  cur->bc_private.b.whichfork);
		/* if_broot may have moved during the realloc. */
		block = ifp->if_broot;
	}

	be16_add_cpu(&block->bb_numrecs, index);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);

	/* Copy the child's keys and pointers up into the root. */
	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, kp, ckp, numrecs);

	pp = xfs_btree_ptr_addr(cur, 1, block);
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
#ifdef DEBUG
	for (i = 0; i < numrecs; i++) {
		int		error;

		error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
		if (error) {
			XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
			return error;
		}
	}
#endif
	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);

	/* The child block is now redundant; free it. */
	cur->bc_ops->free_block(cur, cbp);
	XFS_BTREE_STATS_INC(cur, free);

	/* The tree lost one level; log the inode core and broot. */
	cur->bc_bufs[level - 1] = NULL;
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
	cur->bc_nlevels--;
out0:
	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
	return 0;
}
3057
3058STATIC int
3059xfs_btree_dec_cursor(
3060 struct xfs_btree_cur *cur,
3061 int level,
3062 int *stat)
3063{
3064 int error;
3065 int i;
3066
3067 if (level > 0) {
3068 error = xfs_btree_decrement(cur, level, &i);
3069 if (error)
3070 return error;
3071 }
3072
3073 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3074 *stat = 1;
3075 return 0;
3076}
3077
3078/*
3079 * Single level of the btree record deletion routine.
3080 * Delete record pointed to by cur/level.
3081 * Remove the record from its block then rebalance the tree.
3082 * Return 0 for error, 1 for done, 2 to go on to the next level.
3083 */
3084STATIC int /* error */
3085xfs_btree_delrec(
3086 struct xfs_btree_cur *cur, /* btree cursor */
3087 int level, /* level removing record from */
3088 int *stat) /* fail/done/go-on */
3089{
3090 struct xfs_btree_block *block; /* btree block */
3091 union xfs_btree_ptr cptr; /* current block ptr */
3092 struct xfs_buf *bp; /* buffer for block */
3093 int error; /* error return value */
3094 int i; /* loop counter */
3095 union xfs_btree_key key; /* storage for keyp */
3096 union xfs_btree_key *keyp = &key; /* passed to the next level */
3097 union xfs_btree_ptr lptr; /* left sibling block ptr */
3098 struct xfs_buf *lbp; /* left buffer pointer */
3099 struct xfs_btree_block *left; /* left btree block */
3100 int lrecs = 0; /* left record count */
3101 int ptr; /* key/record index */
3102 union xfs_btree_ptr rptr; /* right sibling block ptr */
3103 struct xfs_buf *rbp; /* right buffer pointer */
3104 struct xfs_btree_block *right; /* right btree block */
3105 struct xfs_btree_block *rrblock; /* right-right btree block */
3106 struct xfs_buf *rrbp; /* right-right buffer pointer */
3107 int rrecs = 0; /* right record count */
3108 struct xfs_btree_cur *tcur; /* temporary btree cursor */
3109 int numrecs; /* temporary numrec count */
3110
3111 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3112 XFS_BTREE_TRACE_ARGI(cur, level);
3113
3114 tcur = NULL;
3115
3116 /* Get the index of the entry being deleted, check for nothing there. */
3117 ptr = cur->bc_ptrs[level];
3118 if (ptr == 0) {
3119 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3120 *stat = 0;
3121 return 0;
3122 }
3123
3124 /* Get the buffer & block containing the record or key/ptr. */
3125 block = xfs_btree_get_block(cur, level, &bp);
3126 numrecs = xfs_btree_get_numrecs(block);
3127
3128#ifdef DEBUG
3129 error = xfs_btree_check_block(cur, block, level, bp);
3130 if (error)
3131 goto error0;
3132#endif
3133
3134 /* Fail if we're off the end of the block. */
3135 if (ptr > numrecs) {
3136 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3137 *stat = 0;
3138 return 0;
3139 }
3140
3141 XFS_BTREE_STATS_INC(cur, delrec);
3142 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3143
3144 /* Excise the entries being deleted. */
3145 if (level > 0) {
3146 /* It's a nonleaf. operate on keys and ptrs */
3147 union xfs_btree_key *lkp;
3148 union xfs_btree_ptr *lpp;
3149
3150 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3151 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3152
3153#ifdef DEBUG
3154 for (i = 0; i < numrecs - ptr; i++) {
3155 error = xfs_btree_check_ptr(cur, lpp, i, level);
3156 if (error)
3157 goto error0;
3158 }
3159#endif
3160
3161 if (ptr < numrecs) {
3162 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3163 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3164 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3165 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3166 }
3167
3168 /*
3169 * If it's the first record in the block, we'll need to pass a
3170 * key up to the next level (updkey).
3171 */
3172 if (ptr == 1)
3173 keyp = xfs_btree_key_addr(cur, 1, block);
3174 } else {
3175 /* It's a leaf. operate on records */
3176 if (ptr < numrecs) {
3177 xfs_btree_shift_recs(cur,
3178 xfs_btree_rec_addr(cur, ptr + 1, block),
3179 -1, numrecs - ptr);
3180 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3181 }
3182
3183 /*
3184 * If it's the first record in the block, we'll need a key
3185 * structure to pass up to the next level (updkey).
3186 */
3187 if (ptr == 1) {
3188 cur->bc_ops->init_key_from_rec(&key,
3189 xfs_btree_rec_addr(cur, 1, block));
3190 keyp = &key;
3191 }
3192 }
3193
3194 /*
3195 * Decrement and log the number of entries in the block.
3196 */
3197 xfs_btree_set_numrecs(block, --numrecs);
3198 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3199
3200 /*
3201 * If we are tracking the last record in the tree and
3202 * we are at the far right edge of the tree, update it.
3203 */
3204 if (xfs_btree_is_lastrec(cur, block, level)) {
3205 cur->bc_ops->update_lastrec(cur, block, NULL,
3206 ptr, LASTREC_DELREC);
3207 }
3208
3209 /*
3210 * We're at the root level. First, shrink the root block in-memory.
3211 * Try to get rid of the next level down. If we can't then there's
3212 * nothing left to do.
3213 */
3214 if (level == cur->bc_nlevels - 1) {
3215 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3216 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3217 cur->bc_private.b.whichfork);
3218
3219 error = xfs_btree_kill_iroot(cur);
3220 if (error)
3221 goto error0;
3222
3223 error = xfs_btree_dec_cursor(cur, level, stat);
3224 if (error)
3225 goto error0;
3226 *stat = 1;
3227 return 0;
3228 }
3229
3230 /*
3231 * If this is the root level, and there's only one entry left,
3232 * and it's NOT the leaf level, then we can get rid of this
3233 * level.
3234 */
3235 if (numrecs == 1 && level > 0) {
3236 union xfs_btree_ptr *pp;
3237 /*
3238 * pp is still set to the first pointer in the block.
3239 * Make it the new root of the btree.
3240 */
3241 pp = xfs_btree_ptr_addr(cur, 1, block);
3242 error = cur->bc_ops->kill_root(cur, bp, level, pp);
3243 if (error)
3244 goto error0;
3245 } else if (level > 0) {
3246 error = xfs_btree_dec_cursor(cur, level, stat);
3247 if (error)
3248 goto error0;
3249 }
3250 *stat = 1;
3251 return 0;
3252 }
3253
3254 /*
3255 * If we deleted the leftmost entry in the block, update the
3256 * key values above us in the tree.
3257 */
3258 if (ptr == 1) {
3259 error = xfs_btree_updkey(cur, keyp, level + 1);
3260 if (error)
3261 goto error0;
3262 }
3263
3264 /*
3265 * If the number of records remaining in the block is at least
3266 * the minimum, we're done.
3267 */
3268 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3269 error = xfs_btree_dec_cursor(cur, level, stat);
3270 if (error)
3271 goto error0;
3272 return 0;
3273 }
3274
3275 /*
3276 * Otherwise, we have to move some records around to keep the
3277 * tree balanced. Look at the left and right sibling blocks to
3278 * see if we can re-balance by moving only one record.
3279 */
3280 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3281 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3282
3283 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3284 /*
3285 * One child of root, need to get a chance to copy its contents
3286 * into the root and delete it. Can't go up to next level,
3287 * there's nothing to delete there.
3288 */
3289 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3290 xfs_btree_ptr_is_null(cur, &lptr) &&
3291 level == cur->bc_nlevels - 2) {
3292 error = xfs_btree_kill_iroot(cur);
3293 if (!error)
3294 error = xfs_btree_dec_cursor(cur, level, stat);
3295 if (error)
3296 goto error0;
3297 return 0;
3298 }
3299 }
3300
3301 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3302 !xfs_btree_ptr_is_null(cur, &lptr));
3303
3304 /*
3305 * Duplicate the cursor so our btree manipulations here won't
3306 * disrupt the next level up.
3307 */
3308 error = xfs_btree_dup_cursor(cur, &tcur);
3309 if (error)
3310 goto error0;
3311
3312 /*
3313 * If there's a right sibling, see if it's ok to shift an entry
3314 * out of it.
3315 */
3316 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3317 /*
3318 * Move the temp cursor to the last entry in the next block.
3319 * Actually any entry but the first would suffice.
3320 */
3321 i = xfs_btree_lastrec(tcur, level);
3322 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3323
3324 error = xfs_btree_increment(tcur, level, &i);
3325 if (error)
3326 goto error0;
3327 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3328
3329 i = xfs_btree_lastrec(tcur, level);
3330 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3331
3332 /* Grab a pointer to the block. */
3333 right = xfs_btree_get_block(tcur, level, &rbp);
3334#ifdef DEBUG
3335 error = xfs_btree_check_block(tcur, right, level, rbp);
3336 if (error)
3337 goto error0;
3338#endif
3339 /* Grab the current block number, for future use. */
3340 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3341
3342 /*
3343 * If right block is full enough so that removing one entry
3344 * won't make it too empty, and left-shifting an entry out
3345 * of right to us works, we're done.
3346 */
3347 if (xfs_btree_get_numrecs(right) - 1 >=
3348 cur->bc_ops->get_minrecs(tcur, level)) {
3349 error = xfs_btree_lshift(tcur, level, &i);
3350 if (error)
3351 goto error0;
3352 if (i) {
3353 ASSERT(xfs_btree_get_numrecs(block) >=
3354 cur->bc_ops->get_minrecs(tcur, level));
3355
3356 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3357 tcur = NULL;
3358
3359 error = xfs_btree_dec_cursor(cur, level, stat);
3360 if (error)
3361 goto error0;
3362 return 0;
3363 }
3364 }
3365
3366 /*
3367 * Otherwise, grab the number of records in right for
3368 * future reference, and fix up the temp cursor to point
3369 * to our block again (last record).
3370 */
3371 rrecs = xfs_btree_get_numrecs(right);
3372 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3373 i = xfs_btree_firstrec(tcur, level);
3374 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3375
3376 error = xfs_btree_decrement(tcur, level, &i);
3377 if (error)
3378 goto error0;
3379 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3380 }
3381 }
3382
3383 /*
3384 * If there's a left sibling, see if it's ok to shift an entry
3385 * out of it.
3386 */
3387 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3388 /*
3389 * Move the temp cursor to the first entry in the
3390 * previous block.
3391 */
3392 i = xfs_btree_firstrec(tcur, level);
3393 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3394
3395 error = xfs_btree_decrement(tcur, level, &i);
3396 if (error)
3397 goto error0;
3398 i = xfs_btree_firstrec(tcur, level);
3399 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3400
3401 /* Grab a pointer to the block. */
3402 left = xfs_btree_get_block(tcur, level, &lbp);
3403#ifdef DEBUG
3404 error = xfs_btree_check_block(cur, left, level, lbp);
3405 if (error)
3406 goto error0;
3407#endif
3408 /* Grab the current block number, for future use. */
3409 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3410
3411 /*
3412 * If left block is full enough so that removing one entry
3413 * won't make it too empty, and right-shifting an entry out
3414 * of left to us works, we're done.
3415 */
3416 if (xfs_btree_get_numrecs(left) - 1 >=
3417 cur->bc_ops->get_minrecs(tcur, level)) {
3418 error = xfs_btree_rshift(tcur, level, &i);
3419 if (error)
3420 goto error0;
3421 if (i) {
3422 ASSERT(xfs_btree_get_numrecs(block) >=
3423 cur->bc_ops->get_minrecs(tcur, level));
3424 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3425 tcur = NULL;
3426 if (level == 0)
3427 cur->bc_ptrs[0]++;
3428 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3429 *stat = 1;
3430 return 0;
3431 }
3432 }
3433
3434 /*
3435 * Otherwise, grab the number of records in right for
3436 * future reference.
3437 */
3438 lrecs = xfs_btree_get_numrecs(left);
3439 }
3440
3441 /* Delete the temp cursor, we're done with it. */
3442 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3443 tcur = NULL;
3444
3445 /* If here, we need to do a join to keep the tree balanced. */
3446 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3447
3448 if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3449 lrecs + xfs_btree_get_numrecs(block) <=
3450 cur->bc_ops->get_maxrecs(cur, level)) {
3451 /*
3452 * Set "right" to be the starting block,
3453 * "left" to be the left neighbor.
3454 */
3455 rptr = cptr;
3456 right = block;
3457 rbp = bp;
3458 error = xfs_btree_read_buf_block(cur, &lptr, level,
3459 0, &left, &lbp);
3460 if (error)
3461 goto error0;
3462
3463 /*
3464 * If that won't work, see if we can join with the right neighbor block.
3465 */
3466 } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3467 rrecs + xfs_btree_get_numrecs(block) <=
3468 cur->bc_ops->get_maxrecs(cur, level)) {
3469 /*
3470 * Set "left" to be the starting block,
3471 * "right" to be the right neighbor.
3472 */
3473 lptr = cptr;
3474 left = block;
3475 lbp = bp;
3476 error = xfs_btree_read_buf_block(cur, &rptr, level,
3477 0, &right, &rbp);
3478 if (error)
3479 goto error0;
3480
3481 /*
3482 * Otherwise, we can't fix the imbalance.
3483 * Just return. This is probably a logic error, but it's not fatal.
3484 */
3485 } else {
3486 error = xfs_btree_dec_cursor(cur, level, stat);
3487 if (error)
3488 goto error0;
3489 return 0;
3490 }
3491
3492 rrecs = xfs_btree_get_numrecs(right);
3493 lrecs = xfs_btree_get_numrecs(left);
3494
3495 /*
3496 * We're now going to join "left" and "right" by moving all the stuff
3497 * in "right" to "left" and deleting "right".
3498 */
3499 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3500 if (level > 0) {
3501 /* It's a non-leaf. Move keys and pointers. */
3502 union xfs_btree_key *lkp; /* left btree key */
3503 union xfs_btree_ptr *lpp; /* left address pointer */
3504 union xfs_btree_key *rkp; /* right btree key */
3505 union xfs_btree_ptr *rpp; /* right address pointer */
3506
3507 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3508 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3509 rkp = xfs_btree_key_addr(cur, 1, right);
3510 rpp = xfs_btree_ptr_addr(cur, 1, right);
3511#ifdef DEBUG
3512 for (i = 1; i < rrecs; i++) {
3513 error = xfs_btree_check_ptr(cur, rpp, i, level);
3514 if (error)
3515 goto error0;
3516 }
3517#endif
3518 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3519 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3520
3521 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3522 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3523 } else {
3524 /* It's a leaf. Move records. */
3525 union xfs_btree_rec *lrp; /* left record pointer */
3526 union xfs_btree_rec *rrp; /* right record pointer */
3527
3528 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3529 rrp = xfs_btree_rec_addr(cur, 1, right);
3530
3531 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3532 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3533 }
3534
3535 XFS_BTREE_STATS_INC(cur, join);
3536
3537 /*
 3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it.
3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
3542 xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3543 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3544 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3545
3546 /* If there is a right sibling, point it to the remaining block. */
3547 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3548 if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3549 error = xfs_btree_read_buf_block(cur, &cptr, level,
3550 0, &rrblock, &rrbp);
3551 if (error)
3552 goto error0;
3553 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3554 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3555 }
3556
3557 /* Free the deleted block. */
3558 error = cur->bc_ops->free_block(cur, rbp);
3559 if (error)
3560 goto error0;
3561 XFS_BTREE_STATS_INC(cur, free);
3562
3563 /*
3564 * If we joined with the left neighbor, set the buffer in the
3565 * cursor to the left block, and fix up the index.
3566 */
3567 if (bp != lbp) {
3568 cur->bc_bufs[level] = lbp;
3569 cur->bc_ptrs[level] += lrecs;
3570 cur->bc_ra[level] = 0;
3571 }
3572 /*
3573 * If we joined with the right neighbor and there's a level above
3574 * us, increment the cursor at that level.
3575 */
3576 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3577 (level + 1 < cur->bc_nlevels)) {
3578 error = xfs_btree_increment(cur, level + 1, &i);
3579 if (error)
3580 goto error0;
3581 }
3582
3583 /*
3584 * Readjust the ptr at this level if it's not a leaf, since it's
3585 * still pointing at the deletion point, which makes the cursor
3586 * inconsistent. If this makes the ptr 0, the caller fixes it up.
3587 * We can't use decrement because it would change the next level up.
3588 */
3589 if (level > 0)
3590 cur->bc_ptrs[level]--;
3591
3592 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3593 /* Return value means the next level up has something to do. */
3594 *stat = 2;
3595 return 0;
3596
3597error0:
3598 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3599 if (tcur)
3600 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3601 return error;
3602}
3603
3604/*
3605 * Delete the record pointed to by cur.
3606 * The cursor refers to the place where the record was (could be inserted)
3607 * when the operation returns.
3608 */
3609int /* error */
3610xfs_btree_delete(
3611 struct xfs_btree_cur *cur,
3612 int *stat) /* success/failure */
3613{
3614 int error; /* error return value */
3615 int level;
3616 int i;
3617
3618 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3619
3620 /*
3621 * Go up the tree, starting at leaf level.
3622 *
3623 * If 2 is returned then a join was done; go to the next level.
3624 * Otherwise we are done.
3625 */
3626 for (level = 0, i = 2; i == 2; level++) {
3627 error = xfs_btree_delrec(cur, level, &i);
3628 if (error)
3629 goto error0;
3630 }
3631
3632 if (i == 0) {
3633 for (level = 1; level < cur->bc_nlevels; level++) {
3634 if (cur->bc_ptrs[level] == 0) {
3635 error = xfs_btree_decrement(cur, level, &i);
3636 if (error)
3637 goto error0;
3638 break;
3639 }
3640 }
3641 }
3642
3643 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3644 *stat = i;
3645 return 0;
3646error0:
3647 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3648 return error;
3649}
3650
3651/*
3652 * Get the data from the pointed-to record.
3653 */
3654int /* error */
3655xfs_btree_get_rec(
3656 struct xfs_btree_cur *cur, /* btree cursor */
3657 union xfs_btree_rec **recp, /* output: btree record */
3658 int *stat) /* output: success/failure */
3659{
3660 struct xfs_btree_block *block; /* btree block */
3661 struct xfs_buf *bp; /* buffer pointer */
3662 int ptr; /* record number */
3663#ifdef DEBUG
3664 int error; /* error return value */
3665#endif
3666
3667 ptr = cur->bc_ptrs[0];
3668 block = xfs_btree_get_block(cur, 0, &bp);
3669
3670#ifdef DEBUG
3671 error = xfs_btree_check_block(cur, block, 0, bp);
3672 if (error)
3673 return error;
3674#endif
3675
3676 /*
3677 * Off the right end or left end, return failure.
3678 */
3679 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3680 *stat = 0;
3681 return 0;
3682 }
3683
3684 /*
3685 * Point to the record and extract its data.
3686 */
3687 *recp = xfs_btree_rec_addr(cur, ptr, block);
3688 *stat = 1;
3689 return 0;
3690}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 1f528a2a375..789fffdf8b2 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone;
39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) 39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
40 40
41/* 41/*
42 * Short form header: space allocation btrees. 42 * Generic btree header.
43 */ 43 *
 44typedef struct xfs_btree_sblock { 44 * This is a combination of the actual format used on disk for short and
45 __be32 bb_magic; /* magic number for block type */ 45 * format btrees. The first three fields are shared by both format, but
46 __be16 bb_level; /* 0 is a leaf */ 46 * the pointers are different and should be used with care.
47 __be16 bb_numrecs; /* current # of data records */ 47 *
48 __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ 48 * To get the size of the actual short or long form headers please use
49 __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ 49 * the size macros below. Never use sizeof(xfs_btree_block).
50} xfs_btree_sblock_t;
51
52/*
53 * Long form header: bmap btrees.
54 */
55typedef struct xfs_btree_lblock {
56 __be32 bb_magic; /* magic number for block type */
57 __be16 bb_level; /* 0 is a leaf */
58 __be16 bb_numrecs; /* current # of data records */
59 __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
60 __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
61} xfs_btree_lblock_t;
62
63/*
64 * Combined header and structure, used by common code.
65 */ 50 */
66typedef struct xfs_btree_hdr 51struct xfs_btree_block {
67{
68 __be32 bb_magic; /* magic number for block type */ 52 __be32 bb_magic; /* magic number for block type */
69 __be16 bb_level; /* 0 is a leaf */ 53 __be16 bb_level; /* 0 is a leaf */
70 __be16 bb_numrecs; /* current # of data records */ 54 __be16 bb_numrecs; /* current # of data records */
71} xfs_btree_hdr_t;
72
73typedef struct xfs_btree_block {
74 xfs_btree_hdr_t bb_h; /* header */
75 union { 55 union {
76 struct { 56 struct {
77 __be32 bb_leftsib; 57 __be32 bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
82 __be64 bb_rightsib; 62 __be64 bb_rightsib;
83 } l; /* long form pointers */ 63 } l; /* long form pointers */
84 } bb_u; /* rest */ 64 } bb_u; /* rest */
85} xfs_btree_block_t; 65};
66
67#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
68#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
69
70
71/*
72 * Generic key, ptr and record wrapper structures.
73 *
74 * These are disk format structures, and are converted where necessary
75 * by the btree specific code that needs to interpret them.
76 */
77union xfs_btree_ptr {
78 __be32 s; /* short form ptr */
79 __be64 l; /* long form ptr */
80};
81
82union xfs_btree_key {
83 xfs_bmbt_key_t bmbt;
84 xfs_bmdr_key_t bmbr; /* bmbt root block */
85 xfs_alloc_key_t alloc;
86 xfs_inobt_key_t inobt;
87};
88
89union xfs_btree_rec {
90 xfs_bmbt_rec_t bmbt;
91 xfs_bmdr_rec_t bmbr; /* bmbt root block */
92 xfs_alloc_rec_t alloc;
93 xfs_inobt_rec_t inobt;
94};
86 95
87/* 96/*
88 * For logging record fields. 97 * For logging record fields.
@@ -96,46 +105,131 @@ typedef struct xfs_btree_block {
96#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 105#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
97 106
98/* 107/*
99 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
100 */
101#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
102
103/*
104 * Magic numbers for btree blocks. 108 * Magic numbers for btree blocks.
105 */ 109 */
106extern const __uint32_t xfs_magics[]; 110extern const __uint32_t xfs_magics[];
107 111
108/* 112/*
109 * Maximum and minimum records in a btree block. 113 * Generic stats interface
110 * Given block size, type prefix, and leaf flag (0 or 1). 114 */
111 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces 115#define __XFS_BTREE_STATS_INC(type, stat) \
112 * compiler warnings. 116 XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
113 */ 117#define XFS_BTREE_STATS_INC(cur, stat) \
114#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ 118do { \
115 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ 119 switch (cur->bc_btnum) { \
116 (((lf) * (uint)sizeof(t ## _rec_t)) + \ 120 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
117 ((1 - (lf)) * \ 121 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
118 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) 122 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
119#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ 123 case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
 120 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) 124 case XFS_BTNUM_MAX: ASSERT(0); /* quiet gcc */ ; break; \
121 125 } \
122/* 126} while (0)
123 * Record, key, and pointer address calculation macros. 127
124 * Given block size, type prefix, block pointer, and index of requested entry 128#define __XFS_BTREE_STATS_ADD(type, stat, val) \
125 * (first entry numbered 1). 129 XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
126 */ 130#define XFS_BTREE_STATS_ADD(cur, stat, val) \
127#define XFS_BTREE_REC_ADDR(t,bb,i) \ 131do { \
128 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 132 switch (cur->bc_btnum) { \
129 ((i) - 1) * sizeof(t ## _rec_t))) 133 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
130#define XFS_BTREE_KEY_ADDR(t,bb,i) \ 134 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
131 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 135 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
132 ((i) - 1) * sizeof(t ## _key_t))) 136 case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
133#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ 137 case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
 134 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 138 case XFS_BTNUM_MAX: ASSERT(0); /* quiet gcc */ ; break; \
135 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) 139} while (0)
136 140
137#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ 141#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
138 142
143struct xfs_btree_ops {
144 /* size of the key and record structures */
145 size_t key_len;
146 size_t rec_len;
147
148 /* cursor operations */
149 struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
150 void (*update_cursor)(struct xfs_btree_cur *src,
151 struct xfs_btree_cur *dst);
152
153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158
159 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur,
161 union xfs_btree_ptr *start_bno,
162 union xfs_btree_ptr *new_bno,
163 int length, int *stat);
164 int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
165
166 /* update last record information */
167 void (*update_lastrec)(struct xfs_btree_cur *cur,
168 struct xfs_btree_block *block,
169 union xfs_btree_rec *rec,
170 int ptr, int reason);
171
172 /* records in block/level */
173 int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
174 int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
175
176 /* records on disk. Matter for the root in inode case. */
177 int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
178
179 /* init values of btree structures */
180 void (*init_key_from_rec)(union xfs_btree_key *key,
181 union xfs_btree_rec *rec);
182 void (*init_rec_from_key)(union xfs_btree_key *key,
183 union xfs_btree_rec *rec);
184 void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
185 union xfs_btree_rec *rec);
186 void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
187 union xfs_btree_ptr *ptr);
188
189 /* difference between key value and cursor value */
190 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
191 union xfs_btree_key *key);
192
193#ifdef DEBUG
194 /* check that k1 is lower than k2 */
195 int (*keys_inorder)(struct xfs_btree_cur *cur,
196 union xfs_btree_key *k1,
197 union xfs_btree_key *k2);
198
199 /* check that r1 is lower than r2 */
200 int (*recs_inorder)(struct xfs_btree_cur *cur,
201 union xfs_btree_rec *r1,
202 union xfs_btree_rec *r2);
203#endif
204
205 /* btree tracing */
206#ifdef XFS_BTREE_TRACE
207 void (*trace_enter)(struct xfs_btree_cur *, const char *,
208 char *, int, int, __psunsigned_t,
209 __psunsigned_t, __psunsigned_t,
210 __psunsigned_t, __psunsigned_t,
211 __psunsigned_t, __psunsigned_t,
212 __psunsigned_t, __psunsigned_t,
213 __psunsigned_t, __psunsigned_t);
214 void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
215 __uint64_t *, __uint64_t *);
216 void (*trace_key)(struct xfs_btree_cur *,
217 union xfs_btree_key *, __uint64_t *,
218 __uint64_t *);
219 void (*trace_record)(struct xfs_btree_cur *,
220 union xfs_btree_rec *, __uint64_t *,
221 __uint64_t *, __uint64_t *);
222#endif
223};
224
225/*
226 * Reasons for the update_lastrec method to be called.
227 */
228#define LASTREC_UPDATE 0
229#define LASTREC_INSREC 1
230#define LASTREC_DELREC 2
231
232
139/* 233/*
140 * Btree cursor structure. 234 * Btree cursor structure.
141 * This collects all information needed by the btree code in one place. 235 * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
144{ 238{
145 struct xfs_trans *bc_tp; /* transaction we're in, if any */ 239 struct xfs_trans *bc_tp; /* transaction we're in, if any */
146 struct xfs_mount *bc_mp; /* file system mount struct */ 240 struct xfs_mount *bc_mp; /* file system mount struct */
241 const struct xfs_btree_ops *bc_ops;
242 uint bc_flags; /* btree features - below */
147 union { 243 union {
148 xfs_alloc_rec_incore_t a; 244 xfs_alloc_rec_incore_t a;
149 xfs_bmbt_irec_t b; 245 xfs_bmbt_irec_t b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
175 } bc_private; /* per-btree type data */ 271 } bc_private; /* per-btree type data */
176} xfs_btree_cur_t; 272} xfs_btree_cur_t;
177 273
274/* cursor flags */
275#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
276#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
277#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
278
279
178#define XFS_BTREE_NOERROR 0 280#define XFS_BTREE_NOERROR 0
179#define XFS_BTREE_ERROR 1 281#define XFS_BTREE_ERROR 1
180 282
181/* 283/*
182 * Convert from buffer to btree block header. 284 * Convert from buffer to btree block header.
183 */ 285 */
184#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) 286#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
185#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
186#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
187 287
188 288
189#ifdef __KERNEL__
190
191#ifdef DEBUG
192/* 289/*
193 * Debug routine: check that block header is ok. 290 * Check that block header is ok.
194 */ 291 */
195void 292int
196xfs_btree_check_block( 293xfs_btree_check_block(
197 xfs_btree_cur_t *cur, /* btree cursor */ 294 struct xfs_btree_cur *cur, /* btree cursor */
198 xfs_btree_block_t *block, /* generic btree block pointer */ 295 struct xfs_btree_block *block, /* generic btree block pointer */
199 int level, /* level of the btree block */
200 struct xfs_buf *bp); /* buffer containing block, if any */
201
202/*
203 * Debug routine: check that keys are in the right order.
204 */
205void
206xfs_btree_check_key(
207 xfs_btnum_t btnum, /* btree identifier */
208 void *ak1, /* pointer to left (lower) key */
209 void *ak2); /* pointer to right (higher) key */
210
211/*
212 * Debug routine: check that records are in the right order.
213 */
214void
215xfs_btree_check_rec(
216 xfs_btnum_t btnum, /* btree identifier */
217 void *ar1, /* pointer to left (lower) record */
218 void *ar2); /* pointer to right (higher) record */
219#else
220#define xfs_btree_check_block(a,b,c,d)
221#define xfs_btree_check_key(a,b,c)
222#define xfs_btree_check_rec(a,b,c)
223#endif /* DEBUG */
224
225/*
226 * Checking routine: check that long form block header is ok.
227 */
228int /* error (0 or EFSCORRUPTED) */
229xfs_btree_check_lblock(
230 xfs_btree_cur_t *cur, /* btree cursor */
231 xfs_btree_lblock_t *block, /* btree long form block pointer */
232 int level, /* level of the btree block */ 296 int level, /* level of the btree block */
233 struct xfs_buf *bp); /* buffer containing block, if any */ 297 struct xfs_buf *bp); /* buffer containing block, if any */
234 298
235/* 299/*
236 * Checking routine: check that (long) pointer is ok. 300 * Check that (long) pointer is ok.
237 */ 301 */
238int /* error (0 or EFSCORRUPTED) */ 302int /* error (0 or EFSCORRUPTED) */
239xfs_btree_check_lptr( 303xfs_btree_check_lptr(
240 xfs_btree_cur_t *cur, /* btree cursor */ 304 struct xfs_btree_cur *cur, /* btree cursor */
241 xfs_dfsbno_t ptr, /* btree block disk address */ 305 xfs_dfsbno_t ptr, /* btree block disk address */
242 int level); /* btree block level */ 306 int level); /* btree block level */
243 307
244#define xfs_btree_check_lptr_disk(cur, ptr, level) \
245 xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
246
247/*
248 * Checking routine: check that short form block header is ok.
249 */
250int /* error (0 or EFSCORRUPTED) */
251xfs_btree_check_sblock(
252 xfs_btree_cur_t *cur, /* btree cursor */
253 xfs_btree_sblock_t *block, /* btree short form block pointer */
254 int level, /* level of the btree block */
255 struct xfs_buf *bp); /* buffer containing block */
256
257/*
258 * Checking routine: check that (short) pointer is ok.
259 */
260int /* error (0 or EFSCORRUPTED) */
261xfs_btree_check_sptr(
262 xfs_btree_cur_t *cur, /* btree cursor */
263 xfs_agblock_t ptr, /* btree block disk address */
264 int level); /* btree block level */
265
266/* 308/*
267 * Delete the btree cursor. 309 * Delete the btree cursor.
268 */ 310 */
@@ -281,15 +323,6 @@ xfs_btree_dup_cursor(
281 xfs_btree_cur_t **ncur);/* output cursor */ 323 xfs_btree_cur_t **ncur);/* output cursor */
282 324
283/* 325/*
284 * Change the cursor to point to the first record in the current block
285 * at the given level. Other levels are unaffected.
286 */
287int /* success=1, failure=0 */
288xfs_btree_firstrec(
289 xfs_btree_cur_t *cur, /* btree cursor */
290 int level); /* level to change */
291
292/*
293 * Get a buffer for the block, return it with no data read. 326 * Get a buffer for the block, return it with no data read.
294 * Long-form addressing. 327 * Long-form addressing.
295 */ 328 */
@@ -313,20 +346,6 @@ xfs_btree_get_bufs(
313 uint lock); /* lock flags for get_buf */ 346 uint lock); /* lock flags for get_buf */
314 347
315/* 348/*
316 * Allocate a new btree cursor.
317 * The cursor is either for allocation (A) or bmap (B).
318 */
319xfs_btree_cur_t * /* new btree cursor */
320xfs_btree_init_cursor(
321 struct xfs_mount *mp, /* file system mount point */
322 struct xfs_trans *tp, /* transaction pointer */
323 struct xfs_buf *agbp, /* (A only) buffer for agf structure */
324 xfs_agnumber_t agno, /* (A only) allocation group number */
325 xfs_btnum_t btnum, /* btree identifier */
326 struct xfs_inode *ip, /* (B only) inode owning the btree */
327 int whichfork); /* (B only) data/attr fork */
328
329/*
330 * Check for the cursor referring to the last block at the given level. 349 * Check for the cursor referring to the last block at the given level.
331 */ 350 */
332int /* 1=is last block, 0=not last block */ 351int /* 1=is last block, 0=not last block */
@@ -335,15 +354,6 @@ xfs_btree_islastblock(
335 int level); /* level to check */ 354 int level); /* level to check */
336 355
337/* 356/*
338 * Change the cursor to point to the last record in the current block
339 * at the given level. Other levels are unaffected.
340 */
341int /* success=1, failure=0 */
342xfs_btree_lastrec(
343 xfs_btree_cur_t *cur, /* btree cursor */
344 int level); /* level to change */
345
346/*
347 * Compute first and last byte offsets for the fields given. 357 * Compute first and last byte offsets for the fields given.
348 * Interprets the offsets table, which contains struct field offsets. 358 * Interprets the offsets table, which contains struct field offsets.
349 */ 359 */
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
404 xfs_extlen_t count); /* count of filesystem blocks */ 414 xfs_extlen_t count); /* count of filesystem blocks */
405 415
406/* 416/*
407 * Read-ahead btree blocks, at the given level. 417 * Set the buffer for level "lev" in the cursor to bp, releasing
408 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 418 * any previous buffer.
409 */ 419 */
410int /* readahead block count */ 420void
411xfs_btree_readahead_core( 421xfs_btree_setbuf(
412 xfs_btree_cur_t *cur, /* btree cursor */ 422 xfs_btree_cur_t *cur, /* btree cursor */
413 int lev, /* level in btree */ 423 int lev, /* level in btree */
414 int lr); /* left/right bits */ 424 struct xfs_buf *bp); /* new buffer to set */
415 425
416static inline int /* readahead block count */
417xfs_btree_readahead(
418 xfs_btree_cur_t *cur, /* btree cursor */
419 int lev, /* level in btree */
420 int lr) /* left/right bits */
421{
422 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
423 return 0;
424 426
425 return xfs_btree_readahead_core(cur, lev, lr); 427/*
426} 428 * Common btree core entry points.
429 */
430int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
431int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
427 439
440/*
441 * Internal btree helpers also used by xfs_bmap.c.
442 */
443void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
444void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
428 445
429/* 446/*
430 * Set the buffer for level "lev" in the cursor to bp, releasing 447 * Helpers.
431 * any previous buffer.
432 */ 448 */
433void 449static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
434xfs_btree_setbuf( 450{
435 xfs_btree_cur_t *cur, /* btree cursor */ 451 return be16_to_cpu(block->bb_numrecs);
436 int lev, /* level in btree */ 452}
437 struct xfs_buf *bp); /* new buffer to set */ 453
454static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
455 __uint16_t numrecs)
456{
457 block->bb_numrecs = cpu_to_be16(numrecs);
458}
438 459
439#endif /* __KERNEL__ */ 460static inline int xfs_btree_get_level(struct xfs_btree_block *block)
461{
462 return be16_to_cpu(block->bb_level);
463}
440 464
441 465
442/* 466/*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 00000000000..44ff942a0fd
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_types.h"
20#include "xfs_inum.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_alloc_btree.h"
23#include "xfs_ialloc_btree.h"
24#include "xfs_inode.h"
25#include "xfs_btree.h"
26#include "xfs_btree_trace.h"
27
28STATIC void
29xfs_btree_trace_ptr(
30 struct xfs_btree_cur *cur,
31 union xfs_btree_ptr ptr,
32 __psunsigned_t *high,
33 __psunsigned_t *low)
34{
35 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
36 __u64 val = be64_to_cpu(ptr.l);
37 *high = val >> 32;
38 *low = (int)val;
39 } else {
40 *high = 0;
41 *low = be32_to_cpu(ptr.s);
42 }
43}
44
45/*
46 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
47 */
48void
49xfs_btree_trace_argbi(
50 const char *func,
51 struct xfs_btree_cur *cur,
52 struct xfs_buf *b,
53 int i,
54 int line)
55{
56 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
57 line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
58 0, 0, 0, 0);
59}
60
61/*
62 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
63 */
64void
65xfs_btree_trace_argbii(
66 const char *func,
67 struct xfs_btree_cur *cur,
68 struct xfs_buf *b,
69 int i0,
70 int i1,
71 int line)
72{
73 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
74 line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
75 0, 0, 0, 0);
76}
77
78/*
79 * Add a trace buffer entry for arguments, for 3 block-length args
80 * and an integer arg.
81 */
82void
83xfs_btree_trace_argfffi(
84 const char *func,
85 struct xfs_btree_cur *cur,
86 xfs_dfiloff_t o,
87 xfs_dfsbno_t b,
88 xfs_dfilblks_t i,
89 int j,
90 int line)
91{
92 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
93 line,
94 o >> 32, (int)o,
95 b >> 32, (int)b,
96 i >> 32, (int)i,
97 (int)j, 0, 0, 0, 0);
98}
99
100/*
101 * Add a trace buffer entry for arguments, for one integer arg.
102 */
103void
104xfs_btree_trace_argi(
105 const char *func,
106 struct xfs_btree_cur *cur,
107 int i,
108 int line)
109{
110 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
111 line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
112}
113
114/*
115 * Add a trace buffer entry for arguments, for int, fsblock, key.
116 */
117void
118xfs_btree_trace_argipk(
119 const char *func,
120 struct xfs_btree_cur *cur,
121 int i,
122 union xfs_btree_ptr ptr,
123 union xfs_btree_key *key,
124 int line)
125{
126 __psunsigned_t high, low;
127 __uint64_t l0, l1;
128
129 xfs_btree_trace_ptr(cur, ptr, &high, &low);
130 cur->bc_ops->trace_key(cur, key, &l0, &l1);
131 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
132 line, i, high, low,
133 l0 >> 32, (int)l0,
134 l1 >> 32, (int)l1,
135 0, 0, 0, 0);
136}
137
138/*
139 * Add a trace buffer entry for arguments, for int, fsblock, rec.
140 */
141void
142xfs_btree_trace_argipr(
143 const char *func,
144 struct xfs_btree_cur *cur,
145 int i,
146 union xfs_btree_ptr ptr,
147 union xfs_btree_rec *rec,
148 int line)
149{
150 __psunsigned_t high, low;
151 __uint64_t l0, l1, l2;
152
153 xfs_btree_trace_ptr(cur, ptr, &high, &low);
154 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
155 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
156 line, i,
157 high, low,
158 l0 >> 32, (int)l0,
159 l1 >> 32, (int)l1,
160 l2 >> 32, (int)l2,
161 0, 0);
162}
163
164/*
165 * Add a trace buffer entry for arguments, for int, key.
166 */
167void
168xfs_btree_trace_argik(
169 const char *func,
170 struct xfs_btree_cur *cur,
171 int i,
172 union xfs_btree_key *key,
173 int line)
174{
175 __uint64_t l0, l1;
176
177 cur->bc_ops->trace_key(cur, key, &l0, &l1);
178 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
179 line, i,
180 l0 >> 32, (int)l0,
181 l1 >> 32, (int)l1,
182 0, 0, 0, 0, 0, 0);
183}
184
185/*
186 * Add a trace buffer entry for arguments, for record.
187 */
188void
189xfs_btree_trace_argr(
190 const char *func,
191 struct xfs_btree_cur *cur,
192 union xfs_btree_rec *rec,
193 int line)
194{
195 __uint64_t l0, l1, l2;
196
197 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
198 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
199 line,
200 l0 >> 32, (int)l0,
201 l1 >> 32, (int)l1,
202 l2 >> 32, (int)l2,
203 0, 0, 0, 0, 0);
204}
205
206/*
207 * Add a trace buffer entry for the cursor/operation.
208 */
209void
210xfs_btree_trace_cursor(
211 const char *func,
212 struct xfs_btree_cur *cur,
213 int type,
214 int line)
215{
216 __uint32_t s0;
217 __uint64_t l0, l1;
218 char *s;
219
220 switch (type) {
221 case XBT_ARGS:
222 s = "args";
223 break;
224 case XBT_ENTRY:
225 s = "entry";
226 break;
227 case XBT_ERROR:
228 s = "error";
229 break;
230 case XBT_EXIT:
231 s = "exit";
232 break;
233 default:
234 s = "unknown";
235 break;
236 }
237
238 cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
239 cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
240 s0,
241 l0 >> 32, (int)l0,
242 l1 >> 32, (int)l1,
243 (__psunsigned_t)cur->bc_bufs[0],
244 (__psunsigned_t)cur->bc_bufs[1],
245 (__psunsigned_t)cur->bc_bufs[2],
246 (__psunsigned_t)cur->bc_bufs[3],
247 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
248 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
249}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 00000000000..b3f5eb3c3c6
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BTREE_TRACE_H__
19#define __XFS_BTREE_TRACE_H__
20
21struct xfs_btree_cur;
22struct xfs_buf;
23
24
25/*
26 * Trace hooks.
27 * i,j = integer (32 bit)
28 * b = btree block buffer (xfs_buf_t)
29 * p = btree ptr
30 * r = btree record
31 * k = btree key
32 */
33
34#ifdef XFS_BTREE_TRACE
35
36/*
37 * Trace buffer entry types.
38 */
39#define XFS_BTREE_KTRACE_ARGBI 1
40#define XFS_BTREE_KTRACE_ARGBII 2
41#define XFS_BTREE_KTRACE_ARGFFFI 3
42#define XFS_BTREE_KTRACE_ARGI 4
43#define XFS_BTREE_KTRACE_ARGIPK 5
44#define XFS_BTREE_KTRACE_ARGIPR 6
45#define XFS_BTREE_KTRACE_ARGIK 7
46#define XFS_BTREE_KTRACE_ARGR 8
47#define XFS_BTREE_KTRACE_CUR 9
48
49/*
50 * Sub-types for cursor traces.
51 */
52#define XBT_ARGS 0
53#define XBT_ENTRY 1
54#define XBT_ERROR 2
55#define XBT_EXIT 3
56
57void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
58 struct xfs_buf *, int, int);
59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
60 struct xfs_buf *, int, int, int);
61void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
62 xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
63void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
64void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
65 union xfs_btree_ptr, union xfs_btree_key *, int);
66void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
67 union xfs_btree_ptr, union xfs_btree_rec *, int);
68void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
69 union xfs_btree_key *, int);
70void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
71 union xfs_btree_rec *, int);
72void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
73
74
75#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
76extern ktrace_t *xfs_allocbt_trace_buf;
77
78#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
79extern ktrace_t *xfs_inobt_trace_buf;
80
81#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
82#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
83extern ktrace_t *xfs_bmbt_trace_buf;
84
85
86#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
87 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
88#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
89 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
90#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
91 xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
92#define XFS_BTREE_TRACE_ARGI(c, i) \
93 xfs_btree_trace_argi(__func__, c, i, __LINE__)
94#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
95 xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
96#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \
97 xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
98#define XFS_BTREE_TRACE_ARGIK(c, i, k) \
99 xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
100#define XFS_BTREE_TRACE_ARGR(c, r) \
101 xfs_btree_trace_argr(__func__, c, r, __LINE__)
102#define XFS_BTREE_TRACE_CURSOR(c, t) \
103 xfs_btree_trace_cursor(__func__, c, t, __LINE__)
104#else
105#define XFS_BTREE_TRACE_ARGBI(c, b, i)
106#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
107#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
108#define XFS_BTREE_TRACE_ARGI(c, i)
109#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
110#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
111#define XFS_BTREE_TRACE_ARGIK(c, i, k)
112#define XFS_BTREE_TRACE_ARGR(c, r)
113#define XFS_BTREE_TRACE_CURSOR(c, t)
114#endif /* XFS_BTREE_TRACE */
115
116#endif /* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 002fc2617c8..92af4098c7e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377{ 377{
378 xfs_mount_t *mp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 mp = bip->bli_item.li_mountp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_delete_ail() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_delete_ail() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&mp->m_ail_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
@@ -707,8 +707,8 @@ xfs_buf_item_init(
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail;
734 bip->bli_buf = bp; 735 bip->bli_buf = bp;
735 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
736 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
997 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
998 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
999 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1000 1001 xfs_biodone(bp);
1001 /*
1002 * XFS_SHUT flag gets set when we go thru the
1003 * entire buffer cache and deliberately start
1004 * throwing away delayed write buffers.
1005 * Since there's no biowait done on those,
1006 * we should just brelse them.
1007 */
1008 if (XFS_BUF_ISSHUT(bp)) {
1009 XFS_BUF_UNSHUT(bp);
1010 xfs_buf_relse(bp);
1011 } else {
1012 xfs_biodone(bp);
1013 }
1014
1015 return; 1002 return;
1016 } 1003 }
1017 1004
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
1122 xfs_buf_t *bp, 1109 xfs_buf_t *bp,
1123 xfs_buf_log_item_t *bip) 1110 xfs_buf_log_item_t *bip)
1124{ 1111{
1125 struct xfs_mount *mp; 1112 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1126 1113
1127 ASSERT(bip->bli_buf == bp); 1114 ASSERT(bip->bli_buf == bp);
1128 1115
1129 xfs_buf_rele(bp); 1116 xfs_buf_rele(bp);
1130 mp = bip->bli_item.li_mountp;
1131 1117
1132 /* 1118 /*
1133 * If we are forcibly shutting down, this may well be 1119 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1120 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1121 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on AIL because of the transaction was 1122 * have put this item on AIL because of the transaction was
1137 * aborted forcibly. xfs_trans_delete_ail() takes care of these. 1123 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1124 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1125 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1126 */
1141 spin_lock(&mp->m_ail_lock); 1127 spin_lock(&ailp->xa_lock);
1142 /* 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 * xfs_trans_delete_ail() drops the AIL lock.
1144 */
1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1146 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1147} 1130}
1148 1131
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index d2ce5dd70d8..00000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CLNT_H__
19#define __XFS_CLNT_H__
20
21/*
22 * XFS arguments structure, constructed from the arguments we
23 * are passed via the mount system call.
24 *
25 * NOTE: The mount system call is handled differently between
26 * Linux and IRIX. In IRIX we worked work with a binary data
27 * structure coming in across the syscall interface from user
28 * space (the mount userspace knows about each filesystem type
29 * and the set of valid options for it, and converts the users
30 * argument string into a binary structure _before_ making the
31 * system call), and the ABI issues that this implies.
32 *
33 * In Linux, we are passed a comma separated set of options;
34 * ie. a NULL terminated string of characters. Userspace mount
35 * code does not have any knowledge of mount options expected by
36 * each filesystem type and so each filesystem parses its mount
37 * options in kernel space.
38 *
39 * For the Linux port, we kept this structure pretty much intact
40 * and use it internally (because the existing code groks it).
41 */
42struct xfs_mount_args {
43 int flags; /* flags -> see XFSMNT_... macros below */
44 int flags2; /* flags -> see XFSMNT2_... macros below */
45 int logbufs; /* Number of log buffers, -1 to default */
46 int logbufsize; /* Size of log buffers, -1 to default */
47 char fsname[MAXNAMELEN+1]; /* data device name */
48 char rtname[MAXNAMELEN+1]; /* realtime device filename */
49 char logname[MAXNAMELEN+1]; /* journal device filename */
50 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
51 int sunit; /* stripe unit (BBs) */
52 int swidth; /* stripe width (BBs), multiple of sunit */
53 uchar_t iosizelog; /* log2 of the preferred I/O size */
54 int ihashsize; /* inode hash table size (buckets) */
55};
56
57/*
58 * XFS mount option flags -- args->flags1
59 */
60#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
62 * compatible */
63#define XFSMNT_INO64 0x00000004 /* move inode numbers up
64 * past 2^32 */
65#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
66#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
67#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
68 * enforcement */
69#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
70 * enforcement */
71#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */
72#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
73 * stripe boundaries*/
74#define XFSMNT_RETERR 0x00000400 /* return error to user */
75#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
76 * read-only mount */
77#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
78#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
79#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
80 /* (osyncisdsync is default) */
81#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
82#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
83 * bits of address space */
84#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
85#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
86 * enforcement */
87#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
88#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
89#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
90#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */
91#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
92 * allocation */
93#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
94 * symlink,mkdir,rmdir,mknod */
95#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
96
97/*
98 * XFS mount option flags -- args->flags2
99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
104
105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8be0b00ede9..70b710c1792 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; 72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
73typedef struct xfs_da_node_entry xfs_da_node_entry_t; 73typedef struct xfs_da_node_entry xfs_da_node_entry_t;
74 74
75#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
76
77#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize 75#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
78#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
79
80#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
81 (((bno) << (mp)->m_dircook_elog) | (entry))
82#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
83 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
84#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
85#define XFS_DA_COOKIE_BNO(mp,cookie) \
86 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
87 (xfs_dablk_t)0 : \
88 (xfs_dablk_t)((xfs_off_t)(cookie) >> \
89 ((mp)->m_dircook_elog + 32))))
90#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
91 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
92 (xfs_dablk_t)0 : \
93 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
94 ((1 << (mp)->m_dircook_elog) - 1))))
95
96 76
97/*======================================================================== 77/*========================================================================
98 * Btree searching and modification structure definitions. 78 * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
226}; 206};
227 207
228 208
229#ifdef __KERNEL__
230/*======================================================================== 209/*========================================================================
231 * Function prototypes for the kernel. 210 * Function prototypes.
232 *========================================================================*/ 211 *========================================================================*/
233 212
234/* 213/*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
289 268
290extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
291extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
292#endif /* __KERNEL__ */
293 271
294#endif /* __XFS_DA_BTREE_H__ */ 272#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 75b0cd4da0e..b4c1ee71349 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,9 +49,8 @@
49 */ 49 */
50int 50int
51xfs_swapext( 51xfs_swapext(
52 xfs_swapext_t __user *sxu) 52 xfs_swapext_t *sxp)
53{ 53{
54 xfs_swapext_t *sxp;
55 xfs_inode_t *ip, *tip; 54 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 55 struct file *file, *target_file;
57 int error = 0; 56 int error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
62 goto out; 61 goto out;
63 } 62 }
64 63
65 if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
66 error = XFS_ERROR(EFAULT);
67 goto out_free_sxp;
68 }
69
70 /* Pull information for the target fd */ 64 /* Pull information for the target fd */
71 file = fget((int)sxp->sx_fdtarget); 65 file = fget((int)sxp->sx_fdtarget);
72 if (!file) { 66 if (!file) {
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index da178205be6..4f55a630655 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
46/* 46/*
47 * Syscall interface for xfs_swapext 47 * Syscall interface for xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext __user *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, 51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp); 52 struct xfs_swapext *sxp);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c9065eaf2a4..162e8726df5 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,32 +18,29 @@
18#ifndef __XFS_DINODE_H__ 18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__ 19#define __XFS_DINODE_H__
20 20
21struct xfs_buf; 21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22struct xfs_mount; 22#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
23 23
24#define XFS_DINODE_VERSION_1 1
25#define XFS_DINODE_VERSION_2 2
26#define XFS_DINODE_GOOD_VERSION(v) \
27 (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
28#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
29
30/*
31 * Disk inode structure.
32 * This is just the header; the inode is expanded to fill a variable size
33 * with the last field expanding. It is split into the core and "other"
34 * because we only need the core part in the in-core inode.
35 */
36typedef struct xfs_timestamp { 24typedef struct xfs_timestamp {
37 __be32 t_sec; /* timestamp seconds */ 25 __be32 t_sec; /* timestamp seconds */
38 __be32 t_nsec; /* timestamp nanoseconds */ 26 __be32 t_nsec; /* timestamp nanoseconds */
39} xfs_timestamp_t; 27} xfs_timestamp_t;
40 28
41/* 29/*
42 * Note: Coordinate changes to this structure with the XFS_DI_* #defines 30 * On-disk inode structure.
43 * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode 31 *
44 * in xfs_inode.h. 32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute use the XFS_DFORK_PTR, XFS_DFORK_DPTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
45 */ 42 */
46typedef struct xfs_dinode_core { 43typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */ 45 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */ 46 __u8 di_version; /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
69 __be16 di_dmstate; /* DMIG state info */ 66 __be16 di_dmstate; /* DMIG state info */
70 __be16 di_flags; /* random flags, XFS_DIFLAG_... */ 67 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
71 __be32 di_gen; /* generation number */ 68 __be32 di_gen; /* generation number */
72} xfs_dinode_core_t;
73 69
74#define DI_MAX_FLUSH 0xffff 70 /* di_next_unlinked is the only non-core field in the old dinode */
71 __be32 di_next_unlinked;/* agi unlinked list ptr */
72} __attribute__((packed)) xfs_dinode_t;
75 73
76typedef struct xfs_dinode 74#define DI_MAX_FLUSH 0xffff
77{
78 xfs_dinode_core_t di_core;
79 /*
80 * In adding anything between the core and the union, be
81 * sure to update the macros like XFS_LITINO below and
82 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
83 */
84 __be32 di_next_unlinked;/* agi unlinked list ptr */
85 union {
86 xfs_bmdr_block_t di_bmbt; /* btree root block */
87 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
88 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
89 char di_c[1]; /* local contents */
90 __be32 di_dev; /* device for S_IFCHR/S_IFBLK */
91 uuid_t di_muuid; /* mount point value */
92 char di_symlink[1]; /* local symbolic link */
93 } di_u;
94 union {
95 xfs_bmdr_block_t di_abmbt; /* btree root block */
96 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
97 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
98 } di_a;
99} xfs_dinode_t;
100 75
101/* 76/*
102 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. 77 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -107,50 +82,14 @@ typedef struct xfs_dinode
107#define XFS_MAXLINK_1 65535U 82#define XFS_MAXLINK_1 65535U
108 83
109/* 84/*
110 * Bit names for logging disk inodes only
111 */
112#define XFS_DI_MAGIC 0x0000001
113#define XFS_DI_MODE 0x0000002
114#define XFS_DI_VERSION 0x0000004
115#define XFS_DI_FORMAT 0x0000008
116#define XFS_DI_ONLINK 0x0000010
117#define XFS_DI_UID 0x0000020
118#define XFS_DI_GID 0x0000040
119#define XFS_DI_NLINK 0x0000080
120#define XFS_DI_PROJID 0x0000100
121#define XFS_DI_PAD 0x0000200
122#define XFS_DI_ATIME 0x0000400
123#define XFS_DI_MTIME 0x0000800
124#define XFS_DI_CTIME 0x0001000
125#define XFS_DI_SIZE 0x0002000
126#define XFS_DI_NBLOCKS 0x0004000
127#define XFS_DI_EXTSIZE 0x0008000
128#define XFS_DI_NEXTENTS 0x0010000
129#define XFS_DI_NAEXTENTS 0x0020000
130#define XFS_DI_FORKOFF 0x0040000
131#define XFS_DI_AFORMAT 0x0080000
132#define XFS_DI_DMEVMASK 0x0100000
133#define XFS_DI_DMSTATE 0x0200000
134#define XFS_DI_FLAGS 0x0400000
135#define XFS_DI_GEN 0x0800000
136#define XFS_DI_NEXT_UNLINKED 0x1000000
137#define XFS_DI_U 0x2000000
138#define XFS_DI_A 0x4000000
139#define XFS_DI_NUM_BITS 27
140#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
141#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
142
143/*
144 * Values for di_format 85 * Values for di_format
145 */ 86 */
146typedef enum xfs_dinode_fmt 87typedef enum xfs_dinode_fmt {
147{ 88 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
148 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ 89 XFS_DINODE_FMT_LOCAL, /* bulk data */
149 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ 90 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
150 /* LNK: di_symlink */ 91 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
151 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ 92 XFS_DINODE_FMT_UUID /* uuid_t */
152 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
153 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
154} xfs_dinode_fmt_t; 93} xfs_dinode_fmt_t;
155 94
156/* 95/*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
166 */ 105 */
167#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) ((mp)->m_litino)
168#define XFS_BROOT_SIZE_ADJ \ 107#define XFS_BROOT_SIZE_ADJ \
169 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) 108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
170 109
171/* 110/*
172 * Inode data & attribute fork sizes, per inode. 111 * Inode data & attribute fork sizes, per inode.
173 */ 112 */
174#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) 113#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
175#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) 114#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
176 115
177#define XFS_DFORK_DSIZE(dip,mp) \ 116#define XFS_DFORK_DSIZE(dip,mp) \
178 (XFS_DFORK_Q(dip) ? \ 117 (XFS_DFORK_Q(dip) ? \
@@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt
187 XFS_DFORK_DSIZE(dip, mp) : \ 126 XFS_DFORK_DSIZE(dip, mp) : \
188 XFS_DFORK_ASIZE(dip, mp)) 127 XFS_DFORK_ASIZE(dip, mp))
189 128
190#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) 129/*
130 * Return pointers to the data or attribute forks.
131 */
132#define XFS_DFORK_DPTR(dip) \
133 ((char *)(dip) + sizeof(struct xfs_dinode))
191#define XFS_DFORK_APTR(dip) \ 134#define XFS_DFORK_APTR(dip) \
192 ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) 135 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
193#define XFS_DFORK_PTR(dip,w) \ 136#define XFS_DFORK_PTR(dip,w) \
194 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) 137 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
138
195#define XFS_DFORK_FORMAT(dip,w) \ 139#define XFS_DFORK_FORMAT(dip,w) \
196 ((w) == XFS_DATA_FORK ? \ 140 ((w) == XFS_DATA_FORK ? \
197 (dip)->di_core.di_format : \ 141 (dip)->di_format : \
198 (dip)->di_core.di_aformat) 142 (dip)->di_aformat)
199#define XFS_DFORK_NEXTENTS(dip,w) \ 143#define XFS_DFORK_NEXTENTS(dip,w) \
200 ((w) == XFS_DATA_FORK ? \ 144 ((w) == XFS_DATA_FORK ? \
201 be32_to_cpu((dip)->di_core.di_nextents) : \ 145 be32_to_cpu((dip)->di_nextents) : \
202 be16_to_cpu((dip)->di_core.di_anextents)) 146 be16_to_cpu((dip)->di_anextents))
203 147
204#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 148#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
205 149
206/* 150/*
151 * For block and character special files the 32bit dev_t is stored at the
152 * beginning of the data fork.
153 */
154static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
155{
156 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
157}
158
159static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
160{
161 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
162}
163
164/*
207 * Values for di_flags 165 * Values for di_flags
208 * There should be a one-to-one correspondence between these flags and the 166 * There should be a one-to-one correspondence between these flags and the
209 * XFS_XFLAG_s. 167 * XFS_XFLAG_s.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index deecc9d238f..6ac44b550d3 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -34,13 +34,6 @@ struct xfs_mount;
34struct xfs_trans; 34struct xfs_trans;
35 35
36/* 36/*
37 * Maximum size of a shortform directory.
38 */
39#define XFS_DIR2_SF_MAX_SIZE \
40 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
41 (uint)sizeof(xfs_agino_t))
42
43/*
44 * Inode number stored as 8 8-bit values. 37 * Inode number stored as 8 8-bit values.
45 */ 38 */
46typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; 39typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index a1e55fb9d5d..e71e2581c0c 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -25,7 +25,6 @@
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_clnt.h"
29 28
30 29
31static struct xfs_dmops xfs_dmcore_stub = { 30static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
38}; 37};
39 38
40int 39int
41xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 40xfs_dmops_get(struct xfs_mount *mp)
42{ 41{
43 if (args->flags & XFSMNT_DMAPI) { 42 if (mp->m_flags & XFS_MOUNT_DMAPI) {
44 cmn_err(CE_WARN, 43 cmn_err(CE_WARN,
45 "XFS: dmapi support not available in this kernel."); 44 "XFS: dmapi support not available in this kernel.");
46 return EINVAL; 45 return EINVAL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f227ecd1a29..92d5cd5bf4f 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
153} 153}
154#endif /* DEBUG */ 154#endif /* DEBUG */
155 155
156static void
157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
158{
159 if (mp != NULL) {
160 char *newfmt;
161 int len = 16 + mp->m_fsname_len + strlen(fmt);
162
163 newfmt = kmem_alloc(len, KM_SLEEP);
164 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
165 icmn_err(level, newfmt, ap);
166 kmem_free(newfmt);
167 } else {
168 icmn_err(level, fmt, ap);
169 }
170}
171 156
172void 157void
173xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) 158xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 11543f10b0c..0c93051c465 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
160 160
161struct xfs_mount; 161struct xfs_mount;
162/* PRINTFLIKE4 */ 162
163extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
164 char *fmt, va_list ap)
165 __attribute__ ((format (printf, 3, 0)));
163extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, 166extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
164 char *fmt, ...); 167 char *fmt, ...)
165/* PRINTFLIKE3 */ 168 __attribute__ ((format (printf, 4, 5)));
166extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); 169extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
170 __attribute__ ((format (printf, 3, 4)));
167 171
168extern void xfs_hex_dump(void *p, int length); 172extern void xfs_hex_dump(void *p, int length);
169 173
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8aa28f751b2..05a4bdd4be3 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
110{ 110{
111 xfs_mount_t *mp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
113 mp = efip->efi_item.li_mountp; 113 spin_lock(&ailp->xa_lock);
114 spin_lock(&mp->m_ail_lock);
115 if (efip->efi_flags & XFS_EFI_CANCELED) { 114 if (efip->efi_flags & XFS_EFI_CANCELED) {
116 /* 115 /* xfs_trans_ail_delete() drops the AIL lock. */
117 * xfs_trans_delete_ail() drops the AIL lock. 116 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
118 */
119 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
120 xfs_efi_item_free(efip); 117 xfs_efi_item_free(efip);
121 } else { 118 } else {
122 efip->efi_flags |= XFS_EFI_COMMITTED; 119 efip->efi_flags |= XFS_EFI_COMMITTED;
123 spin_unlock(&mp->m_ail_lock); 120 spin_unlock(&ailp->xa_lock);
124 } 121 }
125} 122}
126 123
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
134STATIC void 131STATIC void
135xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) 132xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
136{ 133{
137 xfs_mount_t *mp; 134 struct xfs_ail *ailp = efip->efi_item.li_ailp;
138 xfs_log_item_desc_t *lidp; 135 xfs_log_item_desc_t *lidp;
139 136
140 mp = efip->efi_item.li_mountp; 137 spin_lock(&ailp->xa_lock);
141 spin_lock(&mp->m_ail_lock);
142 if (efip->efi_flags & XFS_EFI_CANCELED) { 138 if (efip->efi_flags & XFS_EFI_CANCELED) {
143 /* 139 /*
144 * free the xaction descriptor pointing to this item 140 * free the xaction descriptor pointing to this item
145 */ 141 */
146 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); 142 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
147 xfs_trans_free_item(tp, lidp); 143 xfs_trans_free_item(tp, lidp);
148 /* 144
149 * pull the item off the AIL. 145 /* xfs_trans_ail_delete() drops the AIL lock. */
150 * xfs_trans_delete_ail() drops the AIL lock. 146 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
151 */
152 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
153 xfs_efi_item_free(efip); 147 xfs_efi_item_free(efip);
154 } else { 148 } else {
155 efip->efi_flags |= XFS_EFI_COMMITTED; 149 efip->efi_flags |= XFS_EFI_COMMITTED;
156 spin_unlock(&mp->m_ail_lock); 150 spin_unlock(&ailp->xa_lock);
157 } 151 }
158} 152}
159 153
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp,
268 efip->efi_item.li_type = XFS_LI_EFI; 262 efip->efi_item.li_type = XFS_LI_EFI;
269 efip->efi_item.li_ops = &xfs_efi_item_ops; 263 efip->efi_item.li_ops = &xfs_efi_item_ops;
270 efip->efi_item.li_mountp = mp; 264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
271 efip->efi_format.efi_nextents = nextents; 266 efip->efi_format.efi_nextents = nextents;
272 efip->efi_format.efi_id = (__psint_t)(void*)efip; 267 efip->efi_format.efi_id = (__psint_t)(void*)efip;
273 268
@@ -345,25 +340,22 @@ void
345xfs_efi_release(xfs_efi_log_item_t *efip, 340xfs_efi_release(xfs_efi_log_item_t *efip,
346 uint nextents) 341 uint nextents)
347{ 342{
348 xfs_mount_t *mp; 343 struct xfs_ail *ailp = efip->efi_item.li_ailp;
349 int extents_left; 344 int extents_left;
350 345
351 mp = efip->efi_item.li_mountp;
352 ASSERT(efip->efi_next_extent > 0); 346 ASSERT(efip->efi_next_extent > 0);
353 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); 347 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
354 348
355 spin_lock(&mp->m_ail_lock); 349 spin_lock(&ailp->xa_lock);
356 ASSERT(efip->efi_next_extent >= nextents); 350 ASSERT(efip->efi_next_extent >= nextents);
357 efip->efi_next_extent -= nextents; 351 efip->efi_next_extent -= nextents;
358 extents_left = efip->efi_next_extent; 352 extents_left = efip->efi_next_extent;
359 if (extents_left == 0) { 353 if (extents_left == 0) {
360 /* 354 /* xfs_trans_ail_delete() drops the AIL lock. */
361 * xfs_trans_delete_ail() drops the AIL lock. 355 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
362 */
363 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
364 xfs_efi_item_free(efip); 356 xfs_efi_item_free(efip);
365 } else { 357 } else {
366 spin_unlock(&mp->m_ail_lock); 358 spin_unlock(&ailp->xa_lock);
367 } 359 }
368} 360}
369 361
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp,
565 efdp->efd_item.li_type = XFS_LI_EFD; 557 efdp->efd_item.li_type = XFS_LI_EFD;
566 efdp->efd_item.li_ops = &xfs_efd_item_ops; 558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
567 efdp->efd_item.li_mountp = mp; 559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
568 efdp->efd_efip = efip; 561 efdp->efd_efip = efip;
569 efdp->efd_format.efd_nextents = nextents; 562 efdp->efd_format.efd_nextents = nextents;
570 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 01c0cc88d3f..589c41c3844 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -113,22 +113,14 @@ struct getbmapx {
113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ 113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#ifdef __KERNEL__ 117#define BMV_IF_VALID \
118#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */ 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119#endif
120 119
121/* bmv_oflags values - returned for for each non-header segment */ 120/* bmv_oflags values - returned for for each non-header segment */
122#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
123 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
124/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
125#define GETBMAP_CONVERT(p1,p2) { \
126 p2.bmv_offset = p1.bmv_offset; \
127 p2.bmv_block = p1.bmv_block; \
128 p2.bmv_length = p1.bmv_length; \
129 p2.bmv_count = p1.bmv_count; \
130 p2.bmv_entries = p1.bmv_entries; }
131
132 124
133/* 125/*
134 * Structure for XFS_IOC_FSSETDM. 126 * Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
426#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS 418#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
427#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS 419#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
428#define XFS_IOC_GETVERSION FS_IOC_GETVERSION 420#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
429/* 32-bit compat counterparts */
430#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS
431#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS
432#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION
433 421
434/* 422/*
435 * ioctl commands that replace IRIX fcntl()'s 423 * ioctl commands that replace IRIX fcntl()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84583cf73db..852b6d32e8d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
126 xfs_extlen_t agsize; 126 xfs_extlen_t agsize;
127 xfs_extlen_t tmpsize; 127 xfs_extlen_t tmpsize;
128 xfs_alloc_rec_t *arec; 128 xfs_alloc_rec_t *arec;
129 xfs_btree_sblock_t *block; 129 struct xfs_btree_block *block;
130 xfs_buf_t *bp; 130 xfs_buf_t *bp;
131 int bucket; 131 int bucket;
132 int dpct; 132 int dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
251 bp = xfs_buf_get(mp->m_ddev_targp, 251 bp = xfs_buf_get(mp->m_ddev_targp,
252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
253 BTOBB(mp->m_sb.sb_blocksize), 0); 253 BTOBB(mp->m_sb.sb_blocksize), 0);
254 block = XFS_BUF_TO_SBLOCK(bp); 254 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 255 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
257 block->bb_level = 0; 257 block->bb_level = 0;
258 block->bb_numrecs = cpu_to_be16(1); 258 block->bb_numrecs = cpu_to_be16(1);
259 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 259 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
260 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 260 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
261 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 261 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 263 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 264 agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
272 bp = xfs_buf_get(mp->m_ddev_targp, 272 bp = xfs_buf_get(mp->m_ddev_targp,
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 0); 274 BTOBB(mp->m_sb.sb_blocksize), 0);
275 block = XFS_BUF_TO_SBLOCK(bp); 275 block = XFS_BUF_TO_BLOCK(bp);
276 memset(block, 0, mp->m_sb.sb_blocksize); 276 memset(block, 0, mp->m_sb.sb_blocksize);
277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
278 block->bb_level = 0; 278 block->bb_level = 0;
279 block->bb_numrecs = cpu_to_be16(1); 279 block->bb_numrecs = cpu_to_be16(1);
280 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 280 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
281 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 281 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
282 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 282 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
284 arec->ar_blockcount = cpu_to_be32( 284 arec->ar_blockcount = cpu_to_be32(
285 agsize - be32_to_cpu(arec->ar_startblock)); 285 agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
294 bp = xfs_buf_get(mp->m_ddev_targp, 294 bp = xfs_buf_get(mp->m_ddev_targp,
295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
296 BTOBB(mp->m_sb.sb_blocksize), 0); 296 BTOBB(mp->m_sb.sb_blocksize), 0);
297 block = XFS_BUF_TO_SBLOCK(bp); 297 block = XFS_BUF_TO_BLOCK(bp);
298 memset(block, 0, mp->m_sb.sb_blocksize); 298 memset(block, 0, mp->m_sb.sb_blocksize);
299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
300 block->bb_level = 0; 300 block->bb_level = 0;
301 block->bb_numrecs = 0; 301 block->bb_numrecs = 0;
302 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 302 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
303 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 303 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
304 error = xfs_bwrite(mp, bp); 304 error = xfs_bwrite(mp, bp);
305 if (error) { 305 if (error) {
306 goto error0; 306 goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
435 xfs_growfs_data_t *in) 435 xfs_growfs_data_t *in)
436{ 436{
437 int error; 437 int error;
438
439 if (!capable(CAP_SYS_ADMIN))
440 return XFS_ERROR(EPERM);
438 if (!mutex_trylock(&mp->m_growlock)) 441 if (!mutex_trylock(&mp->m_growlock))
439 return XFS_ERROR(EWOULDBLOCK); 442 return XFS_ERROR(EWOULDBLOCK);
440 error = xfs_growfs_data_private(mp, in); 443 error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
448 xfs_growfs_log_t *in) 451 xfs_growfs_log_t *in)
449{ 452{
450 int error; 453 int error;
454
455 if (!capable(CAP_SYS_ADMIN))
456 return XFS_ERROR(EPERM);
451 if (!mutex_trylock(&mp->m_growlock)) 457 if (!mutex_trylock(&mp->m_growlock))
452 return XFS_ERROR(EWOULDBLOCK); 458 return XFS_ERROR(EWOULDBLOCK);
453 error = xfs_growfs_log_private(mp, in); 459 error = xfs_growfs_log_private(mp, in);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index aad8c5da38a..e6ebbaeb4dc 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -41,68 +41,6 @@
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43 43
44/*
45 * Log specified fields for the inode given by bp and off.
46 */
47STATIC void
48xfs_ialloc_log_di(
49 xfs_trans_t *tp, /* transaction pointer */
50 xfs_buf_t *bp, /* inode buffer */
51 int off, /* index of inode in buffer */
52 int fields) /* bitmask of fields to log */
53{
54 int first; /* first byte number */
55 int ioffset; /* off in bytes */
56 int last; /* last byte number */
57 xfs_mount_t *mp; /* mount point structure */
58 static const short offsets[] = { /* field offsets */
59 /* keep in sync with bits */
60 offsetof(xfs_dinode_core_t, di_magic),
61 offsetof(xfs_dinode_core_t, di_mode),
62 offsetof(xfs_dinode_core_t, di_version),
63 offsetof(xfs_dinode_core_t, di_format),
64 offsetof(xfs_dinode_core_t, di_onlink),
65 offsetof(xfs_dinode_core_t, di_uid),
66 offsetof(xfs_dinode_core_t, di_gid),
67 offsetof(xfs_dinode_core_t, di_nlink),
68 offsetof(xfs_dinode_core_t, di_projid),
69 offsetof(xfs_dinode_core_t, di_pad),
70 offsetof(xfs_dinode_core_t, di_atime),
71 offsetof(xfs_dinode_core_t, di_mtime),
72 offsetof(xfs_dinode_core_t, di_ctime),
73 offsetof(xfs_dinode_core_t, di_size),
74 offsetof(xfs_dinode_core_t, di_nblocks),
75 offsetof(xfs_dinode_core_t, di_extsize),
76 offsetof(xfs_dinode_core_t, di_nextents),
77 offsetof(xfs_dinode_core_t, di_anextents),
78 offsetof(xfs_dinode_core_t, di_forkoff),
79 offsetof(xfs_dinode_core_t, di_aformat),
80 offsetof(xfs_dinode_core_t, di_dmevmask),
81 offsetof(xfs_dinode_core_t, di_dmstate),
82 offsetof(xfs_dinode_core_t, di_flags),
83 offsetof(xfs_dinode_core_t, di_gen),
84 offsetof(xfs_dinode_t, di_next_unlinked),
85 offsetof(xfs_dinode_t, di_u),
86 offsetof(xfs_dinode_t, di_a),
87 sizeof(xfs_dinode_t)
88 };
89
90
91 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
92 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
93 mp = tp->t_mountp;
94 /*
95 * Get the inode-relative first and last bytes for these fields
96 */
97 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
98 /*
99 * Convert to buffer offsets and log it.
100 */
101 ioffset = off << mp->m_sb.sb_inodelog;
102 first += ioffset;
103 last += ioffset;
104 xfs_trans_log_buf(tp, bp, first, last);
105}
106 44
107/* 45/*
108 * Allocation group level functions. 46 * Allocation group level functions.
@@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment(
119} 57}
120 58
121/* 59/*
60 * Lookup the record equal to ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */
80int /* error */
81xfs_inobt_lookup_ge(
82 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */
87{
88 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt;
90 cur->bc_rec.i.ir_free = free;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
92}
93
94/*
95 * Lookup the first record less than or equal to ino
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */
117STATIC int /* error */
118xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{
124 union xfs_btree_rec rec;
125
126 rec.inobt.ir_startino = cpu_to_be32(ino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt);
128 rec.inobt.ir_free = cpu_to_be64(free);
129 return xfs_btree_update(cur, &rec);
130}
131
132/*
133 * Get the data from the pointed-to record.
134 */
135int /* error */
136xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */
142{
143 union xfs_btree_rec *rec;
144 int error;
145
146 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free);
151 }
152 return error;
153}
154
155/*
122 * Allocate new inodes in the allocation group specified by agbp. 156 * Allocate new inodes in the allocation group specified by agbp.
123 * Return 0 for success, else error code. 157 * Return 0 for success, else error code.
124 */ 158 */
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
287 * able to use the file system. 321 * able to use the file system.
288 */ 322 */
289 if (xfs_sb_version_hasnlink(&args.mp->m_sb)) 323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
290 version = XFS_DINODE_VERSION_2; 324 version = 2;
291 else 325 else
292 version = XFS_DINODE_VERSION_1; 326 version = 1;
293 327
294 /* 328 /*
295 * Seed the new inode cluster with a random generation number. This 329 * Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
310 XFS_BUF_LOCK); 344 XFS_BUF_LOCK);
311 ASSERT(fbuf); 345 ASSERT(fbuf);
312 ASSERT(!XFS_BUF_GETERROR(fbuf)); 346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347
313 /* 348 /*
314 * Set initial values for the inodes in this buffer. 349 * Initialize all inodes in this buffer and then log them.
350 *
351 * XXX: It would be much better if we had just one transaction to
352 * log a whole cluster of inodes instead of all the indivdual
353 * transactions causing a lot of log traffic.
315 */ 354 */
316 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
317 for (i = 0; i < ninodes; i++) { 356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
318 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
319 free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
320 free->di_core.di_version = version; 362 free->di_version = version;
321 free->di_core.di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
322 free->di_next_unlinked = cpu_to_be32(NULLAGINO); 364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
323 xfs_ialloc_log_di(tp, fbuf, i, 365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
324 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
325 } 366 }
326 xfs_trans_inode_alloc_buf(tp, fbuf); 367 xfs_trans_inode_alloc_buf(tp, fbuf);
327 } 368 }
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
335 /* 376 /*
336 * Insert records describing the new inode chunk into the btree. 377 * Insert records describing the new inode chunk into the btree.
337 */ 378 */
338 cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, 379 cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
339 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
340 for (thisino = newino; 380 for (thisino = newino;
341 thisino < newino + newlen; 381 thisino < newino + newlen;
342 thisino += XFS_INODES_PER_CHUNK) { 382 thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
346 return error; 386 return error;
347 } 387 }
348 ASSERT(i == 0); 388 ASSERT(i == 0);
349 if ((error = xfs_inobt_insert(cur, &i))) { 389 if ((error = xfs_btree_insert(cur, &i))) {
350 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
351 return error; 391 return error;
352 } 392 }
@@ -676,8 +716,7 @@ nextag:
676 */ 716 */
677 agno = tagno; 717 agno = tagno;
678 *IO_agbp = NULL; 718 *IO_agbp = NULL;
679 cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), 719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
680 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
681 /* 720 /*
682 * If pagino is 0 (this is the root inode allocation) use newino. 721 * If pagino is 0 (this is the root inode allocation) use newino.
683 * This must work because we've just allocated some. 722 * This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
697 goto error0; 736 goto error0;
698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
699 freecount += rec.ir_freecount; 738 freecount += rec.ir_freecount;
700 if ((error = xfs_inobt_increment(cur, 0, &i))) 739 if ((error = xfs_btree_increment(cur, 0, &i)))
701 goto error0; 740 goto error0;
702 } while (i == 1); 741 } while (i == 1);
703 742
@@ -741,7 +780,7 @@ nextag:
741 /* 780 /*
742 * Search left with tcur, back up 1 record. 781 * Search left with tcur, back up 1 record.
743 */ 782 */
744 if ((error = xfs_inobt_decrement(tcur, 0, &i))) 783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
745 goto error1; 784 goto error1;
746 doneleft = !i; 785 doneleft = !i;
747 if (!doneleft) { 786 if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
755 /* 794 /*
756 * Search right with cur, go forward 1 record. 795 * Search right with cur, go forward 1 record.
757 */ 796 */
758 if ((error = xfs_inobt_increment(cur, 0, &i))) 797 if ((error = xfs_btree_increment(cur, 0, &i)))
759 goto error1; 798 goto error1;
760 doneright = !i; 799 doneright = !i;
761 if (!doneright) { 800 if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
817 * further left. 856 * further left.
818 */ 857 */
819 if (useleft) { 858 if (useleft) {
820 if ((error = xfs_inobt_decrement(tcur, 0, 859 if ((error = xfs_btree_decrement(tcur, 0,
821 &i))) 860 &i)))
822 goto error1; 861 goto error1;
823 doneleft = !i; 862 doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
837 * further right. 876 * further right.
838 */ 877 */
839 else { 878 else {
840 if ((error = xfs_inobt_increment(cur, 0, 879 if ((error = xfs_btree_increment(cur, 0,
841 &i))) 880 &i)))
842 goto error1; 881 goto error1;
843 doneright = !i; 882 doneright = !i;
@@ -892,7 +931,7 @@ nextag:
892 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
893 if (rec.ir_freecount > 0) 932 if (rec.ir_freecount > 0)
894 break; 933 break;
895 if ((error = xfs_inobt_increment(cur, 0, &i))) 934 if ((error = xfs_btree_increment(cur, 0, &i)))
896 goto error0; 935 goto error0;
897 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
898 } 937 }
@@ -926,7 +965,7 @@ nextag:
926 goto error0; 965 goto error0;
927 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
928 freecount += rec.ir_freecount; 967 freecount += rec.ir_freecount;
929 if ((error = xfs_inobt_increment(cur, 0, &i))) 968 if ((error = xfs_btree_increment(cur, 0, &i)))
930 goto error0; 969 goto error0;
931 } while (i == 1); 970 } while (i == 1);
932 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
1022 /* 1061 /*
1023 * Initialize the cursor. 1062 * Initialize the cursor.
1024 */ 1063 */
1025 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1026 (xfs_inode_t *)0, 0);
1027#ifdef DEBUG 1065#ifdef DEBUG
1028 if (cur->bc_nlevels == 1) { 1066 if (cur->bc_nlevels == 1) {
1029 int freecount = 0; 1067 int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
1036 goto error0; 1074 goto error0;
1037 if (i) { 1075 if (i) {
1038 freecount += rec.ir_freecount; 1076 freecount += rec.ir_freecount;
1039 if ((error = xfs_inobt_increment(cur, 0, &i))) 1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1040 goto error0; 1078 goto error0;
1041 } 1079 }
1042 } while (i == 1); 1080 } while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
1098 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1136 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1099 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1100 1138
1101 if ((error = xfs_inobt_delete(cur, &i))) { 1139 if ((error = xfs_btree_delete(cur, &i))) {
1102 cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", 1140 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
1103 error, mp->m_fsname); 1141 error, mp->m_fsname);
1104 goto error0; 1142 goto error0;
1105 } 1143 }
@@ -1141,7 +1179,7 @@ xfs_difree(
1141 goto error0; 1179 goto error0;
1142 if (i) { 1180 if (i) {
1143 freecount += rec.ir_freecount; 1181 freecount += rec.ir_freecount;
1144 if ((error = xfs_inobt_increment(cur, 0, &i))) 1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1145 goto error0; 1183 goto error0;
1146 } 1184 }
1147 } while (i == 1); 1185 } while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
1158} 1196}
1159 1197
1160/* 1198/*
1161 * Return the location of the inode in bno/off, for mapping it into a buffer. 1199 * Return the location of the inode in imap, for mapping it into a buffer.
1162 */ 1200 */
1163/*ARGSUSED*/
1164int 1201int
1165xfs_dilocate( 1202xfs_imap(
1166 xfs_mount_t *mp, /* file system mount structure */ 1203 xfs_mount_t *mp, /* file system mount structure */
1167 xfs_trans_t *tp, /* transaction pointer */ 1204 xfs_trans_t *tp, /* transaction pointer */
1168 xfs_ino_t ino, /* inode to locate */ 1205 xfs_ino_t ino, /* inode to locate */
1169 xfs_fsblock_t *bno, /* output: block containing inode */ 1206 struct xfs_imap *imap, /* location map structure */
1170 int *len, /* output: num blocks in inode cluster */ 1207 uint flags) /* flags for inode btree lookup */
1171 int *off, /* output: index in block of inode */
1172 uint flags) /* flags concerning inode lookup */
1173{ 1208{
1174 xfs_agblock_t agbno; /* block number of inode in the alloc group */ 1209 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1175 xfs_buf_t *agbp; /* agi buffer */
1176 xfs_agino_t agino; /* inode number within alloc group */ 1210 xfs_agino_t agino; /* inode number within alloc group */
1177 xfs_agnumber_t agno; /* allocation group number */ 1211 xfs_agnumber_t agno; /* allocation group number */
1178 int blks_per_cluster; /* num blocks per inode cluster */ 1212 int blks_per_cluster; /* num blocks per inode cluster */
1179 xfs_agblock_t chunk_agbno; /* first block in inode chunk */ 1213 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1180 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1181 __int32_t chunk_cnt; /* count of free inodes in chunk */
1182 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1183 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1214 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1184 xfs_btree_cur_t *cur; /* inode btree cursor */
1185 int error; /* error code */ 1215 int error; /* error code */
1186 int i; /* temp state */
1187 int offset; /* index of inode in its buffer */ 1216 int offset; /* index of inode in its buffer */
1188 int offset_agbno; /* blks from chunk start to inode */ 1217 int offset_agbno; /* blks from chunk start to inode */
1189 1218
1190 ASSERT(ino != NULLFSINO); 1219 ASSERT(ino != NULLFSINO);
1220
1191 /* 1221 /*
1192 * Split up the inode number into its parts. 1222 * Split up the inode number into its parts.
1193 */ 1223 */
@@ -1198,24 +1228,24 @@ xfs_dilocate(
1198 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1228 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1199#ifdef DEBUG 1229#ifdef DEBUG
1200 /* no diagnostics for bulkstat, ino comes from userspace */ 1230 /* no diagnostics for bulkstat, ino comes from userspace */
1201 if (flags & XFS_IMAP_BULKSTAT) 1231 if (flags & XFS_IGET_BULKSTAT)
1202 return XFS_ERROR(EINVAL); 1232 return XFS_ERROR(EINVAL);
1203 if (agno >= mp->m_sb.sb_agcount) { 1233 if (agno >= mp->m_sb.sb_agcount) {
1204 xfs_fs_cmn_err(CE_ALERT, mp, 1234 xfs_fs_cmn_err(CE_ALERT, mp,
1205 "xfs_dilocate: agno (%d) >= " 1235 "xfs_imap: agno (%d) >= "
1206 "mp->m_sb.sb_agcount (%d)", 1236 "mp->m_sb.sb_agcount (%d)",
1207 agno, mp->m_sb.sb_agcount); 1237 agno, mp->m_sb.sb_agcount);
1208 } 1238 }
1209 if (agbno >= mp->m_sb.sb_agblocks) { 1239 if (agbno >= mp->m_sb.sb_agblocks) {
1210 xfs_fs_cmn_err(CE_ALERT, mp, 1240 xfs_fs_cmn_err(CE_ALERT, mp,
1211 "xfs_dilocate: agbno (0x%llx) >= " 1241 "xfs_imap: agbno (0x%llx) >= "
1212 "mp->m_sb.sb_agblocks (0x%lx)", 1242 "mp->m_sb.sb_agblocks (0x%lx)",
1213 (unsigned long long) agbno, 1243 (unsigned long long) agbno,
1214 (unsigned long) mp->m_sb.sb_agblocks); 1244 (unsigned long) mp->m_sb.sb_agblocks);
1215 } 1245 }
1216 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1246 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1217 xfs_fs_cmn_err(CE_ALERT, mp, 1247 xfs_fs_cmn_err(CE_ALERT, mp,
1218 "xfs_dilocate: ino (0x%llx) != " 1248 "xfs_imap: ino (0x%llx) != "
1219 "XFS_AGINO_TO_INO(mp, agno, agino) " 1249 "XFS_AGINO_TO_INO(mp, agno, agino) "
1220 "(0x%llx)", 1250 "(0x%llx)",
1221 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1251 ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
1224#endif /* DEBUG */ 1254#endif /* DEBUG */
1225 return XFS_ERROR(EINVAL); 1255 return XFS_ERROR(EINVAL);
1226 } 1256 }
1227 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || 1257
1228 !(flags & XFS_IMAP_LOOKUP)) { 1258 /*
1259 * If the inode cluster size is the same as the blocksize or
1260 * smaller we get to the buffer by simple arithmetics.
1261 */
1262 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1229 offset = XFS_INO_TO_OFFSET(mp, ino); 1263 offset = XFS_INO_TO_OFFSET(mp, ino);
1230 ASSERT(offset < mp->m_sb.sb_inopblock); 1264 ASSERT(offset < mp->m_sb.sb_inopblock);
1231 *bno = XFS_AGB_TO_FSB(mp, agno, agbno); 1265
1232 *off = offset; 1266 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1233 *len = 1; 1267 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1268 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1234 return 0; 1269 return 0;
1235 } 1270 }
1271
1236 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1272 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1237 if (*bno != NULLFSBLOCK) { 1273
1274 /*
1275 * If we get a block number passed from bulkstat we can use it to
1276 * find the buffer easily.
1277 */
1278 if (imap->im_blkno) {
1238 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1239 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1240 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); 1281
1241 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
1242 offset; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1243 *len = blks_per_cluster; 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1286 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1244 return 0; 1287 return 0;
1245 } 1288 }
1289
1290 /*
1291 * If the inode chunks are aligned then use simple maths to
1292 * find the location. Otherwise we have to do a btree
1293 * lookup to find the location.
1294 */
1246 if (mp->m_inoalign_mask) { 1295 if (mp->m_inoalign_mask) {
1247 offset_agbno = agbno & mp->m_inoalign_mask; 1296 offset_agbno = agbno & mp->m_inoalign_mask;
1248 chunk_agbno = agbno - offset_agbno; 1297 chunk_agbno = agbno - offset_agbno;
1249 } else { 1298 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */
1305
1250 down_read(&mp->m_peraglock); 1306 down_read(&mp->m_peraglock);
1251 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1307 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1252 up_read(&mp->m_peraglock); 1308 up_read(&mp->m_peraglock);
1253 if (error) { 1309 if (error) {
1254#ifdef DEBUG 1310 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1255 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1256 "xfs_ialloc_read_agi() returned " 1311 "xfs_ialloc_read_agi() returned "
1257 "error %d, agno %d", 1312 "error %d, agno %d",
1258 error, agno); 1313 error, agno);
1259#endif /* DEBUG */
1260 return error; 1314 return error;
1261 } 1315 }
1262 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1316
1263 (xfs_inode_t *)0, 0); 1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1264 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
1265#ifdef DEBUG 1319 if (error) {
1266 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1267 "xfs_inobt_lookup_le() failed"); 1321 "xfs_inobt_lookup_le() failed");
1268#endif /* DEBUG */
1269 goto error0; 1322 goto error0;
1270 } 1323 }
1271 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1324
1272 &chunk_free, &i))) { 1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1273#ifdef DEBUG 1326 &chunk_free, &i);
1274 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1327 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1275 "xfs_inobt_get_rec() failed"); 1329 "xfs_inobt_get_rec() failed");
1276#endif /* DEBUG */
1277 goto error0; 1330 goto error0;
1278 } 1331 }
1279 if (i == 0) { 1332 if (i == 0) {
1280#ifdef DEBUG 1333#ifdef DEBUG
1281 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1334 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1282 "xfs_inobt_get_rec() failed"); 1335 "xfs_inobt_get_rec() failed");
1283#endif /* DEBUG */ 1336#endif /* DEBUG */
1284 error = XFS_ERROR(EINVAL); 1337 error = XFS_ERROR(EINVAL);
1285 } 1338 }
1339 error0:
1286 xfs_trans_brelse(tp, agbp); 1340 xfs_trans_brelse(tp, agbp);
1287 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1288 if (error) 1342 if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
1290 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1291 offset_agbno = agbno - chunk_agbno; 1345 offset_agbno = agbno - chunk_agbno;
1292 } 1346 }
1347
1293 ASSERT(agbno >= chunk_agbno); 1348 ASSERT(agbno >= chunk_agbno);
1294 cluster_agbno = chunk_agbno + 1349 cluster_agbno = chunk_agbno +
1295 ((offset_agbno / blks_per_cluster) * blks_per_cluster); 1350 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1296 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1351 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1297 XFS_INO_TO_OFFSET(mp, ino); 1352 XFS_INO_TO_OFFSET(mp, ino);
1298 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); 1353
1299 *off = offset; 1354 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1300 *len = blks_per_cluster; 1355 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1356 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1357
1358 /*
1359 * If the inode number maps to a block outside the bounds
1360 * of the file system then return NULL rather than calling
1361 * read_buf and panicing when we get an error from the
1362 * driver.
1363 */
1364 if ((imap->im_blkno + imap->im_len) >
1365 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1366 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1367 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
1368 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
1369 (unsigned long long) imap->im_blkno,
1370 (unsigned long long) imap->im_len,
1371 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1372 return XFS_ERROR(EINVAL);
1373 }
1374
1301 return 0; 1375 return 0;
1302error0:
1303 xfs_trans_brelse(tp, agbp);
1304 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1305 return error;
1306} 1376}
1307 1377
1308/* 1378/*
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
1370 xfs_trans_log_buf(tp, bp, first, last); 1440 xfs_trans_log_buf(tp, bp, first, last);
1371} 1441}
1372 1442
1443#ifdef DEBUG
1444STATIC void
1445xfs_check_agi_unlinked(
1446 struct xfs_agi *agi)
1447{
1448 int i;
1449
1450 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1451 ASSERT(agi->agi_unlinked[i]);
1452}
1453#else
1454#define xfs_check_agi_unlinked(agi)
1455#endif
1456
1373/* 1457/*
1374 * Read in the allocation group header (inode allocation section) 1458 * Read in the allocation group header (inode allocation section)
1375 */ 1459 */
1376int 1460int
1377xfs_ialloc_read_agi( 1461xfs_read_agi(
1378 xfs_mount_t *mp, /* file system mount structure */ 1462 struct xfs_mount *mp, /* file system mount structure */
1379 xfs_trans_t *tp, /* transaction pointer */ 1463 struct xfs_trans *tp, /* transaction pointer */
1380 xfs_agnumber_t agno, /* allocation group number */ 1464 xfs_agnumber_t agno, /* allocation group number */
1381 xfs_buf_t **bpp) /* allocation group hdr buf */ 1465 struct xfs_buf **bpp) /* allocation group hdr buf */
1382{ 1466{
1383 xfs_agi_t *agi; /* allocation group header */ 1467 struct xfs_agi *agi; /* allocation group header */
1384 int agi_ok; /* agi is consistent */ 1468 int agi_ok; /* agi is consistent */
1385 xfs_buf_t *bp; /* allocation group hdr buf */ 1469 int error;
1386 xfs_perag_t *pag; /* per allocation group data */
1387 int error;
1388 1470
1389 ASSERT(agno != NULLAGNUMBER); 1471 ASSERT(agno != NULLAGNUMBER);
1390 error = xfs_trans_read_buf( 1472
1391 mp, tp, mp->m_ddev_targp, 1473 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1392 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1474 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1393 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1475 XFS_FSS_TO_BB(mp, 1), 0, bpp);
1394 if (error) 1476 if (error)
1395 return error; 1477 return error;
1396 ASSERT(bp && !XFS_BUF_GETERROR(bp)); 1478
1479 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
1480 agi = XFS_BUF_TO_AGI(*bpp);
1397 1481
1398 /* 1482 /*
1399 * Validate the magic number of the agi block. 1483 * Validate the magic number of the agi block.
1400 */ 1484 */
1401 agi = XFS_BUF_TO_AGI(bp); 1485 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1402 agi_ok = 1486 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1403 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1487 be32_to_cpu(agi->agi_seqno) == agno;
1404 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1405 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1488 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1406 XFS_RANDOM_IALLOC_READ_AGI))) { 1489 XFS_RANDOM_IALLOC_READ_AGI))) {
1407 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, 1490 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1408 mp, agi); 1491 mp, agi);
1409 xfs_trans_brelse(tp, bp); 1492 xfs_trans_brelse(tp, *bpp);
1410 return XFS_ERROR(EFSCORRUPTED); 1493 return XFS_ERROR(EFSCORRUPTED);
1411 } 1494 }
1495
1496 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
1497
1498 xfs_check_agi_unlinked(agi);
1499 return 0;
1500}
1501
1502int
1503xfs_ialloc_read_agi(
1504 struct xfs_mount *mp, /* file system mount structure */
1505 struct xfs_trans *tp, /* transaction pointer */
1506 xfs_agnumber_t agno, /* allocation group number */
1507 struct xfs_buf **bpp) /* allocation group hdr buf */
1508{
1509 struct xfs_agi *agi; /* allocation group header */
1510 struct xfs_perag *pag; /* per allocation group data */
1511 int error;
1512
1513 error = xfs_read_agi(mp, tp, agno, bpp);
1514 if (error)
1515 return error;
1516
1517 agi = XFS_BUF_TO_AGI(*bpp);
1412 pag = &mp->m_perag[agno]; 1518 pag = &mp->m_perag[agno];
1519
1413 if (!pag->pagi_init) { 1520 if (!pag->pagi_init) {
1414 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1521 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1415 pag->pagi_count = be32_to_cpu(agi->agi_count); 1522 pag->pagi_count = be32_to_cpu(agi->agi_count);
1416 pag->pagi_init = 1; 1523 pag->pagi_init = 1;
1417 } else {
1418 /*
1419 * It's possible for these to be out of sync if
1420 * we are in the middle of a forced shutdown.
1421 */
1422 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1423 XFS_FORCED_SHUTDOWN(mp));
1424 } 1524 }
1425 1525
1426#ifdef DEBUG 1526 /*
1427 { 1527 * It's possible for these to be out of sync if
1428 int i; 1528 * we are in the middle of a forced shutdown.
1429 1529 */
1430 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) 1530 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1431 ASSERT(agi->agi_unlinked[i]); 1531 XFS_FORCED_SHUTDOWN(mp));
1432 }
1433#endif
1434
1435 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1436 *bpp = bp;
1437 return 0; 1532 return 0;
1438} 1533}
1439 1534
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 4e30ec1d13b..50f558a4e0a 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,6 +20,7 @@
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_imap;
23struct xfs_mount; 24struct xfs_mount;
24struct xfs_trans; 25struct xfs_trans;
25 26
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
56} 57}
57 58
58 59
59#ifdef __KERNEL__
60/* 60/*
61 * Allocate an inode on disk. 61 * Allocate an inode on disk.
62 * Mode is used to tell whether the new inode will need space, and whether 62 * Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
105 xfs_ino_t *first_ino); /* first inode in deleted cluster */ 105 xfs_ino_t *first_ino); /* first inode in deleted cluster */
106 106
107/* 107/*
108 * Return the location of the inode in bno/len/off, 108 * Return the location of the inode in imap, for mapping it into a buffer.
109 * for mapping it into a buffer.
110 */ 109 */
111int 110int
112xfs_dilocate( 111xfs_imap(
113 struct xfs_mount *mp, /* file system mount structure */ 112 struct xfs_mount *mp, /* file system mount structure */
114 struct xfs_trans *tp, /* transaction pointer */ 113 struct xfs_trans *tp, /* transaction pointer */
115 xfs_ino_t ino, /* inode to locate */ 114 xfs_ino_t ino, /* inode to locate */
116 xfs_fsblock_t *bno, /* output: block containing inode */ 115 struct xfs_imap *imap, /* location map structure */
117 int *len, /* output: num blocks in cluster*/
118 int *off, /* output: index in block of inode */
119 uint flags); /* flags for inode btree lookup */ 116 uint flags); /* flags for inode btree lookup */
120 117
121/* 118/*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
154 struct xfs_trans *tp, /* transaction pointer */ 151 struct xfs_trans *tp, /* transaction pointer */
155 xfs_agnumber_t agno); /* allocation group number */ 152 xfs_agnumber_t agno); /* allocation group number */
156 153
157#endif /* __KERNEL__ */ 154/*
155 * Lookup the first record greater than or equal to ino
156 * in the btree given by cur.
157 */
158int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
159 __int32_t fcnt, xfs_inofree_t free, int *stat);
160
161/*
162 * Lookup the first record less than or equal to ino
163 * in the btree given by cur.
164 */
165int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
166 __int32_t fcnt, xfs_inofree_t free, int *stat);
167
168/*
169 * Get the data from the pointed-to record.
170 */
171extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
172 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
158 173
159#endif /* __XFS_IALLOC_H__ */ 174#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 83502f3edef..99f2408e8d8 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -35,2044 +35,349 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
43STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
44STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
45STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
46STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
47STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
48STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
49STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
50 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
51STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
52 43
53/* 44STATIC int
54 * Single level of the xfs_inobt_delete record deletion routine. 45xfs_inobt_get_minrecs(
55 * Delete record pointed to by cur/level. 46 struct xfs_btree_cur *cur,
56 * Remove the record from its block then rebalance the tree. 47 int level)
57 * Return 0 for error, 1 for done, 2 to go on to the next level.
58 */
59STATIC int /* error */
60xfs_inobt_delrec(
61 xfs_btree_cur_t *cur, /* btree cursor */
62 int level, /* level removing record from */
63 int *stat) /* fail/done/go-on */
64{ 48{
65 xfs_buf_t *agbp; /* buffer for a.g. inode header */ 49 return cur->bc_mp->m_inobt_mnr[level != 0];
66 xfs_mount_t *mp; /* mount structure */ 50}
67 xfs_agi_t *agi; /* allocation group inode header */
68 xfs_inobt_block_t *block; /* btree block record/key lives in */
69 xfs_agblock_t bno; /* btree block number */
70 xfs_buf_t *bp; /* buffer for block */
71 int error; /* error return value */
72 int i; /* loop index */
73 xfs_inobt_key_t key; /* kp points here if block is level 0 */
74 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
75 xfs_agblock_t lbno; /* left block's block number */
76 xfs_buf_t *lbp; /* left block's buffer pointer */
77 xfs_inobt_block_t *left; /* left btree block */
78 xfs_inobt_key_t *lkp; /* left block key pointer */
79 xfs_inobt_ptr_t *lpp; /* left block address pointer */
80 int lrecs = 0; /* number of records in left block */
81 xfs_inobt_rec_t *lrp; /* left block record pointer */
82 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
83 int ptr; /* index in btree block for this rec */
84 xfs_agblock_t rbno; /* right block's block number */
85 xfs_buf_t *rbp; /* right block's buffer pointer */
86 xfs_inobt_block_t *right; /* right btree block */
87 xfs_inobt_key_t *rkp; /* right block key pointer */
88 xfs_inobt_rec_t *rp; /* pointer to btree records */
89 xfs_inobt_ptr_t *rpp; /* right block address pointer */
90 int rrecs = 0; /* number of records in right block */
91 int numrecs;
92 xfs_inobt_rec_t *rrp; /* right block record pointer */
93 xfs_btree_cur_t *tcur; /* temporary btree cursor */
94
95 mp = cur->bc_mp;
96
97 /*
98 * Get the index of the entry being deleted, check for nothing there.
99 */
100 ptr = cur->bc_ptrs[level];
101 if (ptr == 0) {
102 *stat = 0;
103 return 0;
104 }
105
106 /*
107 * Get the buffer & block containing the record or key/ptr.
108 */
109 bp = cur->bc_bufs[level];
110 block = XFS_BUF_TO_INOBT_BLOCK(bp);
111#ifdef DEBUG
112 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
113 return error;
114#endif
115 /*
116 * Fail if we're off the end of the block.
117 */
118 51
119 numrecs = be16_to_cpu(block->bb_numrecs); 52STATIC struct xfs_btree_cur *
120 if (ptr > numrecs) { 53xfs_inobt_dup_cursor(
121 *stat = 0; 54 struct xfs_btree_cur *cur)
122 return 0; 55{
123 } 56 return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
124 /* 57 cur->bc_private.a.agbp, cur->bc_private.a.agno);
125 * It's a nonleaf. Excise the key and ptr being deleted, by 58}
126 * sliding the entries past them down one.
127 * Log the changed areas of the block.
128 */
129 if (level > 0) {
130 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
131 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
132#ifdef DEBUG
133 for (i = ptr; i < numrecs; i++) {
134 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
135 return error;
136 }
137#endif
138 if (ptr < numrecs) {
139 memmove(&kp[ptr - 1], &kp[ptr],
140 (numrecs - ptr) * sizeof(*kp));
141 memmove(&pp[ptr - 1], &pp[ptr],
142 (numrecs - ptr) * sizeof(*kp));
143 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
144 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
145 }
146 }
147 /*
148 * It's a leaf. Excise the record being deleted, by sliding the
149 * entries past it down one. Log the changed areas of the block.
150 */
151 else {
152 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
153 if (ptr < numrecs) {
154 memmove(&rp[ptr - 1], &rp[ptr],
155 (numrecs - ptr) * sizeof(*rp));
156 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
157 }
158 /*
159 * If it's the first record in the block, we'll need a key
160 * structure to pass up to the next level (updkey).
161 */
162 if (ptr == 1) {
163 key.ir_startino = rp->ir_startino;
164 kp = &key;
165 }
166 }
167 /*
168 * Decrement and log the number of entries in the block.
169 */
170 numrecs--;
171 block->bb_numrecs = cpu_to_be16(numrecs);
172 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
173 /*
174 * Is this the root level? If so, we're almost done.
175 */
176 if (level == cur->bc_nlevels - 1) {
177 /*
178 * If this is the root level,
179 * and there's only one entry left,
180 * and it's NOT the leaf level,
181 * then we can get rid of this level.
182 */
183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp);
186 /*
187 * pp is still set to the first pointer in the block.
188 * Make it the new root of the btree.
189 */
190 bno = be32_to_cpu(agi->agi_root);
191 agi->agi_root = *pp;
192 be32_add_cpu(&agi->agi_level, -1);
193 /*
194 * Free the block.
195 */
196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error;
199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
201 XFS_AGI_ROOT | XFS_AGI_LEVEL);
202 /*
203 * Update the cursor so there's one fewer level.
204 */
205 cur->bc_bufs[level] = NULL;
206 cur->bc_nlevels--;
207 } else if (level > 0 &&
208 (error = xfs_inobt_decrement(cur, level, &i)))
209 return error;
210 *stat = 1;
211 return 0;
212 }
213 /*
214 * If we deleted the leftmost entry in the block, update the
215 * key values above us in the tree.
216 */
217 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
218 return error;
219 /*
220 * If the number of records remaining in the block is at least
221 * the minimum, we're done.
222 */
223 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
224 if (level > 0 &&
225 (error = xfs_inobt_decrement(cur, level, &i)))
226 return error;
227 *stat = 1;
228 return 0;
229 }
230 /*
231 * Otherwise, we have to move some records around to keep the
232 * tree balanced. Look at the left and right sibling blocks to
233 * see if we can re-balance by moving only one record.
234 */
235 rbno = be32_to_cpu(block->bb_rightsib);
236 lbno = be32_to_cpu(block->bb_leftsib);
237 bno = NULLAGBLOCK;
238 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
239 /*
240 * Duplicate the cursor so our btree manipulations here won't
241 * disrupt the next level up.
242 */
243 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
244 return error;
245 /*
246 * If there's a right sibling, see if it's ok to shift an entry
247 * out of it.
248 */
249 if (rbno != NULLAGBLOCK) {
250 /*
251 * Move the temp cursor to the last entry in the next block.
252 * Actually any entry but the first would suffice.
253 */
254 i = xfs_btree_lastrec(tcur, level);
255 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
256 if ((error = xfs_inobt_increment(tcur, level, &i)))
257 goto error0;
258 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
259 i = xfs_btree_lastrec(tcur, level);
260 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
261 /*
262 * Grab a pointer to the block.
263 */
264 rbp = tcur->bc_bufs[level];
265 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
266#ifdef DEBUG
267 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
268 goto error0;
269#endif
270 /*
271 * Grab the current block number, for future use.
272 */
273 bno = be32_to_cpu(right->bb_leftsib);
274 /*
275 * If right block is full enough so that removing one entry
276 * won't make it too empty, and left-shifting an entry out
277 * of right to us works, we're done.
278 */
279 if (be16_to_cpu(right->bb_numrecs) - 1 >=
280 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
281 if ((error = xfs_inobt_lshift(tcur, level, &i)))
282 goto error0;
283 if (i) {
284 ASSERT(be16_to_cpu(block->bb_numrecs) >=
285 XFS_INOBT_BLOCK_MINRECS(level, cur));
286 xfs_btree_del_cursor(tcur,
287 XFS_BTREE_NOERROR);
288 if (level > 0 &&
289 (error = xfs_inobt_decrement(cur, level,
290 &i)))
291 return error;
292 *stat = 1;
293 return 0;
294 }
295 }
296 /*
297 * Otherwise, grab the number of records in right for
298 * future reference, and fix up the temp cursor to point
299 * to our block again (last record).
300 */
301 rrecs = be16_to_cpu(right->bb_numrecs);
302 if (lbno != NULLAGBLOCK) {
303 xfs_btree_firstrec(tcur, level);
304 if ((error = xfs_inobt_decrement(tcur, level, &i)))
305 goto error0;
306 }
307 }
308 /*
309 * If there's a left sibling, see if it's ok to shift an entry
310 * out of it.
311 */
312 if (lbno != NULLAGBLOCK) {
313 /*
314 * Move the temp cursor to the first entry in the
315 * previous block.
316 */
317 xfs_btree_firstrec(tcur, level);
318 if ((error = xfs_inobt_decrement(tcur, level, &i)))
319 goto error0;
320 xfs_btree_firstrec(tcur, level);
321 /*
322 * Grab a pointer to the block.
323 */
324 lbp = tcur->bc_bufs[level];
325 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = be32_to_cpu(left->bb_rightsib);
334 /*
335 * If left block is full enough so that removing one entry
336 * won't make it too empty, and right-shifting an entry out
337 * of left to us works, we're done.
338 */
339 if (be16_to_cpu(left->bb_numrecs) - 1 >=
340 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_inobt_rshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(be16_to_cpu(block->bb_numrecs) >=
345 XFS_INOBT_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level == 0)
349 cur->bc_ptrs[0]++;
350 *stat = 1;
351 return 0;
352 }
353 }
354 /*
355 * Otherwise, grab the number of records in right for
356 * future reference.
357 */
358 lrecs = be16_to_cpu(left->bb_numrecs);
359 }
360 /*
361 * Delete the temp cursor, we're done with it.
362 */
363 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
364 /*
365 * If here, we need to do a join to keep the tree balanced.
366 */
367 ASSERT(bno != NULLAGBLOCK);
368 /*
369 * See if we can join with the left neighbor block.
370 */
371 if (lbno != NULLAGBLOCK &&
372 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
373 /*
374 * Set "right" to be the starting block,
375 * "left" to be the left neighbor.
376 */
377 rbno = bno;
378 right = block;
379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF)))
384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
386 lrecs = be16_to_cpu(left->bb_numrecs);
387 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
388 return error;
389 }
390 /*
391 * If that won't work, see if we can join with the right neighbor block.
392 */
393 else if (rbno != NULLAGBLOCK &&
394 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
395 /*
396 * Set "left" to be the starting block,
397 * "right" to be the right neighbor.
398 */
399 lbno = bno;
400 left = block;
401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF)))
406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
408 rrecs = be16_to_cpu(right->bb_numrecs);
409 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
410 return error;
411 }
412 /*
413 * Otherwise, we can't fix the imbalance.
414 * Just return. This is probably a logic error, but it's not fatal.
415 */
416 else {
417 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
418 return error;
419 *stat = 1;
420 return 0;
421 }
422 /*
423 * We're now going to join "left" and "right" by moving all the stuff
424 * in "right" to "left" and deleting "right".
425 */
426 if (level > 0) {
427 /*
428 * It's a non-leaf. Move keys and pointers.
429 */
430 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
431 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
432 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
433 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
434#ifdef DEBUG
435 for (i = 0; i < rrecs; i++) {
436 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
437 return error;
438 }
439#endif
440 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
441 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
442 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
443 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
444 } else {
445 /*
446 * It's a leaf. Move records.
447 */
448 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
449 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
450 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
451 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
452 }
453 /*
454 * If we joined with the left neighbor, set the buffer in the
455 * cursor to the left block, and fix up the index.
456 */
457 if (bp != lbp) {
458 xfs_btree_setbuf(cur, level, lbp);
459 cur->bc_ptrs[level] += lrecs;
460 }
461 /*
462 * If we joined with the right neighbor and there's a level above
463 * us, increment the cursor at that level.
464 */
465 else if (level + 1 < cur->bc_nlevels &&
466 (error = xfs_alloc_increment(cur, level + 1, &i)))
467 return error;
468 /*
469 * Fix up the number of records in the surviving block.
470 */
471 lrecs += rrecs;
472 left->bb_numrecs = cpu_to_be16(lrecs);
473 /*
474 * Fix up the right block pointer in the surviving block, and log it.
475 */
476 left->bb_rightsib = right->bb_rightsib;
477 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
478 /*
479 * If there is a right sibling now, make it point to the
480 * remaining block.
481 */
482 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
483 xfs_inobt_block_t *rrblock;
484 xfs_buf_t *rrbp;
485 59
486 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, 60STATIC void
487 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, 61xfs_inobt_set_root(
488 &rrbp, XFS_INO_BTREE_REF))) 62 struct xfs_btree_cur *cur,
489 return error; 63 union xfs_btree_ptr *nptr,
490 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); 64 int inc) /* level change */
491 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) 65{
492 return error; 66 struct xfs_buf *agbp = cur->bc_private.a.agbp;
493 rrblock->bb_leftsib = cpu_to_be32(lbno); 67 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
494 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
495 }
496 /*
497 * Free the deleting block.
498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.a.agno, rbno), 1)))
501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp);
503 /*
504 * Readjust the ptr at this level if it's not a leaf, since it's
505 * still pointing at the deletion point, which makes the cursor
506 * inconsistent. If this makes the ptr 0, the caller fixes it up.
507 * We can't use decrement because it would change the next level up.
508 */
509 if (level > 0)
510 cur->bc_ptrs[level]--;
511 /*
512 * Return value means the next level up has something to do.
513 */
514 *stat = 2;
515 return 0;
516 68
517error0: 69 agi->agi_root = nptr->s;
518 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); 70 be32_add_cpu(&agi->agi_level, inc);
519 return error; 71 xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
520} 72}
521 73
522/* 74STATIC int
523 * Insert one record/level. Return information to the caller 75xfs_inobt_alloc_block(
524 * allowing the next level up to proceed if necessary. 76 struct xfs_btree_cur *cur,
525 */ 77 union xfs_btree_ptr *start,
526STATIC int /* error */ 78 union xfs_btree_ptr *new,
527xfs_inobt_insrec( 79 int length,
528 xfs_btree_cur_t *cur, /* btree cursor */ 80 int *stat)
529 int level, /* level to insert record at */
530 xfs_agblock_t *bnop, /* i/o: block number inserted */
531 xfs_inobt_rec_t *recp, /* i/o: record data inserted */
532 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
533 int *stat) /* success/failure */
534{ 81{
535 xfs_inobt_block_t *block; /* btree block record/key lives in */ 82 xfs_alloc_arg_t args; /* block allocation args */
536 xfs_buf_t *bp; /* buffer for block */ 83 int error; /* error return value */
537 int error; /* error return value */ 84 xfs_agblock_t sbno = be32_to_cpu(start->s);
538 int i; /* loop index */
539 xfs_inobt_key_t key; /* key value being inserted */
540 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
541 xfs_agblock_t nbno; /* block number of allocated block */
542 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
543 xfs_inobt_key_t nkey; /* new key value, from split */
544 xfs_inobt_rec_t nrec; /* new record value, for caller */
545 int numrecs;
546 int optr; /* old ptr value */
547 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
548 int ptr; /* index in btree block for this rec */
549 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
550 85
551 /* 86 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
552 * GCC doesn't understand the (arguably complex) control flow in
553 * this function and complains about uninitialized structure fields
554 * without this.
555 */
556 memset(&nrec, 0, sizeof(nrec));
557 87
558 /* 88 memset(&args, 0, sizeof(args));
559 * If we made it to the root level, allocate a new root block 89 args.tp = cur->bc_tp;
560 * and we're done. 90 args.mp = cur->bc_mp;
561 */ 91 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
562 if (level >= cur->bc_nlevels) { 92 args.minlen = 1;
563 error = xfs_inobt_newroot(cur, &i); 93 args.maxlen = 1;
564 *bnop = NULLAGBLOCK; 94 args.prod = 1;
565 *stat = i; 95 args.type = XFS_ALLOCTYPE_NEAR_BNO;
96
97 error = xfs_alloc_vextent(&args);
98 if (error) {
99 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
566 return error; 100 return error;
567 } 101 }
568 /* 102 if (args.fsbno == NULLFSBLOCK) {
569 * Make a key out of the record data to be inserted, and save it. 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
570 */
571 key.ir_startino = recp->ir_startino;
572 optr = ptr = cur->bc_ptrs[level];
573 /*
574 * If we're off the left edge, return failure.
575 */
576 if (ptr == 0) {
577 *stat = 0; 104 *stat = 0;
578 return 0; 105 return 0;
579 } 106 }
580 /* 107 ASSERT(args.len == 1);
581 * Get pointers to the btree buffer and block. 108 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
582 */ 109
583 bp = cur->bc_bufs[level]; 110 new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
584 block = XFS_BUF_TO_INOBT_BLOCK(bp);
585 numrecs = be16_to_cpu(block->bb_numrecs);
586#ifdef DEBUG
587 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
588 return error;
589 /*
590 * Check that the new entry is being inserted in the right place.
591 */
592 if (ptr <= numrecs) {
593 if (level == 0) {
594 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
595 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
596 } else {
597 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
598 xfs_btree_check_key(cur->bc_btnum, &key, kp);
599 }
600 }
601#endif
602 nbno = NULLAGBLOCK;
603 ncur = NULL;
604 /*
605 * If the block is full, we can't insert the new entry until we
606 * make the block un-full.
607 */
608 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
609 /*
610 * First, try shifting an entry to the right neighbor.
611 */
612 if ((error = xfs_inobt_rshift(cur, level, &i)))
613 return error;
614 if (i) {
615 /* nothing */
616 }
617 /*
618 * Next, try shifting an entry to the left neighbor.
619 */
620 else {
621 if ((error = xfs_inobt_lshift(cur, level, &i)))
622 return error;
623 if (i) {
624 optr = ptr = cur->bc_ptrs[level];
625 } else {
626 /*
627 * Next, try splitting the current block
628 * in half. If this works we have to
629 * re-set our variables because
630 * we could be in a different block now.
631 */
632 if ((error = xfs_inobt_split(cur, level, &nbno,
633 &nkey, &ncur, &i)))
634 return error;
635 if (i) {
636 bp = cur->bc_bufs[level];
637 block = XFS_BUF_TO_INOBT_BLOCK(bp);
638#ifdef DEBUG
639 if ((error = xfs_btree_check_sblock(cur,
640 block, level, bp)))
641 return error;
642#endif
643 ptr = cur->bc_ptrs[level];
644 nrec.ir_startino = nkey.ir_startino;
645 } else {
646 /*
647 * Otherwise the insert fails.
648 */
649 *stat = 0;
650 return 0;
651 }
652 }
653 }
654 }
655 /*
656 * At this point we know there's room for our new entry in the block
657 * we're pointing at.
658 */
659 numrecs = be16_to_cpu(block->bb_numrecs);
660 if (level > 0) {
661 /*
662 * It's a non-leaf entry. Make a hole for the new data
663 * in the key and ptr regions of the block.
664 */
665 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
666 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
667#ifdef DEBUG
668 for (i = numrecs; i >= ptr; i--) {
669 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
670 return error;
671 }
672#endif
673 memmove(&kp[ptr], &kp[ptr - 1],
674 (numrecs - ptr + 1) * sizeof(*kp));
675 memmove(&pp[ptr], &pp[ptr - 1],
676 (numrecs - ptr + 1) * sizeof(*pp));
677 /*
678 * Now stuff the new data in, bump numrecs and log the new data.
679 */
680#ifdef DEBUG
681 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
682 return error;
683#endif
684 kp[ptr - 1] = key;
685 pp[ptr - 1] = cpu_to_be32(*bnop);
686 numrecs++;
687 block->bb_numrecs = cpu_to_be16(numrecs);
688 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
689 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
690 } else {
691 /*
692 * It's a leaf entry. Make a hole for the new record.
693 */
694 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
695 memmove(&rp[ptr], &rp[ptr - 1],
696 (numrecs - ptr + 1) * sizeof(*rp));
697 /*
698 * Now stuff the new record in, bump numrecs
699 * and log the new data.
700 */
701 rp[ptr - 1] = *recp;
702 numrecs++;
703 block->bb_numrecs = cpu_to_be16(numrecs);
704 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
705 }
706 /*
707 * Log the new number of records in the btree header.
708 */
709 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
710#ifdef DEBUG
711 /*
712 * Check that the key/record is in the right place, now.
713 */
714 if (ptr < numrecs) {
715 if (level == 0)
716 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
717 rp + ptr);
718 else
719 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
720 kp + ptr);
721 }
722#endif
723 /*
724 * If we inserted at the start of a block, update the parents' keys.
725 */
726 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
727 return error;
728 /*
729 * Return the new block number, if any.
730 * If there is one, give back a record value and a cursor too.
731 */
732 *bnop = nbno;
733 if (nbno != NULLAGBLOCK) {
734 *recp = nrec;
735 *curp = ncur;
736 }
737 *stat = 1; 111 *stat = 1;
738 return 0; 112 return 0;
739} 113}
740 114
741/* 115STATIC int
742 * Log header fields from a btree block. 116xfs_inobt_free_block(
743 */ 117 struct xfs_btree_cur *cur,
744STATIC void 118 struct xfs_buf *bp)
745xfs_inobt_log_block(
746 xfs_trans_t *tp, /* transaction pointer */
747 xfs_buf_t *bp, /* buffer containing btree block */
748 int fields) /* mask of fields: XFS_BB_... */
749{ 119{
750 int first; /* first byte offset logged */ 120 xfs_fsblock_t fsbno;
751 int last; /* last byte offset logged */ 121 int error;
752 static const short offsets[] = { /* table of offsets */
753 offsetof(xfs_inobt_block_t, bb_magic),
754 offsetof(xfs_inobt_block_t, bb_level),
755 offsetof(xfs_inobt_block_t, bb_numrecs),
756 offsetof(xfs_inobt_block_t, bb_leftsib),
757 offsetof(xfs_inobt_block_t, bb_rightsib),
758 sizeof(xfs_inobt_block_t)
759 };
760 122
761 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 123 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
762 xfs_trans_log_buf(tp, bp, first, last); 124 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
125 if (error)
126 return error;
127
128 xfs_trans_binval(cur->bc_tp, bp);
129 return error;
763} 130}
764 131
765/* 132STATIC int
766 * Log keys from a btree block (nonleaf). 133xfs_inobt_get_maxrecs(
767 */ 134 struct xfs_btree_cur *cur,
768STATIC void 135 int level)
769xfs_inobt_log_keys(
770 xfs_btree_cur_t *cur, /* btree cursor */
771 xfs_buf_t *bp, /* buffer containing btree block */
772 int kfirst, /* index of first key to log */
773 int klast) /* index of last key to log */
774{ 136{
775 xfs_inobt_block_t *block; /* btree block to log from */ 137 return cur->bc_mp->m_inobt_mxr[level != 0];
776 int first; /* first byte offset logged */
777 xfs_inobt_key_t *kp; /* key pointer in btree block */
778 int last; /* last byte offset logged */
779
780 block = XFS_BUF_TO_INOBT_BLOCK(bp);
781 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
782 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
783 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
784 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
785} 138}
786 139
787/*
788 * Log block pointer fields from a btree block (nonleaf).
789 */
790STATIC void 140STATIC void
791xfs_inobt_log_ptrs( 141xfs_inobt_init_key_from_rec(
792 xfs_btree_cur_t *cur, /* btree cursor */ 142 union xfs_btree_key *key,
793 xfs_buf_t *bp, /* buffer containing btree block */ 143 union xfs_btree_rec *rec)
794 int pfirst, /* index of first pointer to log */
795 int plast) /* index of last pointer to log */
796{ 144{
797 xfs_inobt_block_t *block; /* btree block to log from */ 145 key->inobt.ir_startino = rec->inobt.ir_startino;
798 int first; /* first byte offset logged */
799 int last; /* last byte offset logged */
800 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
801
802 block = XFS_BUF_TO_INOBT_BLOCK(bp);
803 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
804 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
805 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
806 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
807} 146}
808 147
809/*
810 * Log records from a btree block (leaf).
811 */
812STATIC void 148STATIC void
813xfs_inobt_log_recs( 149xfs_inobt_init_rec_from_key(
814 xfs_btree_cur_t *cur, /* btree cursor */ 150 union xfs_btree_key *key,
815 xfs_buf_t *bp, /* buffer containing btree block */ 151 union xfs_btree_rec *rec)
816 int rfirst, /* index of first record to log */
817 int rlast) /* index of last record to log */
818{ 152{
819 xfs_inobt_block_t *block; /* btree block to log from */ 153 rec->inobt.ir_startino = key->inobt.ir_startino;
820 int first; /* first byte offset logged */ 154}
821 int last; /* last byte offset logged */
822 xfs_inobt_rec_t *rp; /* record pointer for btree block */
823 155
824 block = XFS_BUF_TO_INOBT_BLOCK(bp); 156STATIC void
825 rp = XFS_INOBT_REC_ADDR(block, 1, cur); 157xfs_inobt_init_rec_from_cur(
826 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); 158 struct xfs_btree_cur *cur,
827 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); 159 union xfs_btree_rec *rec)
828 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 160{
161 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
162 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
163 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
829} 164}
830 165
831/* 166/*
832 * Lookup the record. The cursor is made to point to it, based on dir. 167 * intial value of ptr for lookup
833 * Return 0 if can't find any such record, 1 for success.
834 */ 168 */
835STATIC int /* error */ 169STATIC void
836xfs_inobt_lookup( 170xfs_inobt_init_ptr_from_cur(
837 xfs_btree_cur_t *cur, /* btree cursor */ 171 struct xfs_btree_cur *cur,
838 xfs_lookup_t dir, /* <=, ==, or >= */ 172 union xfs_btree_ptr *ptr)
839 int *stat) /* success/failure */
840{ 173{
841 xfs_agblock_t agbno; /* a.g. relative btree block number */ 174 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
842 xfs_agnumber_t agno; /* allocation group number */
843 xfs_inobt_block_t *block=NULL; /* current btree block */
844 __int64_t diff; /* difference for the current key */
845 int error; /* error return value */
846 int keyno=0; /* current key number */
847 int level; /* level in the btree */
848 xfs_mount_t *mp; /* file system mount point */
849
850 /*
851 * Get the allocation group header, and the root block number.
852 */
853 mp = cur->bc_mp;
854 {
855 xfs_agi_t *agi; /* a.g. inode header */
856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root);
860 }
861 /*
862 * Iterate over each level in the btree, starting at the root.
863 * For each level above the leaves, find the key we need, based
864 * on the lookup record, then follow the corresponding block
865 * pointer down to the next level.
866 */
867 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
868 xfs_buf_t *bp; /* buffer pointer for btree block */
869 xfs_daddr_t d; /* disk address of btree block */
870
871 /*
872 * Get the disk address we're looking for.
873 */
874 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
875 /*
876 * If the old buffer at this level is for a different block,
877 * throw it away, otherwise just use it.
878 */
879 bp = cur->bc_bufs[level];
880 if (bp && XFS_BUF_ADDR(bp) != d)
881 bp = NULL;
882 if (!bp) {
883 /*
884 * Need to get a new buffer. Read it, then
885 * set it in the cursor, releasing the old one.
886 */
887 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
888 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
889 return error;
890 xfs_btree_setbuf(cur, level, bp);
891 /*
892 * Point to the btree block, now that we have the buffer
893 */
894 block = XFS_BUF_TO_INOBT_BLOCK(bp);
895 if ((error = xfs_btree_check_sblock(cur, block, level,
896 bp)))
897 return error;
898 } else
899 block = XFS_BUF_TO_INOBT_BLOCK(bp);
900 /*
901 * If we already had a key match at a higher level, we know
902 * we need to use the first entry in this block.
903 */
904 if (diff == 0)
905 keyno = 1;
906 /*
907 * Otherwise we need to search this block. Do a binary search.
908 */
909 else {
910 int high; /* high entry number */
911 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
912 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
913 int low; /* low entry number */
914 175
915 /* 176 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
916 * Get a pointer to keys or records.
917 */
918 if (level > 0)
919 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
920 else
921 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
922 /*
923 * Set low and high entry numbers, 1-based.
924 */
925 low = 1;
926 if (!(high = be16_to_cpu(block->bb_numrecs))) {
927 /*
928 * If the block is empty, the tree must
929 * be an empty leaf.
930 */
931 ASSERT(level == 0 && cur->bc_nlevels == 1);
932 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
933 *stat = 0;
934 return 0;
935 }
936 /*
937 * Binary search the block.
938 */
939 while (low <= high) {
940 xfs_agino_t startino; /* key value */
941
942 /*
943 * keyno is average of low and high.
944 */
945 keyno = (low + high) >> 1;
946 /*
947 * Get startino.
948 */
949 if (level > 0) {
950 xfs_inobt_key_t *kkp;
951
952 kkp = kkbase + keyno - 1;
953 startino = be32_to_cpu(kkp->ir_startino);
954 } else {
955 xfs_inobt_rec_t *krp;
956
957 krp = krbase + keyno - 1;
958 startino = be32_to_cpu(krp->ir_startino);
959 }
960 /*
961 * Compute difference to get next direction.
962 */
963 diff = (__int64_t)
964 startino - cur->bc_rec.i.ir_startino;
965 /*
966 * Less than, move right.
967 */
968 if (diff < 0)
969 low = keyno + 1;
970 /*
971 * Greater than, move left.
972 */
973 else if (diff > 0)
974 high = keyno - 1;
975 /*
976 * Equal, we're done.
977 */
978 else
979 break;
980 }
981 }
982 /*
983 * If there are more levels, set up for the next level
984 * by getting the block number and filling in the cursor.
985 */
986 if (level > 0) {
987 /*
988 * If we moved left, need the previous key number,
989 * unless there isn't one.
990 */
991 if (diff > 0 && --keyno < 1)
992 keyno = 1;
993 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
994#ifdef DEBUG
995 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
996 return error;
997#endif
998 cur->bc_ptrs[level] = keyno;
999 }
1000 }
1001 /*
1002 * Done with the search.
1003 * See if we need to adjust the results.
1004 */
1005 if (dir != XFS_LOOKUP_LE && diff < 0) {
1006 keyno++;
1007 /*
1008 * If ge search and we went off the end of the block, but it's
1009 * not the last block, we're in the wrong block.
1010 */
1011 if (dir == XFS_LOOKUP_GE &&
1012 keyno > be16_to_cpu(block->bb_numrecs) &&
1013 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1014 int i;
1015 177
1016 cur->bc_ptrs[0] = keyno; 178 ptr->s = agi->agi_root;
1017 if ((error = xfs_inobt_increment(cur, 0, &i)))
1018 return error;
1019 ASSERT(i == 1);
1020 *stat = 1;
1021 return 0;
1022 }
1023 }
1024 else if (dir == XFS_LOOKUP_LE && diff > 0)
1025 keyno--;
1026 cur->bc_ptrs[0] = keyno;
1027 /*
1028 * Return if we succeeded or not.
1029 */
1030 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1031 *stat = 0;
1032 else
1033 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1034 return 0;
1035} 179}
1036 180
1037/* 181STATIC __int64_t
1038 * Move 1 record left from cur/level if possible. 182xfs_inobt_key_diff(
1039 * Update cur to reflect the new path. 183 struct xfs_btree_cur *cur,
1040 */ 184 union xfs_btree_key *key)
1041STATIC int /* error */
1042xfs_inobt_lshift(
1043 xfs_btree_cur_t *cur, /* btree cursor */
1044 int level, /* level to shift record on */
1045 int *stat) /* success/failure */
1046{ 185{
1047 int error; /* error return value */ 186 return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
1048#ifdef DEBUG 187 cur->bc_rec.i.ir_startino;
1049 int i; /* loop index */
1050#endif
1051 xfs_inobt_key_t key; /* key value for leaf level upward */
1052 xfs_buf_t *lbp; /* buffer for left neighbor block */
1053 xfs_inobt_block_t *left; /* left neighbor btree block */
1054 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1055 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1056 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1057 int nrec; /* new number of left block entries */
1058 xfs_buf_t *rbp; /* buffer for right (current) block */
1059 xfs_inobt_block_t *right; /* right (current) btree block */
1060 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1061 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1062 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1063
1064 /*
1065 * Set up variables for this block as "right".
1066 */
1067 rbp = cur->bc_bufs[level];
1068 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1069#ifdef DEBUG
1070 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1071 return error;
1072#endif
1073 /*
1074 * If we've got no left sibling then we can't shift an entry left.
1075 */
1076 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1077 *stat = 0;
1078 return 0;
1079 }
1080 /*
1081 * If the cursor entry is the one that would be moved, don't
1082 * do it... it's too complicated.
1083 */
1084 if (cur->bc_ptrs[level] <= 1) {
1085 *stat = 0;
1086 return 0;
1087 }
1088 /*
1089 * Set up the left neighbor as "left".
1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1096 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1097 return error;
1098 /*
1099 * If it's full, it can't take another entry.
1100 */
1101 if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1102 *stat = 0;
1103 return 0;
1104 }
1105 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1106 /*
1107 * If non-leaf, copy a key and a ptr to the left block.
1108 */
1109 if (level > 0) {
1110 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1111 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1112 *lkp = *rkp;
1113 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1114 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1115 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1116#ifdef DEBUG
1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1118 return error;
1119#endif
1120 *lpp = *rpp;
1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1122 }
1123 /*
1124 * If leaf, copy a record to the left block.
1125 */
1126 else {
1127 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1128 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1129 *lrp = *rrp;
1130 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1131 }
1132 /*
1133 * Bump and log left's numrecs, decrement and log right's numrecs.
1134 */
1135 be16_add_cpu(&left->bb_numrecs, 1);
1136 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1137#ifdef DEBUG
1138 if (level > 0)
1139 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1140 else
1141 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1142#endif
1143 be16_add_cpu(&right->bb_numrecs, -1);
1144 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1145 /*
1146 * Slide the contents of right down one entry.
1147 */
1148 if (level > 0) {
1149#ifdef DEBUG
1150 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1151 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1152 level)))
1153 return error;
1154 }
1155#endif
1156 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1157 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1158 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1159 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1160 } else {
1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1163 key.ir_startino = rrp->ir_startino;
1164 rkp = &key;
1165 }
1166 /*
1167 * Update the parent key values of right.
1168 */
1169 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1170 return error;
1171 /*
1172 * Slide the cursor value left one.
1173 */
1174 cur->bc_ptrs[level]--;
1175 *stat = 1;
1176 return 0;
1177} 188}
1178 189
1179/* 190STATIC int
1180 * Allocate a new root block, fill it in. 191xfs_inobt_kill_root(
1181 */ 192 struct xfs_btree_cur *cur,
1182STATIC int /* error */ 193 struct xfs_buf *bp,
1183xfs_inobt_newroot( 194 int level,
1184 xfs_btree_cur_t *cur, /* btree cursor */ 195 union xfs_btree_ptr *newroot)
1185 int *stat) /* success/failure */
1186{ 196{
1187 xfs_agi_t *agi; /* a.g. inode header */ 197 int error;
1188 xfs_alloc_arg_t args; /* allocation argument structure */
1189 xfs_inobt_block_t *block; /* one half of the old root block */
1190 xfs_buf_t *bp; /* buffer containing block */
1191 int error; /* error return value */
1192 xfs_inobt_key_t *kp; /* btree key pointer */
1193 xfs_agblock_t lbno; /* left block number */
1194 xfs_buf_t *lbp; /* left buffer pointer */
1195 xfs_inobt_block_t *left; /* left btree block */
1196 xfs_buf_t *nbp; /* new (root) buffer */
1197 xfs_inobt_block_t *new; /* new (root) btree block */
1198 int nptr; /* new value for key index, 1 or 2 */
1199 xfs_inobt_ptr_t *pp; /* btree address pointer */
1200 xfs_agblock_t rbno; /* right block number */
1201 xfs_buf_t *rbp; /* right buffer pointer */
1202 xfs_inobt_block_t *right; /* right btree block */
1203 xfs_inobt_rec_t *rp; /* btree record pointer */
1204 198
1205 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); 199 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
200 XFS_BTREE_STATS_INC(cur, killroot);
1206 201
1207 /* 202 /*
1208 * Get a block & a buffer. 203 * Update the root pointer, decreasing the level by 1 and then
204 * free the old root.
1209 */ 205 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); 206 xfs_inobt_set_root(cur, newroot, -1);
1211 args.tp = cur->bc_tp; 207 error = xfs_inobt_free_block(cur, bp);
1212 args.mp = cur->bc_mp; 208 if (error) {
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, 209 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0;
1217 args.minlen = args.maxlen = args.prod = 1;
1218 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1219 if ((error = xfs_alloc_vextent(&args)))
1220 return error; 210 return error;
1221 /*
1222 * None available, we fail.
1223 */
1224 if (args.fsbno == NULLFSBLOCK) {
1225 *stat = 0;
1226 return 0;
1227 }
1228 ASSERT(args.len == 1);
1229 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1230 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1231 /*
1232 * Set the root data in the a.g. inode structure.
1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /*
1239 * At the previous root level there are now two blocks: the old
1240 * root, and the new block generated when it was split.
1241 * We don't know which one the cursor is pointing at, so we
1242 * set up variables "left" and "right" for each case.
1243 */
1244 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1245 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1246#ifdef DEBUG
1247 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1248 return error;
1249#endif
1250 if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1251 /*
1252 * Our block is left, pick up the right block.
1253 */
1254 lbp = bp;
1255 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1256 left = block;
1257 rbno = be32_to_cpu(left->bb_rightsib);
1258 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1259 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1260 return error;
1261 bp = rbp;
1262 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1263 if ((error = xfs_btree_check_sblock(cur, right,
1264 cur->bc_nlevels - 1, rbp)))
1265 return error;
1266 nptr = 1;
1267 } else {
1268 /*
1269 * Our block is right, pick up the left block.
1270 */
1271 rbp = bp;
1272 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1273 right = block;
1274 lbno = be32_to_cpu(right->bb_leftsib);
1275 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1276 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1277 return error;
1278 bp = lbp;
1279 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1280 if ((error = xfs_btree_check_sblock(cur, left,
1281 cur->bc_nlevels - 1, lbp)))
1282 return error;
1283 nptr = 2;
1284 }
1285 /*
1286 * Fill in the new block's btree header and log it.
1287 */
1288 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1289 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1290 new->bb_numrecs = cpu_to_be16(2);
1291 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1292 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1293 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1294 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1295 /*
1296 * Fill in the key data in the new root.
1297 */
1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1299 if (be16_to_cpu(left->bb_level) > 0) {
1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
1302 } else {
1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1304 kp[0].ir_startino = rp->ir_startino;
1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1306 kp[1].ir_startino = rp->ir_startino;
1307 } 211 }
1308 xfs_inobt_log_keys(cur, nbp, 1, 2);
1309 /*
1310 * Fill in the pointer data in the new root.
1311 */
1312 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1313 pp[0] = cpu_to_be32(lbno);
1314 pp[1] = cpu_to_be32(rbno);
1315 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1316 /*
1317 * Fix up the cursor.
1318 */
1319 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1320 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1321 cur->bc_nlevels++;
1322 *stat = 1;
1323 return 0;
1324}
1325 212
1326/* 213 XFS_BTREE_STATS_INC(cur, free);
1327 * Move 1 record right from cur/level if possible.
1328 * Update cur to reflect the new path.
1329 */
1330STATIC int /* error */
1331xfs_inobt_rshift(
1332 xfs_btree_cur_t *cur, /* btree cursor */
1333 int level, /* level to shift record on */
1334 int *stat) /* success/failure */
1335{
1336 int error; /* error return value */
1337 int i; /* loop index */
1338 xfs_inobt_key_t key; /* key value for leaf level upward */
1339 xfs_buf_t *lbp; /* buffer for left (current) block */
1340 xfs_inobt_block_t *left; /* left (current) btree block */
1341 xfs_inobt_key_t *lkp; /* key pointer for left block */
1342 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1343 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1344 xfs_buf_t *rbp; /* buffer for right neighbor block */
1345 xfs_inobt_block_t *right; /* right neighbor btree block */
1346 xfs_inobt_key_t *rkp; /* key pointer for right block */
1347 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1348 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1349 xfs_btree_cur_t *tcur; /* temporary cursor */
1350 214
1351 /* 215 cur->bc_bufs[level] = NULL;
1352 * Set up variables for this block as "left". 216 cur->bc_nlevels--;
1353 */ 217
1354 lbp = cur->bc_bufs[level]; 218 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1355 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1356#ifdef DEBUG
1357 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1358 return error;
1359#endif
1360 /*
1361 * If we've got no right sibling then we can't shift an entry right.
1362 */
1363 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1364 *stat = 0;
1365 return 0;
1366 }
1367 /*
1368 * If the cursor entry is the one that would be moved, don't
1369 * do it... it's too complicated.
1370 */
1371 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1372 *stat = 0;
1373 return 0;
1374 }
1375 /*
1376 * Set up the right neighbor as "right".
1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1383 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1384 return error;
1385 /*
1386 * If it's full, it can't take another entry.
1387 */
1388 if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1389 *stat = 0;
1390 return 0;
1391 }
1392 /*
1393 * Make a hole at the start of the right neighbor block, then
1394 * copy the last left block entry to the hole.
1395 */
1396 if (level > 0) {
1397 lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1398 lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1399 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1400 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1401#ifdef DEBUG
1402 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1403 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1404 return error;
1405 }
1406#endif
1407 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1408 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1409#ifdef DEBUG
1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1411 return error;
1412#endif
1413 *rkp = *lkp;
1414 *rpp = *lpp;
1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1417 } else {
1418 lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1419 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1421 *rrp = *lrp;
1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1423 key.ir_startino = rrp->ir_startino;
1424 rkp = &key;
1425 }
1426 /*
1427 * Decrement and log left's numrecs, bump and log right's numrecs.
1428 */
1429 be16_add_cpu(&left->bb_numrecs, -1);
1430 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1431 be16_add_cpu(&right->bb_numrecs, 1);
1432#ifdef DEBUG
1433 if (level > 0)
1434 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1435 else
1436 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1437#endif
1438 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1439 /*
1440 * Using a temporary cursor, update the parent key values of the
1441 * block on the right.
1442 */
1443 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1444 return error;
1445 xfs_btree_lastrec(tcur, level);
1446 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1447 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1448 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1449 return error;
1450 }
1451 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1452 *stat = 1;
1453 return 0; 219 return 0;
1454} 220}
1455 221
1456/*
1457 * Split cur/level block in half.
1458 * Return new block number and its first record (to be inserted into parent).
1459 */
1460STATIC int /* error */
1461xfs_inobt_split(
1462 xfs_btree_cur_t *cur, /* btree cursor */
1463 int level, /* level to split */
1464 xfs_agblock_t *bnop, /* output: block number allocated */
1465 xfs_inobt_key_t *keyp, /* output: first key of new block */
1466 xfs_btree_cur_t **curp, /* output: new cursor */
1467 int *stat) /* success/failure */
1468{
1469 xfs_alloc_arg_t args; /* allocation argument structure */
1470 int error; /* error return value */
1471 int i; /* loop index/record number */
1472 xfs_agblock_t lbno; /* left (current) block number */
1473 xfs_buf_t *lbp; /* buffer for left block */
1474 xfs_inobt_block_t *left; /* left (current) btree block */
1475 xfs_inobt_key_t *lkp; /* left btree key pointer */
1476 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1477 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1478 xfs_buf_t *rbp; /* buffer for right block */
1479 xfs_inobt_block_t *right; /* right (new) btree block */
1480 xfs_inobt_key_t *rkp; /* right btree key pointer */
1481 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1482 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1483
1484 /*
1485 * Set up left block (current one).
1486 */
1487 lbp = cur->bc_bufs[level];
1488 args.tp = cur->bc_tp;
1489 args.mp = cur->bc_mp;
1490 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1491 /*
1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up.
1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1;
1499 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1500 if ((error = xfs_alloc_vextent(&args)))
1501 return error;
1502 if (args.fsbno == NULLFSBLOCK) {
1503 *stat = 0;
1504 return 0;
1505 }
1506 ASSERT(args.len == 1);
1507 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1508 /*
1509 * Set up the new block as "right".
1510 */
1511 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1512 /*
1513 * "Left" is the current (according to the cursor) block.
1514 */
1515 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1516#ifdef DEBUG 222#ifdef DEBUG
1517 if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) 223STATIC int
1518 return error; 224xfs_inobt_keys_inorder(
1519#endif 225 struct xfs_btree_cur *cur,
1520 /* 226 union xfs_btree_key *k1,
1521 * Fill in the btree header for the new block. 227 union xfs_btree_key *k2)
1522 */ 228{
1523 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 229 return be32_to_cpu(k1->inobt.ir_startino) <
1524 right->bb_level = left->bb_level; 230 be32_to_cpu(k2->inobt.ir_startino);
1525 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1526 /*
1527 * Make sure that if there's an odd number of entries now, that
1528 * each new block will have the same number of entries.
1529 */
1530 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1531 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1532 be16_add_cpu(&right->bb_numrecs, 1);
1533 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1534 /*
1535 * For non-leaf blocks, copy keys and addresses over to the new block.
1536 */
1537 if (level > 0) {
1538 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1539 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1540 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1541 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1542#ifdef DEBUG
1543 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1544 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1545 return error;
1546 }
1547#endif
1548 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1549 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1550 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1551 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1552 *keyp = *rkp;
1553 }
1554 /*
1555 * For leaf blocks, copy records over to the new block.
1556 */
1557 else {
1558 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1562 keyp->ir_startino = rrp->ir_startino;
1563 }
1564 /*
1565 * Find the left block number by looking in the buffer.
1566 * Adjust numrecs, sibling pointers.
1567 */
1568 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1569 right->bb_rightsib = left->bb_rightsib;
1570 left->bb_rightsib = cpu_to_be32(args.agbno);
1571 right->bb_leftsib = cpu_to_be32(lbno);
1572 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1573 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1574 /*
1575 * If there's a block to the new block's right, make that block
1576 * point back to right instead of to left.
1577 */
1578 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1579 xfs_inobt_block_t *rrblock; /* rr btree block */
1580 xfs_buf_t *rrbp; /* buffer for rrblock */
1581
1582 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1583 be32_to_cpu(right->bb_rightsib), 0, &rrbp,
1584 XFS_INO_BTREE_REF)))
1585 return error;
1586 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1587 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1588 return error;
1589 rrblock->bb_leftsib = cpu_to_be32(args.agbno);
1590 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1591 }
1592 /*
1593 * If the cursor is really in the right block, move it there.
1594 * If it's just pointing past the last entry in left, then we'll
1595 * insert there, so don't change anything in that case.
1596 */
1597 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1598 xfs_btree_setbuf(cur, level, rbp);
1599 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1600 }
1601 /*
1602 * If there are more levels, we'll need another cursor which refers
1603 * the right block, no matter where this cursor was.
1604 */
1605 if (level + 1 < cur->bc_nlevels) {
1606 if ((error = xfs_btree_dup_cursor(cur, curp)))
1607 return error;
1608 (*curp)->bc_ptrs[level + 1]++;
1609 }
1610 *bnop = args.agbno;
1611 *stat = 1;
1612 return 0;
1613} 231}
1614 232
1615/* 233STATIC int
1616 * Update keys at all levels from here to the root along the cursor's path. 234xfs_inobt_recs_inorder(
1617 */ 235 struct xfs_btree_cur *cur,
1618STATIC int /* error */ 236 union xfs_btree_rec *r1,
1619xfs_inobt_updkey( 237 union xfs_btree_rec *r2)
1620 xfs_btree_cur_t *cur, /* btree cursor */
1621 xfs_inobt_key_t *keyp, /* new key value to update to */
1622 int level) /* starting level for update */
1623{ 238{
1624 int ptr; /* index of key in block */ 239 return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
1625 240 be32_to_cpu(r2->inobt.ir_startino);
1626 /*
1627 * Go up the tree from this level toward the root.
1628 * At each level, update the key value to the value input.
1629 * Stop when we reach a level where the cursor isn't pointing
1630 * at the first entry in the block.
1631 */
1632 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1633 xfs_buf_t *bp; /* buffer for block */
1634 xfs_inobt_block_t *block; /* btree block */
1635#ifdef DEBUG
1636 int error; /* error return value */
1637#endif
1638 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1639
1640 bp = cur->bc_bufs[level];
1641 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1642#ifdef DEBUG
1643 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1644 return error;
1645#endif
1646 ptr = cur->bc_ptrs[level];
1647 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1648 *kp = *keyp;
1649 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1650 }
1651 return 0;
1652} 241}
242#endif /* DEBUG */
1653 243
1654/* 244#ifdef XFS_BTREE_TRACE
1655 * Externally visible routines. 245ktrace_t *xfs_inobt_trace_buf;
1656 */
1657 246
1658/* 247STATIC void
1659 * Decrement cursor by one record at the level. 248xfs_inobt_trace_enter(
1660 * For nonzero levels the leaf-ward information is untouched. 249 struct xfs_btree_cur *cur,
1661 */ 250 const char *func,
1662int /* error */ 251 char *s,
1663xfs_inobt_decrement( 252 int type,
1664 xfs_btree_cur_t *cur, /* btree cursor */ 253 int line,
1665 int level, /* level in btree, 0 is leaf */ 254 __psunsigned_t a0,
1666 int *stat) /* success/failure */ 255 __psunsigned_t a1,
256 __psunsigned_t a2,
257 __psunsigned_t a3,
258 __psunsigned_t a4,
259 __psunsigned_t a5,
260 __psunsigned_t a6,
261 __psunsigned_t a7,
262 __psunsigned_t a8,
263 __psunsigned_t a9,
264 __psunsigned_t a10)
1667{ 265{
1668 xfs_inobt_block_t *block; /* btree block */ 266 ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
1669 int error; 267 (void *)func, (void *)s, NULL, (void *)cur,
1670 int lev; /* btree level */ 268 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1671 269 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1672 ASSERT(level < cur->bc_nlevels); 270 (void *)a8, (void *)a9, (void *)a10);
1673 /*
1674 * Read-ahead to the left at this level.
1675 */
1676 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1677 /*
1678 * Decrement the ptr at this level. If we're still in the block
1679 * then we're done.
1680 */
1681 if (--cur->bc_ptrs[level] > 0) {
1682 *stat = 1;
1683 return 0;
1684 }
1685 /*
1686 * Get a pointer to the btree block.
1687 */
1688 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1689#ifdef DEBUG
1690 if ((error = xfs_btree_check_sblock(cur, block, level,
1691 cur->bc_bufs[level])))
1692 return error;
1693#endif
1694 /*
1695 * If we just went off the left edge of the tree, return failure.
1696 */
1697 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1698 *stat = 0;
1699 return 0;
1700 }
1701 /*
1702 * March up the tree decrementing pointers.
1703 * Stop when we don't go off the left edge of a block.
1704 */
1705 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1706 if (--cur->bc_ptrs[lev] > 0)
1707 break;
1708 /*
1709 * Read-ahead the left block, we're going to read it
1710 * in the next loop.
1711 */
1712 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1713 }
1714 /*
1715 * If we went off the root then we are seriously confused.
1716 */
1717 ASSERT(lev < cur->bc_nlevels);
1718 /*
1719 * Now walk back down the tree, fixing up the cursor's buffer
1720 * pointers and key numbers.
1721 */
1722 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1723 xfs_agblock_t agbno; /* block number of btree block */
1724 xfs_buf_t *bp; /* buffer containing btree block */
1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF)))
1730 return error;
1731 lev--;
1732 xfs_btree_setbuf(cur, lev, bp);
1733 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1734 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1735 return error;
1736 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1737 }
1738 *stat = 1;
1739 return 0;
1740} 271}
1741 272
1742/* 273STATIC void
1743 * Delete the record pointed to by cur. 274xfs_inobt_trace_cursor(
1744 * The cursor refers to the place where the record was (could be inserted) 275 struct xfs_btree_cur *cur,
1745 * when the operation returns. 276 __uint32_t *s0,
1746 */ 277 __uint64_t *l0,
1747int /* error */ 278 __uint64_t *l1)
1748xfs_inobt_delete(
1749 xfs_btree_cur_t *cur, /* btree cursor */
1750 int *stat) /* success/failure */
1751{ 279{
1752 int error; 280 *s0 = cur->bc_private.a.agno;
1753 int i; /* result code */ 281 *l0 = cur->bc_rec.i.ir_startino;
1754 int level; /* btree level */ 282 *l1 = cur->bc_rec.i.ir_free;
1755
1756 /*
1757 * Go up the tree, starting at leaf level.
1758 * If 2 is returned then a join was done; go to the next level.
1759 * Otherwise we are done.
1760 */
1761 for (level = 0, i = 2; i == 2; level++) {
1762 if ((error = xfs_inobt_delrec(cur, level, &i)))
1763 return error;
1764 }
1765 if (i == 0) {
1766 for (level = 1; level < cur->bc_nlevels; level++) {
1767 if (cur->bc_ptrs[level] == 0) {
1768 if ((error = xfs_inobt_decrement(cur, level, &i)))
1769 return error;
1770 break;
1771 }
1772 }
1773 }
1774 *stat = i;
1775 return 0;
1776} 283}
1777 284
1778 285STATIC void
1779/* 286xfs_inobt_trace_key(
1780 * Get the data from the pointed-to record. 287 struct xfs_btree_cur *cur,
1781 */ 288 union xfs_btree_key *key,
1782int /* error */ 289 __uint64_t *l0,
1783xfs_inobt_get_rec( 290 __uint64_t *l1)
1784 xfs_btree_cur_t *cur, /* btree cursor */
1785 xfs_agino_t *ino, /* output: starting inode of chunk */
1786 __int32_t *fcnt, /* output: number of free inodes */
1787 xfs_inofree_t *free, /* output: free inode mask */
1788 int *stat) /* output: success/failure */
1789{ 291{
1790 xfs_inobt_block_t *block; /* btree block */ 292 *l0 = be32_to_cpu(key->inobt.ir_startino);
1791 xfs_buf_t *bp; /* buffer containing btree block */ 293 *l1 = 0;
1792#ifdef DEBUG
1793 int error; /* error return value */
1794#endif
1795 int ptr; /* record number */
1796 xfs_inobt_rec_t *rec; /* record data */
1797
1798 bp = cur->bc_bufs[0];
1799 ptr = cur->bc_ptrs[0];
1800 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1801#ifdef DEBUG
1802 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1803 return error;
1804#endif
1805 /*
1806 * Off the right end or left end, return failure.
1807 */
1808 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1809 *stat = 0;
1810 return 0;
1811 }
1812 /*
1813 * Point to the record and extract its data.
1814 */
1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1816 *ino = be32_to_cpu(rec->ir_startino);
1817 *fcnt = be32_to_cpu(rec->ir_freecount);
1818 *free = be64_to_cpu(rec->ir_free);
1819 *stat = 1;
1820 return 0;
1821} 294}
1822 295
1823/* 296STATIC void
1824 * Increment cursor by one record at the level. 297xfs_inobt_trace_record(
1825 * For nonzero levels the leaf-ward information is untouched. 298 struct xfs_btree_cur *cur,
1826 */ 299 union xfs_btree_rec *rec,
1827int /* error */ 300 __uint64_t *l0,
1828xfs_inobt_increment( 301 __uint64_t *l1,
1829 xfs_btree_cur_t *cur, /* btree cursor */ 302 __uint64_t *l2)
1830 int level, /* level in btree, 0 is leaf */
1831 int *stat) /* success/failure */
1832{ 303{
1833 xfs_inobt_block_t *block; /* btree block */ 304 *l0 = be32_to_cpu(rec->inobt.ir_startino);
1834 xfs_buf_t *bp; /* buffer containing btree block */ 305 *l1 = be32_to_cpu(rec->inobt.ir_freecount);
1835 int error; /* error return value */ 306 *l2 = be64_to_cpu(rec->inobt.ir_free);
1836 int lev; /* btree level */ 307}
308#endif /* XFS_BTREE_TRACE */
309
310static const struct xfs_btree_ops xfs_inobt_ops = {
311 .rec_len = sizeof(xfs_inobt_rec_t),
312 .key_len = sizeof(xfs_inobt_key_t),
313
314 .dup_cursor = xfs_inobt_dup_cursor,
315 .set_root = xfs_inobt_set_root,
316 .kill_root = xfs_inobt_kill_root,
317 .alloc_block = xfs_inobt_alloc_block,
318 .free_block = xfs_inobt_free_block,
319 .get_minrecs = xfs_inobt_get_minrecs,
320 .get_maxrecs = xfs_inobt_get_maxrecs,
321 .init_key_from_rec = xfs_inobt_init_key_from_rec,
322 .init_rec_from_key = xfs_inobt_init_rec_from_key,
323 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
324 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
325 .key_diff = xfs_inobt_key_diff,
1837 326
1838 ASSERT(level < cur->bc_nlevels);
1839 /*
1840 * Read-ahead to the right at this level.
1841 */
1842 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1843 /*
1844 * Get a pointer to the btree block.
1845 */
1846 bp = cur->bc_bufs[level];
1847 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1848#ifdef DEBUG
1849 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1850 return error;
1851#endif
1852 /*
1853 * Increment the ptr at this level. If we're still in the block
1854 * then we're done.
1855 */
1856 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1857 *stat = 1;
1858 return 0;
1859 }
1860 /*
1861 * If we just went off the right edge of the tree, return failure.
1862 */
1863 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1864 *stat = 0;
1865 return 0;
1866 }
1867 /*
1868 * March up the tree incrementing pointers.
1869 * Stop when we don't go off the right edge of a block.
1870 */
1871 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1872 bp = cur->bc_bufs[lev];
1873 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1874#ifdef DEBUG 327#ifdef DEBUG
1875 if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) 328 .keys_inorder = xfs_inobt_keys_inorder,
1876 return error; 329 .recs_inorder = xfs_inobt_recs_inorder,
1877#endif 330#endif
1878 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
1879 break;
1880 /*
1881 * Read-ahead the right block, we're going to read it
1882 * in the next loop.
1883 */
1884 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1885 }
1886 /*
1887 * If we went off the root then we are seriously confused.
1888 */
1889 ASSERT(lev < cur->bc_nlevels);
1890 /*
1891 * Now walk back down the tree, fixing up the cursor's buffer
1892 * pointers and key numbers.
1893 */
1894 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1895 lev > level; ) {
1896 xfs_agblock_t agbno; /* block number of btree block */
1897 331
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 332#ifdef XFS_BTREE_TRACE
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 333 .trace_enter = xfs_inobt_trace_enter,
1900 cur->bc_private.a.agno, agbno, 0, &bp, 334 .trace_cursor = xfs_inobt_trace_cursor,
1901 XFS_INO_BTREE_REF))) 335 .trace_key = xfs_inobt_trace_key,
1902 return error; 336 .trace_record = xfs_inobt_trace_record,
1903 lev--; 337#endif
1904 xfs_btree_setbuf(cur, lev, bp); 338};
1905 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1906 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1907 return error;
1908 cur->bc_ptrs[lev] = 1;
1909 }
1910 *stat = 1;
1911 return 0;
1912}
1913 339
1914/* 340/*
1915 * Insert the current record at the point referenced by cur. 341 * Allocate a new inode btree cursor.
1916 * The cursor may be inconsistent on return if splits have been done.
1917 */ 342 */
1918int /* error */ 343struct xfs_btree_cur * /* new inode btree cursor */
1919xfs_inobt_insert( 344xfs_inobt_init_cursor(
1920 xfs_btree_cur_t *cur, /* btree cursor */ 345 struct xfs_mount *mp, /* file system mount point */
1921 int *stat) /* success/failure */ 346 struct xfs_trans *tp, /* transaction pointer */
347 struct xfs_buf *agbp, /* buffer for agi structure */
348 xfs_agnumber_t agno) /* allocation group number */
1922{ 349{
1923 int error; /* error return value */ 350 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1924 int i; /* result value, 0 for failure */ 351 struct xfs_btree_cur *cur;
1925 int level; /* current level number in btree */
1926 xfs_agblock_t nbno; /* new block number (split result) */
1927 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1928 xfs_inobt_rec_t nrec; /* record being inserted this level */
1929 xfs_btree_cur_t *pcur; /* previous level's cursor */
1930 352
1931 level = 0; 353 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
1932 nbno = NULLAGBLOCK;
1933 nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
1934 nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
1935 nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
1936 ncur = NULL;
1937 pcur = cur;
1938 /*
1939 * Loop going up the tree, starting at the leaf level.
1940 * Stop when we don't get a split block, that must mean that
1941 * the insert is finished with this level.
1942 */
1943 do {
1944 /*
1945 * Insert nrec/nbno into this level of the tree.
1946 * Note if we fail, nbno will be null.
1947 */
1948 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1949 &i))) {
1950 if (pcur != cur)
1951 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1952 return error;
1953 }
1954 /*
1955 * See if the cursor we just used is trash.
1956 * Can't trash the caller's cursor, but otherwise we should
1957 * if ncur is a new cursor or we're about to be done.
1958 */
1959 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1960 cur->bc_nlevels = pcur->bc_nlevels;
1961 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1962 }
1963 /*
1964 * If we got a new cursor, switch to it.
1965 */
1966 if (ncur) {
1967 pcur = ncur;
1968 ncur = NULL;
1969 }
1970 } while (nbno != NULLAGBLOCK);
1971 *stat = i;
1972 return 0;
1973}
1974 354
1975/* 355 cur->bc_tp = tp;
1976 * Lookup the record equal to ino in the btree given by cur. 356 cur->bc_mp = mp;
1977 */ 357 cur->bc_nlevels = be32_to_cpu(agi->agi_level);
1978int /* error */ 358 cur->bc_btnum = XFS_BTNUM_INO;
1979xfs_inobt_lookup_eq( 359 cur->bc_blocklog = mp->m_sb.sb_blocklog;
1980 xfs_btree_cur_t *cur, /* btree cursor */
1981 xfs_agino_t ino, /* starting inode of chunk */
1982 __int32_t fcnt, /* free inode count */
1983 xfs_inofree_t free, /* free inode mask */
1984 int *stat) /* success/failure */
1985{
1986 cur->bc_rec.i.ir_startino = ino;
1987 cur->bc_rec.i.ir_freecount = fcnt;
1988 cur->bc_rec.i.ir_free = free;
1989 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
1990}
1991 360
1992/* 361 cur->bc_ops = &xfs_inobt_ops;
1993 * Lookup the first record greater than or equal to ino
1994 * in the btree given by cur.
1995 */
1996int /* error */
1997xfs_inobt_lookup_ge(
1998 xfs_btree_cur_t *cur, /* btree cursor */
1999 xfs_agino_t ino, /* starting inode of chunk */
2000 __int32_t fcnt, /* free inode count */
2001 xfs_inofree_t free, /* free inode mask */
2002 int *stat) /* success/failure */
2003{
2004 cur->bc_rec.i.ir_startino = ino;
2005 cur->bc_rec.i.ir_freecount = fcnt;
2006 cur->bc_rec.i.ir_free = free;
2007 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2008}
2009 362
2010/* 363 cur->bc_private.a.agbp = agbp;
2011 * Lookup the first record less than or equal to ino 364 cur->bc_private.a.agno = agno;
2012 * in the btree given by cur. 365
2013 */ 366 return cur;
2014int /* error */
2015xfs_inobt_lookup_le(
2016 xfs_btree_cur_t *cur, /* btree cursor */
2017 xfs_agino_t ino, /* starting inode of chunk */
2018 __int32_t fcnt, /* free inode count */
2019 xfs_inofree_t free, /* free inode mask */
2020 int *stat) /* success/failure */
2021{
2022 cur->bc_rec.i.ir_startino = ino;
2023 cur->bc_rec.i.ir_freecount = fcnt;
2024 cur->bc_rec.i.ir_free = free;
2025 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2026} 367}
2027 368
2028/* 369/*
2029 * Update the record referred to by cur, to the value given 370 * Calculate number of records in an inobt btree block.
2030 * by [ino, fcnt, free].
2031 * This either works (return 0) or gets an EFSCORRUPTED error.
2032 */ 371 */
2033int /* error */ 372int
2034xfs_inobt_update( 373xfs_inobt_maxrecs(
2035 xfs_btree_cur_t *cur, /* btree cursor */ 374 struct xfs_mount *mp,
2036 xfs_agino_t ino, /* starting inode of chunk */ 375 int blocklen,
2037 __int32_t fcnt, /* free inode count */ 376 int leaf)
2038 xfs_inofree_t free) /* free inode mask */
2039{ 377{
2040 xfs_inobt_block_t *block; /* btree block to update */ 378 blocklen -= XFS_INOBT_BLOCK_LEN(mp);
2041 xfs_buf_t *bp; /* buffer containing btree block */
2042 int error; /* error return value */
2043 int ptr; /* current record number (updating) */
2044 xfs_inobt_rec_t *rp; /* pointer to updated record */
2045 379
2046 /* 380 if (leaf)
2047 * Pick up the current block. 381 return blocklen / sizeof(xfs_inobt_rec_t);
2048 */ 382 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
2049 bp = cur->bc_bufs[0];
2050 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2051#ifdef DEBUG
2052 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2053 return error;
2054#endif
2055 /*
2056 * Get the address of the rec to be updated.
2057 */
2058 ptr = cur->bc_ptrs[0];
2059 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2060 /*
2061 * Fill in the new contents and log them.
2062 */
2063 rp->ir_startino = cpu_to_be32(ino);
2064 rp->ir_freecount = cpu_to_be32(fcnt);
2065 rp->ir_free = cpu_to_be64(free);
2066 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2067 /*
2068 * Updating first record in leaf. Pass new key value up to our parent.
2069 */
2070 if (ptr == 1) {
2071 xfs_inobt_key_t key; /* key containing [ino] */
2072
2073 key.ir_startino = cpu_to_be32(ino);
2074 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2075 return error;
2076 }
2077 return 0;
2078} 383}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 8efc4a5b8b9..37e5dd01a57 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,7 +24,6 @@
24 24
25struct xfs_buf; 25struct xfs_buf;
26struct xfs_btree_cur; 26struct xfs_btree_cur;
27struct xfs_btree_sblock;
28struct xfs_mount; 27struct xfs_mount;
29 28
30/* 29/*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
70/* btree pointer type */ 69/* btree pointer type */
71typedef __be32 xfs_inobt_ptr_t; 70typedef __be32 xfs_inobt_ptr_t;
72 71
73/* btree block header type */
74typedef struct xfs_btree_sblock xfs_inobt_block_t;
75
76#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
77
78/* 72/*
79 * Bit manipulations for ir_free. 73 * Bit manipulations for ir_free.
80 */ 74 */
@@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
85#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) 79#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
86 80
87/* 81/*
88 * Real block structures have a size equal to the disk block size.
89 */
90#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
91#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
92#define XFS_INOBT_IS_LAST_REC(cur) \
93 ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
94
95/*
96 * Maximum number of inode btree levels. 82 * Maximum number of inode btree levels.
97 */ 83 */
98#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) 84#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
@@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
104#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) 90#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
105 91
106/* 92/*
107 * Record, key, and pointer address macros for btree blocks. 93 * Btree block header size depends on a superblock flag.
108 */ 94 *
109#define XFS_INOBT_REC_ADDR(bb,i,cur) \ 95 * (not quite yet, but soon)
110 (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
111
112#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
113 (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
114
115#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
116 (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
117 i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
118
119/*
120 * Decrement cursor by one record at the level.
121 * For nonzero levels the leaf-ward information is untouched.
122 */
123extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
124
125/*
126 * Delete the record pointed to by cur.
127 * The cursor refers to the place where the record was (could be inserted)
128 * when the operation returns.
129 */
130extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
131
132/*
133 * Get the data from the pointed-to record.
134 */
135extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
136 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
137
138/*
139 * Increment cursor by one record at the level.
140 * For nonzero levels the leaf-ward information is untouched.
141 */
142extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
143
144/*
145 * Insert the current record at the point referenced by cur.
146 * The cursor may be inconsistent on return if splits have been done.
147 */
148extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
149
150/*
151 * Lookup the record equal to ino in the btree given by cur.
152 */
153extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
154 __int32_t fcnt, xfs_inofree_t free, int *stat);
155
156/*
157 * Lookup the first record greater than or equal to ino
158 * in the btree given by cur.
159 */
160extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
161 __int32_t fcnt, xfs_inofree_t free, int *stat);
162
163/*
164 * Lookup the first record less than or equal to ino
165 * in the btree given by cur.
166 */ 96 */
167extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, 97#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
168 __int32_t fcnt, xfs_inofree_t free, int *stat);
169 98
170/* 99/*
171 * Update the record referred to by cur, to the value given 100 * Record, key, and pointer address macros for btree blocks.
172 * by [ino, fcnt, free]. 101 *
173 * This either works (return 0) or gets an EFSCORRUPTED error. 102 * (note that some of these may appear unused, but they are used in userspace)
174 */ 103 */
175extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino, 104#define XFS_INOBT_REC_ADDR(mp, block, index) \
176 __int32_t fcnt, xfs_inofree_t free); 105 ((xfs_inobt_rec_t *) \
106 ((char *)(block) + \
107 XFS_INOBT_BLOCK_LEN(mp) + \
108 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
109
110#define XFS_INOBT_KEY_ADDR(mp, block, index) \
111 ((xfs_inobt_key_t *) \
112 ((char *)(block) + \
113 XFS_INOBT_BLOCK_LEN(mp) + \
114 ((index) - 1) * sizeof(xfs_inobt_key_t)))
115
116#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
117 ((xfs_inobt_ptr_t *) \
118 ((char *)(block) + \
119 XFS_INOBT_BLOCK_LEN(mp) + \
120 (maxrecs) * sizeof(xfs_inobt_key_t) + \
121 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
122
123extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
124 struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
125extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
177 126
178#endif /* __XFS_IALLOC_BTREE_H__ */ 127#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e229e9e001c..e2fb6210d4c 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,281 +38,283 @@
38#include "xfs_ialloc.h" 38#include "xfs_ialloc.h"
39#include "xfs_quota.h" 39#include "xfs_quota.h"
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41#include "xfs_trans_priv.h"
42#include "xfs_inode_item.h"
43#include "xfs_bmap.h"
44#include "xfs_btree_trace.h"
45#include "xfs_dir2_trace.h"
46
41 47
42/* 48/*
43 * Look up an inode by number in the given file system. 49 * Allocate and initialise an xfs_inode.
44 * The inode is looked up in the cache held in each AG.
45 * If the inode is found in the cache, attach it to the provided
46 * vnode.
47 *
48 * If it is not in core, read it in from the file system's device,
49 * add it to the cache and attach the provided vnode.
50 *
51 * The inode is locked according to the value of the lock_flags parameter.
52 * This flag parameter indicates how and if the inode's IO lock and inode lock
53 * should be taken.
54 *
55 * mp -- the mount point structure for the current file system. It points
56 * to the inode hash table.
57 * tp -- a pointer to the current transaction if there is one. This is
58 * simply passed through to the xfs_iread() call.
59 * ino -- the number of the inode desired. This is the unique identifier
60 * within the file system for the inode being requested.
61 * lock_flags -- flags indicating how to lock the inode. See the comment
62 * for xfs_ilock() for a list of valid values.
63 * bno -- the block number starting the buffer containing the inode,
64 * if known (as by bulkstat), else 0.
65 */ 50 */
66STATIC int 51STATIC struct xfs_inode *
67xfs_iget_core( 52xfs_inode_alloc(
68 struct inode *inode, 53 struct xfs_mount *mp,
69 xfs_mount_t *mp, 54 xfs_ino_t ino)
70 xfs_trans_t *tp,
71 xfs_ino_t ino,
72 uint flags,
73 uint lock_flags,
74 xfs_inode_t **ipp,
75 xfs_daddr_t bno)
76{ 55{
77 struct inode *old_inode; 56 struct xfs_inode *ip;
78 xfs_inode_t *ip;
79 xfs_inode_t *iq;
80 int error;
81 unsigned long first_index, mask;
82 xfs_perag_t *pag;
83 xfs_agino_t agino;
84 57
85 /* the radix tree exists only in inode capable AGs */ 58 /*
86 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) 59 * if this didn't occur in transactions, we could use
87 return EINVAL; 60 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
61 * code up to do this anyway.
62 */
63 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
64 if (!ip)
65 return NULL;
88 66
89 /* get the perag structure and ensure that it's inode capable */ 67 ASSERT(atomic_read(&ip->i_iocount) == 0);
90 pag = xfs_get_perag(mp, ino); 68 ASSERT(atomic_read(&ip->i_pincount) == 0);
91 if (!pag->pagi_inodeok) 69 ASSERT(!spin_is_locked(&ip->i_flags_lock));
92 return EINVAL; 70 ASSERT(completion_done(&ip->i_flush));
93 ASSERT(pag->pag_ici_init);
94 agino = XFS_INO_TO_AGINO(mp, ino);
95 71
96again: 72 /*
97 read_lock(&pag->pag_ici_lock); 73 * initialise the VFS inode here to get failures
98 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 74 * out of the way early.
75 */
76 if (!inode_init_always(mp->m_super, VFS_I(ip))) {
77 kmem_zone_free(xfs_inode_zone, ip);
78 return NULL;
79 }
80
81 /* initialise the xfs inode */
82 ip->i_ino = ino;
83 ip->i_mount = mp;
84 memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
85 ip->i_afp = NULL;
86 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
87 ip->i_flags = 0;
88 ip->i_update_core = 0;
89 ip->i_update_size = 0;
90 ip->i_delayed_blks = 0;
91 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
92 ip->i_size = 0;
93 ip->i_new_size = 0;
94
95 /*
96 * Initialize inode's trace buffers.
97 */
98#ifdef XFS_INODE_TRACE
99 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
100#endif
101#ifdef XFS_BMAP_TRACE
102 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
103#endif
104#ifdef XFS_BTREE_TRACE
105 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
106#endif
107#ifdef XFS_RW_TRACE
108 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
109#endif
110#ifdef XFS_ILOCK_TRACE
111 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
112#endif
113#ifdef XFS_DIR2_TRACE
114 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
115#endif
116
117 return ip;
118}
119
120/*
121 * Check the validity of the inode we just found it the cache
122 */
123static int
124xfs_iget_cache_hit(
125 struct xfs_perag *pag,
126 struct xfs_inode *ip,
127 int flags,
128 int lock_flags) __releases(pag->pag_ici_lock)
129{
130 struct xfs_mount *mp = ip->i_mount;
131 int error = EAGAIN;
132
133 /*
134 * If INEW is set this inode is being set up
135 * If IRECLAIM is set this inode is being torn down
136 * Pause and try again.
137 */
138 if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
139 XFS_STATS_INC(xs_ig_frecycle);
140 goto out_error;
141 }
142
143 /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
144 if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
99 145
100 if (ip != NULL) {
101 /* 146 /*
102 * If INEW is set this inode is being set up 147 * If lookup is racing with unlink, then we should return an
103 * we need to pause and try again. 148 * error immediately so we don't remove it from the reclaim
149 * list and potentially leak the inode.
104 */ 150 */
105 if (xfs_iflags_test(ip, XFS_INEW)) { 151 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
106 read_unlock(&pag->pag_ici_lock); 152 error = ENOENT;
107 delay(1); 153 goto out_error;
108 XFS_STATS_INC(xs_ig_frecycle);
109
110 goto again;
111 } 154 }
112 155
113 old_inode = ip->i_vnode; 156 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
114 if (old_inode == NULL) {
115 /*
116 * If IRECLAIM is set this inode is
117 * on its way out of the system,
118 * we need to pause and try again.
119 */
120 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
121 read_unlock(&pag->pag_ici_lock);
122 delay(1);
123 XFS_STATS_INC(xs_ig_frecycle);
124
125 goto again;
126 }
127 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
128
129 /*
130 * If lookup is racing with unlink, then we
131 * should return an error immediately so we
132 * don't remove it from the reclaim list and
133 * potentially leak the inode.
134 */
135 if ((ip->i_d.di_mode == 0) &&
136 !(flags & XFS_IGET_CREATE)) {
137 read_unlock(&pag->pag_ici_lock);
138 xfs_put_perag(mp, pag);
139 return ENOENT;
140 }
141
142 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
143
144 XFS_STATS_INC(xs_ig_found);
145 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
146 read_unlock(&pag->pag_ici_lock);
147
148 XFS_MOUNT_ILOCK(mp);
149 list_del_init(&ip->i_reclaim);
150 XFS_MOUNT_IUNLOCK(mp);
151
152 goto finish_inode;
153
154 } else if (inode != old_inode) {
155 /* The inode is being torn down, pause and
156 * try again.
157 */
158 if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
159 read_unlock(&pag->pag_ici_lock);
160 delay(1);
161 XFS_STATS_INC(xs_ig_frecycle);
162
163 goto again;
164 }
165/* Chances are the other vnode (the one in the inode) is being torn
166* down right now, and we landed on top of it. Question is, what do
167* we do? Unhook the old inode and hook up the new one?
168*/
169 cmn_err(CE_PANIC,
170 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
171 old_inode, inode);
172 }
173 157
174 /* 158 /*
175 * Inode cache hit 159 * We need to re-initialise the VFS inode as it has been
160 * 'freed' by the VFS. Do this here so we can deal with
161 * errors cleanly, then tag it so it can be set up correctly
162 * later.
176 */ 163 */
177 read_unlock(&pag->pag_ici_lock); 164 if (!inode_init_always(mp->m_super, VFS_I(ip))) {
178 XFS_STATS_INC(xs_ig_found); 165 error = ENOMEM;
179 166 goto out_error;
180finish_inode:
181 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
182 xfs_put_perag(mp, pag);
183 return ENOENT;
184 } 167 }
185 168
186 if (lock_flags != 0) 169 /*
187 xfs_ilock(ip, lock_flags); 170 * We must set the XFS_INEW flag before clearing the
171 * XFS_IRECLAIMABLE flag so that if a racing lookup does
172 * not find the XFS_IRECLAIMABLE above but has the igrab()
173 * below succeed we can safely check XFS_INEW to detect
174 * that this inode is still being initialised.
175 */
176 xfs_iflags_set(ip, XFS_INEW);
177 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
178
179 /* clear the radix tree reclaim flag as well. */
180 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
181 } else if (!igrab(VFS_I(ip))) {
182 /* If the VFS inode is being torn down, pause and try again. */
183 XFS_STATS_INC(xs_ig_frecycle);
184 goto out_error;
185 } else if (xfs_iflags_test(ip, XFS_INEW)) {
186 /*
187 * We are racing with another cache hit that is
188 * currently recycling this inode out of the XFS_IRECLAIMABLE
189 * state. Wait for the initialisation to complete before
190 * continuing.
191 */
192 wait_on_inode(VFS_I(ip));
193 }
188 194
189 xfs_iflags_clear(ip, XFS_ISTALE); 195 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
190 xfs_itrace_exit_tag(ip, "xfs_iget.found"); 196 error = ENOENT;
191 goto return_ip; 197 iput(VFS_I(ip));
198 goto out_error;
192 } 199 }
193 200
194 /* 201 /* We've got a live one. */
195 * Inode cache miss
196 */
197 read_unlock(&pag->pag_ici_lock); 202 read_unlock(&pag->pag_ici_lock);
198 XFS_STATS_INC(xs_ig_missed);
199 203
200 /* 204 if (lock_flags != 0)
201 * Read the disk inode attributes into a new inode structure and get 205 xfs_ilock(ip, lock_flags);
202 * a new vnode for it. This should also initialize i_ino and i_mount.
203 */
204 error = xfs_iread(mp, tp, ino, &ip, bno,
205 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
206 if (error) {
207 xfs_put_perag(mp, pag);
208 return error;
209 }
210 206
211 xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); 207 xfs_iflags_clear(ip, XFS_ISTALE);
208 xfs_itrace_exit_tag(ip, "xfs_iget.found");
209 XFS_STATS_INC(xs_ig_found);
210 return 0;
211
212out_error:
213 read_unlock(&pag->pag_ici_lock);
214 return error;
215}
212 216
213 217
214 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 218static int
215 "xfsino", ip->i_ino); 219xfs_iget_cache_miss(
216 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 220 struct xfs_mount *mp,
217 init_waitqueue_head(&ip->i_ipin_wait); 221 struct xfs_perag *pag,
218 atomic_set(&ip->i_pincount, 0); 222 xfs_trans_t *tp,
223 xfs_ino_t ino,
224 struct xfs_inode **ipp,
225 xfs_daddr_t bno,
226 int flags,
227 int lock_flags) __releases(pag->pag_ici_lock)
228{
229 struct xfs_inode *ip;
230 int error;
231 unsigned long first_index, mask;
232 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
219 233
220 /* 234 ip = xfs_inode_alloc(mp, ino);
221 * Because we want to use a counting completion, complete 235 if (!ip)
222 * the flush completion once to allow a single access to 236 return ENOMEM;
223 * the flush completion without blocking.
224 */
225 init_completion(&ip->i_flush);
226 complete(&ip->i_flush);
227 237
228 if (lock_flags) 238 error = xfs_iread(mp, tp, ip, bno, flags);
229 xfs_ilock(ip, lock_flags); 239 if (error)
240 goto out_destroy;
241
242 xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
230 243
231 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 244 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
232 xfs_idestroy(ip); 245 error = ENOENT;
233 xfs_put_perag(mp, pag); 246 goto out_destroy;
234 return ENOENT;
235 } 247 }
236 248
249 if (lock_flags)
250 xfs_ilock(ip, lock_flags);
251
237 /* 252 /*
238 * Preload the radix tree so we can insert safely under the 253 * Preload the radix tree so we can insert safely under the
239 * write spinlock. 254 * write spinlock. Note that we cannot sleep inside the preload
255 * region.
240 */ 256 */
241 if (radix_tree_preload(GFP_KERNEL)) { 257 if (radix_tree_preload(GFP_KERNEL)) {
242 xfs_idestroy(ip); 258 error = EAGAIN;
243 delay(1); 259 goto out_unlock;
244 goto again;
245 } 260 }
261
246 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 262 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
247 first_index = agino & mask; 263 first_index = agino & mask;
248 write_lock(&pag->pag_ici_lock); 264 write_lock(&pag->pag_ici_lock);
249 /* 265
250 * insert the new inode 266 /* insert the new inode */
251 */
252 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 267 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
253 if (unlikely(error)) { 268 if (unlikely(error)) {
254 BUG_ON(error != -EEXIST); 269 WARN_ON(error != -EEXIST);
255 write_unlock(&pag->pag_ici_lock);
256 radix_tree_preload_end();
257 xfs_idestroy(ip);
258 XFS_STATS_INC(xs_ig_dup); 270 XFS_STATS_INC(xs_ig_dup);
259 goto again; 271 error = EAGAIN;
272 goto out_preload_end;
260 } 273 }
261 274
262 /* 275 /* These values _must_ be set before releasing the radix tree lock! */
263 * These values _must_ be set before releasing the radix tree lock!
264 */
265 ip->i_udquot = ip->i_gdquot = NULL; 276 ip->i_udquot = ip->i_gdquot = NULL;
266 xfs_iflags_set(ip, XFS_INEW); 277 xfs_iflags_set(ip, XFS_INEW);
267 278
268 write_unlock(&pag->pag_ici_lock); 279 write_unlock(&pag->pag_ici_lock);
269 radix_tree_preload_end(); 280 radix_tree_preload_end();
270
271 /*
272 * Link ip to its mount and thread it on the mount's inode list.
273 */
274 XFS_MOUNT_ILOCK(mp);
275 if ((iq = mp->m_inodes)) {
276 ASSERT(iq->i_mprev->i_mnext == iq);
277 ip->i_mprev = iq->i_mprev;
278 iq->i_mprev->i_mnext = ip;
279 iq->i_mprev = ip;
280 ip->i_mnext = iq;
281 } else {
282 ip->i_mnext = ip;
283 ip->i_mprev = ip;
284 }
285 mp->m_inodes = ip;
286
287 XFS_MOUNT_IUNLOCK(mp);
288 xfs_put_perag(mp, pag);
289
290 return_ip:
291 ASSERT(ip->i_df.if_ext_max ==
292 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
293
294 xfs_iflags_set(ip, XFS_IMODIFIED);
295 *ipp = ip; 281 *ipp = ip;
296
297 /*
298 * Set up the Linux with the Linux inode.
299 */
300 ip->i_vnode = inode;
301 inode->i_private = ip;
302
303 /*
304 * If we have a real type for an on-disk inode, we can set ops(&unlock)
305 * now. If it's a new inode being created, xfs_ialloc will handle it.
306 */
307 if (ip->i_d.di_mode != 0)
308 xfs_setup_inode(ip);
309 return 0; 282 return 0;
310}
311 283
284out_preload_end:
285 write_unlock(&pag->pag_ici_lock);
286 radix_tree_preload_end();
287out_unlock:
288 if (lock_flags)
289 xfs_iunlock(ip, lock_flags);
290out_destroy:
291 xfs_destroy_inode(ip);
292 return error;
293}
312 294
313/* 295/*
314 * The 'normal' internal xfs_iget, if needed it will 296 * Look up an inode by number in the given file system.
315 * 'allocate', or 'get', the vnode. 297 * The inode is looked up in the cache held in each AG.
298 * If the inode is found in the cache, initialise the vfs inode
299 * if necessary.
300 *
301 * If it is not in core, read it in from the file system's device,
302 * add it to the cache and initialise the vfs inode.
303 *
304 * The inode is locked according to the value of the lock_flags parameter.
305 * This flag parameter indicates how and if the inode's IO lock and inode lock
306 * should be taken.
307 *
308 * mp -- the mount point structure for the current file system. It points
309 * to the inode hash table.
310 * tp -- a pointer to the current transaction if there is one. This is
311 * simply passed through to the xfs_iread() call.
312 * ino -- the number of the inode desired. This is the unique identifier
313 * within the file system for the inode being requested.
314 * lock_flags -- flags indicating how to lock the inode. See the comment
315 * for xfs_ilock() for a list of valid values.
316 * bno -- the block number starting the buffer containing the inode,
317 * if known (as by bulkstat), else 0.
316 */ 318 */
317int 319int
318xfs_iget( 320xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
324 xfs_inode_t **ipp, 326 xfs_inode_t **ipp,
325 xfs_daddr_t bno) 327 xfs_daddr_t bno)
326{ 328{
327 struct inode *inode;
328 xfs_inode_t *ip; 329 xfs_inode_t *ip;
329 int error; 330 int error;
331 xfs_perag_t *pag;
332 xfs_agino_t agino;
330 333
331 XFS_STATS_INC(xs_ig_attempts); 334 /* the radix tree exists only in inode capable AGs */
335 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
336 return EINVAL;
332 337
333retry: 338 /* get the perag structure and ensure that it's inode capable */
334 inode = iget_locked(mp->m_super, ino); 339 pag = xfs_get_perag(mp, ino);
335 if (!inode) 340 if (!pag->pagi_inodeok)
336 /* If we got no inode we are out of memory */ 341 return EINVAL;
337 return ENOMEM; 342 ASSERT(pag->pag_ici_init);
343 agino = XFS_INO_TO_AGINO(mp, ino);
338 344
339 if (inode->i_state & I_NEW) { 345again:
340 XFS_STATS_INC(vn_active); 346 error = 0;
341 XFS_STATS_INC(vn_alloc); 347 read_lock(&pag->pag_ici_lock);
342 348 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
343 error = xfs_iget_core(inode, mp, tp, ino, flags, 349
344 lock_flags, ipp, bno); 350 if (ip) {
345 if (error) { 351 error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
346 make_bad_inode(inode); 352 if (error)
347 if (inode->i_state & I_NEW) 353 goto out_error_or_again;
348 unlock_new_inode(inode); 354 } else {
349 iput(inode); 355 read_unlock(&pag->pag_ici_lock);
350 } 356 XFS_STATS_INC(xs_ig_missed);
351 return error; 357
358 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
359 flags, lock_flags);
360 if (error)
361 goto out_error_or_again;
352 } 362 }
363 xfs_put_perag(mp, pag);
353 364
365 *ipp = ip;
366
367 ASSERT(ip->i_df.if_ext_max ==
368 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
354 /* 369 /*
355 * If the inode is not fully constructed due to 370 * If we have a real type for an on-disk inode, we can set ops(&unlock)
356 * filehandle mismatches wait for the inode to go 371 * now. If it's a new inode being created, xfs_ialloc will handle it.
357 * away and try again.
358 *
359 * iget_locked will call __wait_on_freeing_inode
360 * to wait for the inode to go away.
361 */ 372 */
362 if (is_bad_inode(inode)) { 373 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
363 iput(inode); 374 xfs_setup_inode(ip);
364 delay(1); 375 return 0;
365 goto retry;
366 }
367 376
368 ip = XFS_I(inode); 377out_error_or_again:
369 if (!ip) { 378 if (error == EAGAIN) {
370 iput(inode);
371 delay(1); 379 delay(1);
372 goto retry; 380 goto again;
373 } 381 }
374 382 xfs_put_perag(mp, pag);
375 if (lock_flags != 0) 383 return error;
376 xfs_ilock(ip, lock_flags);
377 XFS_STATS_INC(xs_ig_found);
378 *ipp = ip;
379 return 0;
380} 384}
381 385
386
382/* 387/*
383 * Look for the inode corresponding to the given ino in the hash table. 388 * Look for the inode corresponding to the given ino in the hash table.
384 * If it is there and its i_transp pointer matches tp, return it. 389 * If it is there and its i_transp pointer matches tp, return it.
@@ -444,99 +449,109 @@ xfs_iput_new(
444 IRELE(ip); 449 IRELE(ip);
445} 450}
446 451
447
448/* 452/*
449 * This routine embodies the part of the reclaim code that pulls 453 * This is called free all the memory associated with an inode.
450 * the inode from the inode hash table and the mount structure's 454 * It must free the inode itself and any buffers allocated for
451 * inode list. 455 * if_extents/if_data and if_broot. It must also free the lock
452 * This should only be called from xfs_reclaim(). 456 * associated with the inode.
457 *
458 * Note: because we don't initialise everything on reallocation out
459 * of the zone, we must ensure we nullify everything correctly before
460 * freeing the structure.
453 */ 461 */
454void 462void
455xfs_ireclaim(xfs_inode_t *ip) 463xfs_ireclaim(
464 struct xfs_inode *ip)
456{ 465{
457 /* 466 struct xfs_mount *mp = ip->i_mount;
458 * Remove from old hash list and mount list. 467 struct xfs_perag *pag;
459 */
460 XFS_STATS_INC(xs_ig_reclaims);
461 468
462 xfs_iextract(ip); 469 XFS_STATS_INC(xs_ig_reclaims);
463
464 /*
465 * Here we do a spurious inode lock in order to coordinate with
466 * xfs_sync(). This is because xfs_sync() references the inodes
467 * in the mount list without taking references on the corresponding
468 * vnodes. We make that OK here by ensuring that we wait until
469 * the inode is unlocked in xfs_sync() before we go ahead and
470 * free it. We get both the regular lock and the io lock because
471 * the xfs_sync() code may need to drop the regular one but will
472 * still hold the io lock.
473 */
474 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
475
476 /*
477 * Release dquots (and their references) if any. An inode may escape
478 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
479 */
480 XFS_QM_DQDETACH(ip->i_mount, ip);
481
482 /*
483 * Pull our behavior descriptor from the vnode chain.
484 */
485 if (ip->i_vnode) {
486 ip->i_vnode->i_private = NULL;
487 ip->i_vnode = NULL;
488 }
489 470
490 /* 471 /*
491 * Free all memory associated with the inode. 472 * Remove the inode from the per-AG radix tree. It doesn't matter
473 * if it was never added to it because radix_tree_delete can deal
474 * with that case just fine.
492 */ 475 */
493 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 476 pag = xfs_get_perag(mp, ip->i_ino);
494 xfs_idestroy(ip);
495}
496
497/*
498 * This routine removes an about-to-be-destroyed inode from
499 * all of the lists in which it is located with the exception
500 * of the behavior chain.
501 */
502void
503xfs_iextract(
504 xfs_inode_t *ip)
505{
506 xfs_mount_t *mp = ip->i_mount;
507 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
508 xfs_inode_t *iq;
509
510 write_lock(&pag->pag_ici_lock); 477 write_lock(&pag->pag_ici_lock);
511 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); 478 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
512 write_unlock(&pag->pag_ici_lock); 479 write_unlock(&pag->pag_ici_lock);
513 xfs_put_perag(mp, pag); 480 xfs_put_perag(mp, pag);
514 481
515 /* 482 /*
516 * Remove from mount's inode list. 483 * Here we do an (almost) spurious inode lock in order to coordinate
484 * with inode cache radix tree lookups. This is because the lookup
485 * can reference the inodes in the cache without taking references.
486 *
487 * We make that OK here by ensuring that we wait until the inode is
488 * unlocked after the lookup before we go ahead and free it. We get
489 * both the ilock and the iolock because the code may need to drop the
490 * ilock one but will still hold the iolock.
517 */ 491 */
518 XFS_MOUNT_ILOCK(mp); 492 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
519 ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
520 iq = ip->i_mnext;
521 iq->i_mprev = ip->i_mprev;
522 ip->i_mprev->i_mnext = iq;
523
524 /* 493 /*
525 * Fix up the head pointer if it points to the inode being deleted. 494 * Release dquots (and their references) if any.
526 */ 495 */
527 if (mp->m_inodes == ip) { 496 XFS_QM_DQDETACH(ip->i_mount, ip);
528 if (ip == iq) { 497 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
529 mp->m_inodes = NULL; 498
530 } else { 499 switch (ip->i_d.di_mode & S_IFMT) {
531 mp->m_inodes = iq; 500 case S_IFREG:
532 } 501 case S_IFDIR:
502 case S_IFLNK:
503 xfs_idestroy_fork(ip, XFS_DATA_FORK);
504 break;
533 } 505 }
534 506
535 /* Deal with the deleted inodes list */ 507 if (ip->i_afp)
536 list_del_init(&ip->i_reclaim); 508 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
537 509
538 mp->m_ireclaims++; 510#ifdef XFS_INODE_TRACE
539 XFS_MOUNT_IUNLOCK(mp); 511 ktrace_free(ip->i_trace);
512#endif
513#ifdef XFS_BMAP_TRACE
514 ktrace_free(ip->i_xtrace);
515#endif
516#ifdef XFS_BTREE_TRACE
517 ktrace_free(ip->i_btrace);
518#endif
519#ifdef XFS_RW_TRACE
520 ktrace_free(ip->i_rwtrace);
521#endif
522#ifdef XFS_ILOCK_TRACE
523 ktrace_free(ip->i_lock_trace);
524#endif
525#ifdef XFS_DIR2_TRACE
526 ktrace_free(ip->i_dir_trace);
527#endif
528 if (ip->i_itemp) {
529 /*
530 * Only if we are shutting down the fs will we see an
531 * inode still in the AIL. If it is there, we should remove
532 * it to prevent a use-after-free from occurring.
533 */
534 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
535 struct xfs_ail *ailp = lip->li_ailp;
536
537 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
538 XFS_FORCED_SHUTDOWN(ip->i_mount));
539 if (lip->li_flags & XFS_LI_IN_AIL) {
540 spin_lock(&ailp->xa_lock);
541 if (lip->li_flags & XFS_LI_IN_AIL)
542 xfs_trans_ail_delete(ailp, lip);
543 else
544 spin_unlock(&ailp->xa_lock);
545 }
546 xfs_inode_item_destroy(ip);
547 ip->i_itemp = NULL;
548 }
549 /* asserts to verify all state is correct here */
550 ASSERT(atomic_read(&ip->i_iocount) == 0);
551 ASSERT(atomic_read(&ip->i_pincount) == 0);
552 ASSERT(!spin_is_locked(&ip->i_flags_lock));
553 ASSERT(completion_done(&ip->i_flush));
554 kmem_zone_free(xfs_inode_zone, ip);
540} 555}
541 556
542/* 557/*
@@ -737,7 +752,7 @@ xfs_iunlock(
737 * it is in the AIL and anyone is waiting on it. Don't do 752 * it is in the AIL and anyone is waiting on it. Don't do
738 * this if the caller has asked us not to. 753 * this if the caller has asked us not to.
739 */ 754 */
740 xfs_trans_unlocked_item(ip->i_mount, 755 xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
741 (xfs_log_item_t*)(ip->i_itemp)); 756 (xfs_log_item_t*)(ip->i_itemp));
742 } 757 }
743 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); 758 xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
790} 805}
791#endif 806#endif
792 807
808#ifdef XFS_INODE_TRACE
809
810#define KTRACE_ENTER(ip, vk, s, line, ra) \
811 ktrace_enter((ip)->i_trace, \
812/* 0 */ (void *)(__psint_t)(vk), \
813/* 1 */ (void *)(s), \
814/* 2 */ (void *)(__psint_t) line, \
815/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
816/* 4 */ (void *)(ra), \
817/* 5 */ NULL, \
818/* 6 */ (void *)(__psint_t)current_cpu(), \
819/* 7 */ (void *)(__psint_t)current_pid(), \
820/* 8 */ (void *)__return_address, \
821/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
822
823/*
824 * Vnode tracing code.
825 */
826void
827_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
828{
829 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
830}
831
832void
833_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
834{
835 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
836}
837
838void
839xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
840{
841 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
842}
843
844void
845_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
846{
847 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
848}
849
850void
851xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
852{
853 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
854}
855#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d3645000398..00000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IMAP_H__
19#define __XFS_IMAP_H__
20
21/*
22 * This is the structure passed to xfs_imap() to map
23 * an inode number to its on disk location.
24 */
25typedef struct xfs_imap {
26 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
27 uint im_len; /* length in BBs of inode chunk */
28 xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
29 ushort im_ioffset; /* inode offset in block in "inodes" */
30 ushort im_boffset; /* inode offset in block in bytes */
31} xfs_imap_t;
32
33#ifdef __KERNEL__
34struct xfs_mount;
35struct xfs_trans;
36int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
37 xfs_imap_t *, uint);
38#endif
39
40#endif /* __XFS_IMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a391b955df0..5a5e035e5d3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,7 +23,6 @@
23#include "xfs_bit.h" 23#include "xfs_bit.h"
24#include "xfs_log.h" 24#include "xfs_log.h"
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_imap.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
29#include "xfs_sb.h" 28#include "xfs_sb.h"
@@ -41,6 +40,7 @@
41#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
42#include "xfs_inode_item.h" 41#include "xfs_inode_item.h"
43#include "xfs_btree.h" 42#include "xfs_btree.h"
43#include "xfs_btree_trace.h"
44#include "xfs_alloc.h" 44#include "xfs_alloc.h"
45#include "xfs_ialloc.h" 45#include "xfs_ialloc.h"
46#include "xfs_bmap.h" 46#include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
133xfs_imap_to_bp( 133xfs_imap_to_bp(
134 xfs_mount_t *mp, 134 xfs_mount_t *mp,
135 xfs_trans_t *tp, 135 xfs_trans_t *tp,
136 xfs_imap_t *imap, 136 struct xfs_imap *imap,
137 xfs_buf_t **bpp, 137 xfs_buf_t **bpp,
138 uint buf_flags, 138 uint buf_flags,
139 uint imap_flags) 139 uint iget_flags)
140{ 140{
141 int error; 141 int error;
142 int i; 142 int i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
173 173
174 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 174 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
175 (i << mp->m_sb.sb_inodelog)); 175 (i << mp->m_sb.sb_inodelog));
176 di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && 176 di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
177 XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); 177 XFS_DINODE_GOOD_VERSION(dip->di_version);
178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
179 XFS_ERRTAG_ITOBP_INOTOBP, 179 XFS_ERRTAG_ITOBP_INOTOBP,
180 XFS_RANDOM_ITOBP_INOTOBP))) { 180 XFS_RANDOM_ITOBP_INOTOBP))) {
181 if (imap_flags & XFS_IMAP_BULKSTAT) { 181 if (iget_flags & XFS_IGET_BULKSTAT) {
182 xfs_trans_brelse(tp, bp); 182 xfs_trans_brelse(tp, bp);
183 return XFS_ERROR(EINVAL); 183 return XFS_ERROR(EINVAL);
184 } 184 }
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
190 "daddr %lld #%d (magic=%x)", 190 "daddr %lld #%d (magic=%x)",
191 XFS_BUFTARG_NAME(mp->m_ddev_targp), 191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
192 (unsigned long long)imap->im_blkno, i, 192 (unsigned long long)imap->im_blkno, i,
193 be16_to_cpu(dip->di_core.di_magic)); 193 be16_to_cpu(dip->di_magic));
194#endif 194#endif
195 xfs_trans_brelse(tp, bp); 195 xfs_trans_brelse(tp, bp);
196 return XFS_ERROR(EFSCORRUPTED); 196 return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
221 * Use xfs_imap() to determine the size and location of the 221 * Use xfs_imap() to determine the size and location of the
222 * buffer to read from disk. 222 * buffer to read from disk.
223 */ 223 */
224STATIC int 224int
225xfs_inotobp( 225xfs_inotobp(
226 xfs_mount_t *mp, 226 xfs_mount_t *mp,
227 xfs_trans_t *tp, 227 xfs_trans_t *tp,
228 xfs_ino_t ino, 228 xfs_ino_t ino,
229 xfs_dinode_t **dipp, 229 xfs_dinode_t **dipp,
230 xfs_buf_t **bpp, 230 xfs_buf_t **bpp,
231 int *offset) 231 int *offset,
232 uint imap_flags)
232{ 233{
233 xfs_imap_t imap; 234 struct xfs_imap imap;
234 xfs_buf_t *bp; 235 xfs_buf_t *bp;
235 int error; 236 int error;
236 237
237 imap.im_blkno = 0; 238 imap.im_blkno = 0;
238 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); 239 error = xfs_imap(mp, tp, ino, &imap, imap_flags);
239 if (error) 240 if (error)
240 return error; 241 return error;
241 242
242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0); 243 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
243 if (error) 244 if (error)
244 return error; 245 return error;
245 246
@@ -260,15 +261,11 @@ xfs_inotobp(
260 * If a non-zero error is returned, then the contents of bpp and 261 * If a non-zero error is returned, then the contents of bpp and
261 * dipp are undefined. 262 * dipp are undefined.
262 * 263 *
263 * If the inode is new and has not yet been initialized, use xfs_imap() 264 * The inode is expected to already been mapped to its buffer and read
264 * to determine the size and location of the buffer to read from disk. 265 * in once, thus we can use the mapping information stored in the inode
265 * If the inode has already been mapped to its buffer and read in once, 266 * rather than calling xfs_imap(). This allows us to avoid the overhead
266 * then use the mapping information stored in the inode rather than 267 * of looking at the inode btree for small block file systems
267 * calling xfs_imap(). This allows us to avoid the overhead of looking 268 * (see xfs_imap()).
268 * at the inode btree for small block file systems (see xfs_dilocate()).
269 * We can tell whether the inode has been mapped in before by comparing
270 * its disk block address to 0. Only uninitialized inodes will have
271 * 0 for the disk block address.
272 */ 269 */
273int 270int
274xfs_itobp( 271xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
277 xfs_inode_t *ip, 274 xfs_inode_t *ip,
278 xfs_dinode_t **dipp, 275 xfs_dinode_t **dipp,
279 xfs_buf_t **bpp, 276 xfs_buf_t **bpp,
280 xfs_daddr_t bno,
281 uint imap_flags,
282 uint buf_flags) 277 uint buf_flags)
283{ 278{
284 xfs_imap_t imap;
285 xfs_buf_t *bp; 279 xfs_buf_t *bp;
286 int error; 280 int error;
287 281
288 if (ip->i_blkno == (xfs_daddr_t)0) { 282 ASSERT(ip->i_imap.im_blkno != 0);
289 imap.im_blkno = bno;
290 error = xfs_imap(mp, tp, ip->i_ino, &imap,
291 XFS_IMAP_LOOKUP | imap_flags);
292 if (error)
293 return error;
294 283
295 /* 284 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
296 * Fill in the fields in the inode that will be used to
297 * map the inode to its buffer from now on.
298 */
299 ip->i_blkno = imap.im_blkno;
300 ip->i_len = imap.im_len;
301 ip->i_boffset = imap.im_boffset;
302 } else {
303 /*
304 * We've already mapped the inode once, so just use the
305 * mapping that we saved the first time.
306 */
307 imap.im_blkno = ip->i_blkno;
308 imap.im_len = ip->i_len;
309 imap.im_boffset = ip->i_boffset;
310 }
311 ASSERT(bno == 0 || bno == imap.im_blkno);
312
313 error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
314 if (error) 285 if (error)
315 return error; 286 return error;
316 287
@@ -321,7 +292,7 @@ xfs_itobp(
321 return EAGAIN; 292 return EAGAIN;
322 } 293 }
323 294
324 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 295 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
325 *bpp = bp; 296 *bpp = bp;
326 return 0; 297 return 0;
327} 298}
@@ -348,26 +319,26 @@ xfs_iformat(
348 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 319 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
349 error = 0; 320 error = 0;
350 321
351 if (unlikely(be32_to_cpu(dip->di_core.di_nextents) + 322 if (unlikely(be32_to_cpu(dip->di_nextents) +
352 be16_to_cpu(dip->di_core.di_anextents) > 323 be16_to_cpu(dip->di_anextents) >
353 be64_to_cpu(dip->di_core.di_nblocks))) { 324 be64_to_cpu(dip->di_nblocks))) {
354 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 325 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
355 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 326 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
356 (unsigned long long)ip->i_ino, 327 (unsigned long long)ip->i_ino,
357 (int)(be32_to_cpu(dip->di_core.di_nextents) + 328 (int)(be32_to_cpu(dip->di_nextents) +
358 be16_to_cpu(dip->di_core.di_anextents)), 329 be16_to_cpu(dip->di_anextents)),
359 (unsigned long long) 330 (unsigned long long)
360 be64_to_cpu(dip->di_core.di_nblocks)); 331 be64_to_cpu(dip->di_nblocks));
361 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 332 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
362 ip->i_mount, dip); 333 ip->i_mount, dip);
363 return XFS_ERROR(EFSCORRUPTED); 334 return XFS_ERROR(EFSCORRUPTED);
364 } 335 }
365 336
366 if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 337 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
367 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 338 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
368 "corrupt dinode %Lu, forkoff = 0x%x.", 339 "corrupt dinode %Lu, forkoff = 0x%x.",
369 (unsigned long long)ip->i_ino, 340 (unsigned long long)ip->i_ino,
370 dip->di_core.di_forkoff); 341 dip->di_forkoff);
371 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 342 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
372 ip->i_mount, dip); 343 ip->i_mount, dip);
373 return XFS_ERROR(EFSCORRUPTED); 344 return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
378 case S_IFCHR: 349 case S_IFCHR:
379 case S_IFBLK: 350 case S_IFBLK:
380 case S_IFSOCK: 351 case S_IFSOCK:
381 if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) { 352 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
382 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 353 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
383 ip->i_mount, dip); 354 ip->i_mount, dip);
384 return XFS_ERROR(EFSCORRUPTED); 355 return XFS_ERROR(EFSCORRUPTED);
385 } 356 }
386 ip->i_d.di_size = 0; 357 ip->i_d.di_size = 0;
387 ip->i_size = 0; 358 ip->i_size = 0;
388 ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev); 359 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
389 break; 360 break;
390 361
391 case S_IFREG: 362 case S_IFREG:
392 case S_IFLNK: 363 case S_IFLNK:
393 case S_IFDIR: 364 case S_IFDIR:
394 switch (dip->di_core.di_format) { 365 switch (dip->di_format) {
395 case XFS_DINODE_FMT_LOCAL: 366 case XFS_DINODE_FMT_LOCAL:
396 /* 367 /*
397 * no local regular files yet 368 * no local regular files yet
398 */ 369 */
399 if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) { 370 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
400 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 371 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
401 "corrupt inode %Lu " 372 "corrupt inode %Lu "
402 "(local format for regular file).", 373 "(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
407 return XFS_ERROR(EFSCORRUPTED); 378 return XFS_ERROR(EFSCORRUPTED);
408 } 379 }
409 380
410 di_size = be64_to_cpu(dip->di_core.di_size); 381 di_size = be64_to_cpu(dip->di_size);
411 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 382 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
412 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 383 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
413 "corrupt inode %Lu " 384 "corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
449 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 420 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
450 ip->i_afp->if_ext_max = 421 ip->i_afp->if_ext_max =
451 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 422 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
452 switch (dip->di_core.di_aformat) { 423 switch (dip->di_aformat) {
453 case XFS_DINODE_FMT_LOCAL: 424 case XFS_DINODE_FMT_LOCAL:
454 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 425 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
455 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
621 ifp = XFS_IFORK_PTR(ip, whichfork); 592 ifp = XFS_IFORK_PTR(ip, whichfork);
622 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); 593 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
623 size = XFS_BMAP_BROOT_SPACE(dfp); 594 size = XFS_BMAP_BROOT_SPACE(dfp);
624 nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); 595 nrecs = be16_to_cpu(dfp->bb_numrecs);
625 596
626 /* 597 /*
627 * blow out if -- fork has less extents than can fit in 598 * blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
649 * Copy and convert from the on-disk structure 620 * Copy and convert from the on-disk structure
650 * to the in-memory structure. 621 * to the in-memory structure.
651 */ 622 */
652 xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), 623 xfs_bmdr_to_bmbt(ip->i_mount, dfp,
653 ifp->if_broot, size); 624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
625 ifp->if_broot, size);
654 ifp->if_flags &= ~XFS_IFEXTENTS; 626 ifp->if_flags &= ~XFS_IFEXTENTS;
655 ifp->if_flags |= XFS_IFBROOT; 627 ifp->if_flags |= XFS_IFBROOT;
656 628
@@ -660,7 +632,7 @@ xfs_iformat_btree(
660void 632void
661xfs_dinode_from_disk( 633xfs_dinode_from_disk(
662 xfs_icdinode_t *to, 634 xfs_icdinode_t *to,
663 xfs_dinode_core_t *from) 635 xfs_dinode_t *from)
664{ 636{
665 to->di_magic = be16_to_cpu(from->di_magic); 637 to->di_magic = be16_to_cpu(from->di_magic);
666 to->di_mode = be16_to_cpu(from->di_mode); 638 to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
694 666
695void 667void
696xfs_dinode_to_disk( 668xfs_dinode_to_disk(
697 xfs_dinode_core_t *to, 669 xfs_dinode_t *to,
698 xfs_icdinode_t *from) 670 xfs_icdinode_t *from)
699{ 671{
700 to->di_magic = cpu_to_be16(from->di_magic); 672 to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
781xfs_dic2xflags( 753xfs_dic2xflags(
782 xfs_dinode_t *dip) 754 xfs_dinode_t *dip)
783{ 755{
784 xfs_dinode_core_t *dic = &dip->di_core; 756 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
785
786 return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
787 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 757 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
788} 758}
789 759
790/* 760/*
791 * Given a mount structure and an inode number, return a pointer 761 * Read the disk inode attributes into the in-core inode structure.
792 * to a newly allocated in-core inode corresponding to the given
793 * inode number.
794 *
795 * Initialize the inode's attributes and extent pointers if it
796 * already has them (it will not if the inode has no links).
797 */ 762 */
798int 763int
799xfs_iread( 764xfs_iread(
800 xfs_mount_t *mp, 765 xfs_mount_t *mp,
801 xfs_trans_t *tp, 766 xfs_trans_t *tp,
802 xfs_ino_t ino, 767 xfs_inode_t *ip,
803 xfs_inode_t **ipp,
804 xfs_daddr_t bno, 768 xfs_daddr_t bno,
805 uint imap_flags) 769 uint iget_flags)
806{ 770{
807 xfs_buf_t *bp; 771 xfs_buf_t *bp;
808 xfs_dinode_t *dip; 772 xfs_dinode_t *dip;
809 xfs_inode_t *ip;
810 int error; 773 int error;
811 774
812 ASSERT(xfs_inode_zone != NULL);
813
814 ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
815 ip->i_ino = ino;
816 ip->i_mount = mp;
817 atomic_set(&ip->i_iocount, 0);
818 spin_lock_init(&ip->i_flags_lock);
819
820 /* 775 /*
821 * Get pointer's to the on-disk inode and the buffer containing it. 776 * Fill in the location information in the in-core inode.
822 * If the inode number refers to a block outside the file system
823 * then xfs_itobp() will return NULL. In this case we should
824 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
825 * know that this is a new incore inode.
826 */ 777 */
827 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); 778 ip->i_imap.im_blkno = bno;
828 if (error) { 779 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
829 kmem_zone_free(xfs_inode_zone, ip); 780 if (error)
830 return error; 781 return error;
831 } 782 ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
832 783
833 /* 784 /*
834 * Initialize inode's trace buffers. 785 * Get pointers to the on-disk inode and the buffer containing it.
835 * Do this before xfs_iformat in case it adds entries.
836 */ 786 */
837#ifdef XFS_INODE_TRACE 787 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
838 ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS); 788 XFS_BUF_LOCK, iget_flags);
839#endif 789 if (error)
840#ifdef XFS_BMAP_TRACE 790 return error;
841 ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS); 791 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
842#endif
843#ifdef XFS_BMBT_TRACE
844 ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
845#endif
846#ifdef XFS_RW_TRACE
847 ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
848#endif
849#ifdef XFS_ILOCK_TRACE
850 ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
851#endif
852#ifdef XFS_DIR2_TRACE
853 ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
854#endif
855 792
856 /* 793 /*
857 * If we got something that isn't an inode it means someone 794 * If we got something that isn't an inode it means someone
858 * (nfs or dmi) has a stale handle. 795 * (nfs or dmi) has a stale handle.
859 */ 796 */
860 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) { 797 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
861 kmem_zone_free(xfs_inode_zone, ip);
862 xfs_trans_brelse(tp, bp);
863#ifdef DEBUG 798#ifdef DEBUG
864 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 799 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
865 "dip->di_core.di_magic (0x%x) != " 800 "dip->di_magic (0x%x) != "
866 "XFS_DINODE_MAGIC (0x%x)", 801 "XFS_DINODE_MAGIC (0x%x)",
867 be16_to_cpu(dip->di_core.di_magic), 802 be16_to_cpu(dip->di_magic),
868 XFS_DINODE_MAGIC); 803 XFS_DINODE_MAGIC);
869#endif /* DEBUG */ 804#endif /* DEBUG */
870 return XFS_ERROR(EINVAL); 805 error = XFS_ERROR(EINVAL);
806 goto out_brelse;
871 } 807 }
872 808
873 /* 809 /*
@@ -877,24 +813,22 @@ xfs_iread(
877 * specific information. 813 * specific information.
878 * Otherwise, just get the truly permanent information. 814 * Otherwise, just get the truly permanent information.
879 */ 815 */
880 if (dip->di_core.di_mode) { 816 if (dip->di_mode) {
881 xfs_dinode_from_disk(&ip->i_d, &dip->di_core); 817 xfs_dinode_from_disk(&ip->i_d, dip);
882 error = xfs_iformat(ip, dip); 818 error = xfs_iformat(ip, dip);
883 if (error) { 819 if (error) {
884 kmem_zone_free(xfs_inode_zone, ip);
885 xfs_trans_brelse(tp, bp);
886#ifdef DEBUG 820#ifdef DEBUG
887 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 821 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
888 "xfs_iformat() returned error %d", 822 "xfs_iformat() returned error %d",
889 error); 823 error);
890#endif /* DEBUG */ 824#endif /* DEBUG */
891 return error; 825 goto out_brelse;
892 } 826 }
893 } else { 827 } else {
894 ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); 828 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
895 ip->i_d.di_version = dip->di_core.di_version; 829 ip->i_d.di_version = dip->di_version;
896 ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); 830 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
897 ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); 831 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
898 /* 832 /*
899 * Make sure to pull in the mode here as well in 833 * Make sure to pull in the mode here as well in
900 * case the inode is released without being used. 834 * case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
911 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 845 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
912 } 846 }
913 847
914 INIT_LIST_HEAD(&ip->i_reclaim);
915
916 /* 848 /*
917 * The inode format changed when we moved the link count and 849 * The inode format changed when we moved the link count and
918 * made it 32 bits long. If this is an old format inode, 850 * made it 32 bits long. If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
924 * the new format. We don't change the version number so that we 856 * the new format. We don't change the version number so that we
925 * can distinguish this from a real new format inode. 857 * can distinguish this from a real new format inode.
926 */ 858 */
927 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 859 if (ip->i_d.di_version == 1) {
928 ip->i_d.di_nlink = ip->i_d.di_onlink; 860 ip->i_d.di_nlink = ip->i_d.di_onlink;
929 ip->i_d.di_onlink = 0; 861 ip->i_d.di_onlink = 0;
930 ip->i_d.di_projid = 0; 862 ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
938 * around for a while. This helps to keep recently accessed 870 * around for a while. This helps to keep recently accessed
939 * meta-data in-core longer. 871 * meta-data in-core longer.
940 */ 872 */
941 XFS_BUF_SET_REF(bp, XFS_INO_REF); 873 XFS_BUF_SET_REF(bp, XFS_INO_REF);
942 874
943 /* 875 /*
944 * Use xfs_trans_brelse() to release the buffer containing the 876 * Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
953 * to worry about the inode being changed just because we released 885 * to worry about the inode being changed just because we released
954 * the buffer. 886 * the buffer.
955 */ 887 */
888 out_brelse:
956 xfs_trans_brelse(tp, bp); 889 xfs_trans_brelse(tp, bp);
957 *ipp = ip; 890 return error;
958 return 0;
959} 891}
960 892
961/* 893/*
@@ -1049,6 +981,7 @@ xfs_ialloc(
1049 uint flags; 981 uint flags;
1050 int error; 982 int error;
1051 timespec_t tv; 983 timespec_t tv;
984 int filestreams = 0;
1052 985
1053 /* 986 /*
1054 * Call the space management code to pick 987 * Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
1056 */ 989 */
1057 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 990 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1058 ialloc_context, call_again, &ino); 991 ialloc_context, call_again, &ino);
1059 if (error != 0) { 992 if (error)
1060 return error; 993 return error;
1061 }
1062 if (*call_again || ino == NULLFSINO) { 994 if (*call_again || ino == NULLFSINO) {
1063 *ipp = NULL; 995 *ipp = NULL;
1064 return 0; 996 return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
1072 */ 1004 */
1073 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1005 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1074 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1006 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1075 if (error != 0) { 1007 if (error)
1076 return error; 1008 return error;
1077 }
1078 ASSERT(ip != NULL); 1009 ASSERT(ip != NULL);
1079 1010
1080 ip->i_d.di_mode = (__uint16_t)mode; 1011 ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
1093 * here rather than here and in the flush/logging code. 1024 * here rather than here and in the flush/logging code.
1094 */ 1025 */
1095 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1026 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1096 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1027 ip->i_d.di_version == 1) {
1097 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1028 ip->i_d.di_version = 2;
1098 /* 1029 /*
1099 * We've already zeroed the old link count, the projid field, 1030 * We've already zeroed the old link count, the projid field,
1100 * and the pad field. 1031 * and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
1104 /* 1035 /*
1105 * Project ids won't be stored on disk if we are using a version 1 inode. 1036 * Project ids won't be stored on disk if we are using a version 1 inode.
1106 */ 1037 */
1107 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1038 if ((prid != 0) && (ip->i_d.di_version == 1))
1108 xfs_bump_ino_vers2(tp, ip); 1039 xfs_bump_ino_vers2(tp, ip);
1109 1040
1110 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
1155 flags |= XFS_ILOG_DEV; 1086 flags |= XFS_ILOG_DEV;
1156 break; 1087 break;
1157 case S_IFREG: 1088 case S_IFREG:
1158 if (pip && xfs_inode_is_filestream(pip)) { 1089 /*
1159 error = xfs_filestream_associate(pip, ip); 1090 * we can't set up filestreams until after the VFS inode
1160 if (error < 0) 1091 * is set up properly.
1161 return -error; 1092 */
1162 if (!error) 1093 if (pip && xfs_inode_is_filestream(pip))
1163 xfs_iflags_set(ip, XFS_IFILESTREAM); 1094 filestreams = 1;
1164 }
1165 /* fall through */ 1095 /* fall through */
1166 case S_IFDIR: 1096 case S_IFDIR:
1167 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
1227 /* now that we have an i_mode we can setup inode ops and unlock */ 1157 /* now that we have an i_mode we can setup inode ops and unlock */
1228 xfs_setup_inode(ip); 1158 xfs_setup_inode(ip);
1229 1159
1160 /* now we have set up the vfs inode we can associate the filestream */
1161 if (filestreams) {
1162 error = xfs_filestream_associate(pip, ip);
1163 if (error < 0)
1164 return -error;
1165 if (!error)
1166 xfs_iflags_set(ip, XFS_IFILESTREAM);
1167 }
1168
1230 *ipp = ip; 1169 *ipp = ip;
1231 return 0; 1170 return 0;
1232} 1171}
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
1383 * direct I/O with the truncate operation. Also, because we hold 1322 * direct I/O with the truncate operation. Also, because we hold
1384 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1323 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1385 * started until the truncate completes and drops the lock. Essentially, 1324 * started until the truncate completes and drops the lock. Essentially,
1386 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1325 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
1387 * between direct I/Os and the truncate operation. 1326 * ordering between direct I/Os and the truncate operation.
1388 * 1327 *
1389 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1328 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1390 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1329 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
1415 1354
1416 /* wait for the completion of any pending DIOs */ 1355 /* wait for the completion of any pending DIOs */
1417 if (new_size == 0 || new_size < ip->i_size) 1356 if (new_size == 0 || new_size < ip->i_size)
1418 vn_iowait(ip); 1357 xfs_ioend_wait(ip);
1419 1358
1420 /* 1359 /*
1421 * Call toss_pages or flushinval_pages to get rid of pages 1360 * Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
1726 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1665 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1727 xfs_trans_ihold(ntp, ip); 1666 xfs_trans_ihold(ntp, ip);
1728 1667
1729 if (!error) 1668 if (error)
1730 error = xfs_trans_reserve(ntp, 0, 1669 return error;
1670 /*
1671 * transaction commit worked ok so we can drop the extra ticket
1672 * reference that we gained in xfs_trans_dup()
1673 */
1674 xfs_log_ticket_put(ntp->t_ticket);
1675 error = xfs_trans_reserve(ntp, 0,
1731 XFS_ITRUNCATE_LOG_RES(mp), 0, 1676 XFS_ITRUNCATE_LOG_RES(mp), 0,
1732 XFS_TRANS_PERM_LOG_RES, 1677 XFS_TRANS_PERM_LOG_RES,
1733 XFS_ITRUNCATE_LOG_COUNT); 1678 XFS_ITRUNCATE_LOG_COUNT);
@@ -1781,13 +1726,10 @@ xfs_iunlink(
1781 xfs_dinode_t *dip; 1726 xfs_dinode_t *dip;
1782 xfs_buf_t *agibp; 1727 xfs_buf_t *agibp;
1783 xfs_buf_t *ibp; 1728 xfs_buf_t *ibp;
1784 xfs_agnumber_t agno;
1785 xfs_daddr_t agdaddr;
1786 xfs_agino_t agino; 1729 xfs_agino_t agino;
1787 short bucket_index; 1730 short bucket_index;
1788 int offset; 1731 int offset;
1789 int error; 1732 int error;
1790 int agi_ok;
1791 1733
1792 ASSERT(ip->i_d.di_nlink == 0); 1734 ASSERT(ip->i_d.di_nlink == 0);
1793 ASSERT(ip->i_d.di_mode != 0); 1735 ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
1795 1737
1796 mp = tp->t_mountp; 1738 mp = tp->t_mountp;
1797 1739
1798 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1799 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1800
1801 /* 1740 /*
1802 * Get the agi buffer first. It ensures lock ordering 1741 * Get the agi buffer first. It ensures lock ordering
1803 * on the list. 1742 * on the list.
1804 */ 1743 */
1805 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1744 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1806 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1807 if (error) 1745 if (error)
1808 return error; 1746 return error;
1809
1810 /*
1811 * Validate the magic number of the agi block.
1812 */
1813 agi = XFS_BUF_TO_AGI(agibp); 1747 agi = XFS_BUF_TO_AGI(agibp);
1814 agi_ok = 1748
1815 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1816 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1817 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1818 XFS_RANDOM_IUNLINK))) {
1819 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1820 xfs_trans_brelse(tp, agibp);
1821 return XFS_ERROR(EFSCORRUPTED);
1822 }
1823 /* 1749 /*
1824 * Get the index into the agi hash table for the 1750 * Get the index into the agi hash table for the
1825 * list this inode will go on. 1751 * list this inode will go on.
@@ -1837,14 +1763,14 @@ xfs_iunlink(
1837 * Here we put the head pointer into our next pointer, 1763 * Here we put the head pointer into our next pointer,
1838 * and then we fall through to point the head at us. 1764 * and then we fall through to point the head at us.
1839 */ 1765 */
1840 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1766 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1841 if (error) 1767 if (error)
1842 return error; 1768 return error;
1843 1769
1844 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1770 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1845 /* both on-disk, don't endian flip twice */ 1771 /* both on-disk, don't endian flip twice */
1846 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1772 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1847 offset = ip->i_boffset + 1773 offset = ip->i_imap.im_boffset +
1848 offsetof(xfs_dinode_t, di_next_unlinked); 1774 offsetof(xfs_dinode_t, di_next_unlinked);
1849 xfs_trans_inode_buf(tp, ibp); 1775 xfs_trans_inode_buf(tp, ibp);
1850 xfs_trans_log_buf(tp, ibp, offset, 1776 xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
1879 xfs_buf_t *agibp; 1805 xfs_buf_t *agibp;
1880 xfs_buf_t *ibp; 1806 xfs_buf_t *ibp;
1881 xfs_agnumber_t agno; 1807 xfs_agnumber_t agno;
1882 xfs_daddr_t agdaddr;
1883 xfs_agino_t agino; 1808 xfs_agino_t agino;
1884 xfs_agino_t next_agino; 1809 xfs_agino_t next_agino;
1885 xfs_buf_t *last_ibp; 1810 xfs_buf_t *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
1887 short bucket_index; 1812 short bucket_index;
1888 int offset, last_offset = 0; 1813 int offset, last_offset = 0;
1889 int error; 1814 int error;
1890 int agi_ok;
1891 1815
1892 /*
1893 * First pull the on-disk inode from the AGI unlinked list.
1894 */
1895 mp = tp->t_mountp; 1816 mp = tp->t_mountp;
1896
1897 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1817 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1898 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1899 1818
1900 /* 1819 /*
1901 * Get the agi buffer first. It ensures lock ordering 1820 * Get the agi buffer first. It ensures lock ordering
1902 * on the list. 1821 * on the list.
1903 */ 1822 */
1904 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1823 error = xfs_read_agi(mp, tp, agno, &agibp);
1905 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1824 if (error)
1906 if (error) {
1907 cmn_err(CE_WARN,
1908 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
1909 error, mp->m_fsname);
1910 return error; 1825 return error;
1911 } 1826
1912 /*
1913 * Validate the magic number of the agi block.
1914 */
1915 agi = XFS_BUF_TO_AGI(agibp); 1827 agi = XFS_BUF_TO_AGI(agibp);
1916 agi_ok = 1828
1917 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1918 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1919 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1920 XFS_RANDOM_IUNLINK_REMOVE))) {
1921 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1922 mp, agi);
1923 xfs_trans_brelse(tp, agibp);
1924 cmn_err(CE_WARN,
1925 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
1926 mp->m_fsname);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /* 1829 /*
1930 * Get the index into the agi hash table for the 1830 * Get the index into the agi hash table for the
1931 * list this inode will go on. 1831 * list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
1945 * of dealing with the buffer when there is no need to 1845 * of dealing with the buffer when there is no need to
1946 * change it. 1846 * change it.
1947 */ 1847 */
1948 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1848 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1949 if (error) { 1849 if (error) {
1950 cmn_err(CE_WARN, 1850 cmn_err(CE_WARN,
1951 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1851 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
1956 ASSERT(next_agino != 0); 1856 ASSERT(next_agino != 0);
1957 if (next_agino != NULLAGINO) { 1857 if (next_agino != NULLAGINO) {
1958 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1858 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1959 offset = ip->i_boffset + 1859 offset = ip->i_imap.im_boffset +
1960 offsetof(xfs_dinode_t, di_next_unlinked); 1860 offsetof(xfs_dinode_t, di_next_unlinked);
1961 xfs_trans_inode_buf(tp, ibp); 1861 xfs_trans_inode_buf(tp, ibp);
1962 xfs_trans_log_buf(tp, ibp, offset, 1862 xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
1992 } 1892 }
1993 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 1893 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1994 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1894 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1995 &last_ibp, &last_offset); 1895 &last_ibp, &last_offset, 0);
1996 if (error) { 1896 if (error) {
1997 cmn_err(CE_WARN, 1897 cmn_err(CE_WARN,
1998 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1898 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
2007 * Now last_ibp points to the buffer previous to us on 1907 * Now last_ibp points to the buffer previous to us on
2008 * the unlinked list. Pull us from the list. 1908 * the unlinked list. Pull us from the list.
2009 */ 1909 */
2010 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1910 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2011 if (error) { 1911 if (error) {
2012 cmn_err(CE_WARN, 1912 cmn_err(CE_WARN,
2013 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1913 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
2019 ASSERT(next_agino != agino); 1919 ASSERT(next_agino != agino);
2020 if (next_agino != NULLAGINO) { 1920 if (next_agino != NULLAGINO) {
2021 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1921 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2022 offset = ip->i_boffset + 1922 offset = ip->i_imap.im_boffset +
2023 offsetof(xfs_dinode_t, di_next_unlinked); 1923 offsetof(xfs_dinode_t, di_next_unlinked);
2024 xfs_trans_inode_buf(tp, ibp); 1924 xfs_trans_inode_buf(tp, ibp);
2025 xfs_trans_log_buf(tp, ibp, offset, 1925 xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
2160 iip = (xfs_inode_log_item_t *)lip; 2060 iip = (xfs_inode_log_item_t *)lip;
2161 ASSERT(iip->ili_logged == 1); 2061 ASSERT(iip->ili_logged == 1);
2162 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2062 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2163 spin_lock(&mp->m_ail_lock); 2063 xfs_trans_ail_copy_lsn(mp->m_ail,
2164 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2064 &iip->ili_flush_lsn,
2165 spin_unlock(&mp->m_ail_lock); 2065 &iip->ili_item.li_lsn);
2166 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2066 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2167 pre_flushed++; 2067 pre_flushed++;
2168 } 2068 }
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
2183 iip->ili_last_fields = iip->ili_format.ilf_fields; 2083 iip->ili_last_fields = iip->ili_format.ilf_fields;
2184 iip->ili_format.ilf_fields = 0; 2084 iip->ili_format.ilf_fields = 0;
2185 iip->ili_logged = 1; 2085 iip->ili_logged = 1;
2186 spin_lock(&mp->m_ail_lock); 2086 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2187 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2087 &iip->ili_item.li_lsn);
2188 spin_unlock(&mp->m_ail_lock);
2189 2088
2190 xfs_buf_attach_iodone(bp, 2089 xfs_buf_attach_iodone(bp,
2191 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2090 (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
2263 2162
2264 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2163 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2265 2164
2266 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 2165 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2267 if (error) 2166 if (error)
2268 return error; 2167 return error;
2269 2168
@@ -2279,7 +2178,7 @@ xfs_ifree(
2279 * This is a temporary hack that would require a proper fix 2178 * This is a temporary hack that would require a proper fix
2280 * in the future. 2179 * in the future.
2281 */ 2180 */
2282 dip->di_core.di_mode = 0; 2181 dip->di_mode = 0;
2283 2182
2284 if (delete) { 2183 if (delete) {
2285 xfs_ifree_cluster(ip, tp, first_ino); 2184 xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
2312 int rec_diff, 2211 int rec_diff,
2313 int whichfork) 2212 int whichfork)
2314{ 2213{
2214 struct xfs_mount *mp = ip->i_mount;
2315 int cur_max; 2215 int cur_max;
2316 xfs_ifork_t *ifp; 2216 xfs_ifork_t *ifp;
2317 xfs_bmbt_block_t *new_broot; 2217 struct xfs_btree_block *new_broot;
2318 int new_max; 2218 int new_max;
2319 size_t new_size; 2219 size_t new_size;
2320 char *np; 2220 char *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
2335 */ 2235 */
2336 if (ifp->if_broot_bytes == 0) { 2236 if (ifp->if_broot_bytes == 0) {
2337 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2237 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2338 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2238 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
2339 KM_SLEEP);
2340 ifp->if_broot_bytes = (int)new_size; 2239 ifp->if_broot_bytes = (int)new_size;
2341 return; 2240 return;
2342 } 2241 }
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
2347 * location. The records don't change location because 2246 * location. The records don't change location because
2348 * they are kept butted up against the btree block header. 2247 * they are kept butted up against the btree block header.
2349 */ 2248 */
2350 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2249 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2351 new_max = cur_max + rec_diff; 2250 new_max = cur_max + rec_diff;
2352 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2251 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2353 ifp->if_broot = (xfs_bmbt_block_t *) 2252 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2354 kmem_realloc(ifp->if_broot,
2355 new_size,
2356 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2253 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2357 KM_SLEEP); 2254 KM_SLEEP);
2358 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2255 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2359 ifp->if_broot_bytes); 2256 ifp->if_broot_bytes);
2360 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2257 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2361 (int)new_size); 2258 (int)new_size);
2362 ifp->if_broot_bytes = (int)new_size; 2259 ifp->if_broot_bytes = (int)new_size;
2363 ASSERT(ifp->if_broot_bytes <= 2260 ASSERT(ifp->if_broot_bytes <=
2364 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2261 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
2372 * records, just get rid of the root and clear the status bit. 2269 * records, just get rid of the root and clear the status bit.
2373 */ 2270 */
2374 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2271 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2375 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2272 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2376 new_max = cur_max + rec_diff; 2273 new_max = cur_max + rec_diff;
2377 ASSERT(new_max >= 0); 2274 ASSERT(new_max >= 0);
2378 if (new_max > 0) 2275 if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
2380 else 2277 else
2381 new_size = 0; 2278 new_size = 0;
2382 if (new_size > 0) { 2279 if (new_size > 0) {
2383 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2280 new_broot = kmem_alloc(new_size, KM_SLEEP);
2384 /* 2281 /*
2385 * First copy over the btree block header. 2282 * First copy over the btree block header.
2386 */ 2283 */
2387 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2284 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2388 } else { 2285 } else {
2389 new_broot = NULL; 2286 new_broot = NULL;
2390 ifp->if_flags &= ~XFS_IFBROOT; 2287 ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
2397 /* 2294 /*
2398 * First copy the records. 2295 * First copy the records.
2399 */ 2296 */
2400 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2297 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2401 ifp->if_broot_bytes); 2298 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2402 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2403 (int)new_size);
2404 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2299 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2405 2300
2406 /* 2301 /*
2407 * Then copy the pointers. 2302 * Then copy the pointers.
2408 */ 2303 */
2409 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2304 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2410 ifp->if_broot_bytes); 2305 ifp->if_broot_bytes);
2411 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2306 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2412 (int)new_size); 2307 (int)new_size);
2413 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2308 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2414 } 2309 }
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
2511 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2406 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2512} 2407}
2513 2408
2514
2515
2516
2517/*
2518 * Map inode to disk block and offset.
2519 *
2520 * mp -- the mount point structure for the current file system
2521 * tp -- the current transaction
2522 * ino -- the inode number of the inode to be located
2523 * imap -- this structure is filled in with the information necessary
2524 * to retrieve the given inode from disk
2525 * flags -- flags to pass to xfs_dilocate indicating whether or not
2526 * lookups in the inode btree were OK or not
2527 */
2528int
2529xfs_imap(
2530 xfs_mount_t *mp,
2531 xfs_trans_t *tp,
2532 xfs_ino_t ino,
2533 xfs_imap_t *imap,
2534 uint flags)
2535{
2536 xfs_fsblock_t fsbno;
2537 int len;
2538 int off;
2539 int error;
2540
2541 fsbno = imap->im_blkno ?
2542 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2543 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2544 if (error)
2545 return error;
2546
2547 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2548 imap->im_len = XFS_FSB_TO_BB(mp, len);
2549 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2550 imap->im_ioffset = (ushort)off;
2551 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2552
2553 /*
2554 * If the inode number maps to a block outside the bounds
2555 * of the file system then return NULL rather than calling
2556 * read_buf and panicing when we get an error from the
2557 * driver.
2558 */
2559 if ((imap->im_blkno + imap->im_len) >
2560 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2561 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2562 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2563 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2564 (unsigned long long) imap->im_blkno,
2565 (unsigned long long) imap->im_len,
2566 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2567 return EINVAL;
2568 }
2569 return 0;
2570}
2571
2572void 2409void
2573xfs_idestroy_fork( 2410xfs_idestroy_fork(
2574 xfs_inode_t *ip, 2411 xfs_inode_t *ip,
@@ -2613,70 +2450,6 @@ xfs_idestroy_fork(
2613} 2450}
2614 2451
2615/* 2452/*
2616 * This is called free all the memory associated with an inode.
2617 * It must free the inode itself and any buffers allocated for
2618 * if_extents/if_data and if_broot. It must also free the lock
2619 * associated with the inode.
2620 */
2621void
2622xfs_idestroy(
2623 xfs_inode_t *ip)
2624{
2625 switch (ip->i_d.di_mode & S_IFMT) {
2626 case S_IFREG:
2627 case S_IFDIR:
2628 case S_IFLNK:
2629 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2630 break;
2631 }
2632 if (ip->i_afp)
2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2634 mrfree(&ip->i_lock);
2635 mrfree(&ip->i_iolock);
2636
2637#ifdef XFS_INODE_TRACE
2638 ktrace_free(ip->i_trace);
2639#endif
2640#ifdef XFS_BMAP_TRACE
2641 ktrace_free(ip->i_xtrace);
2642#endif
2643#ifdef XFS_BMBT_TRACE
2644 ktrace_free(ip->i_btrace);
2645#endif
2646#ifdef XFS_RW_TRACE
2647 ktrace_free(ip->i_rwtrace);
2648#endif
2649#ifdef XFS_ILOCK_TRACE
2650 ktrace_free(ip->i_lock_trace);
2651#endif
2652#ifdef XFS_DIR2_TRACE
2653 ktrace_free(ip->i_dir_trace);
2654#endif
2655 if (ip->i_itemp) {
2656 /*
2657 * Only if we are shutting down the fs will we see an
2658 * inode still in the AIL. If it is there, we should remove
2659 * it to prevent a use-after-free from occurring.
2660 */
2661 xfs_mount_t *mp = ip->i_mount;
2662 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
2663
2664 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2665 XFS_FORCED_SHUTDOWN(ip->i_mount));
2666 if (lip->li_flags & XFS_LI_IN_AIL) {
2667 spin_lock(&mp->m_ail_lock);
2668 if (lip->li_flags & XFS_LI_IN_AIL)
2669 xfs_trans_delete_ail(mp, lip);
2670 else
2671 spin_unlock(&mp->m_ail_lock);
2672 }
2673 xfs_inode_item_destroy(ip);
2674 }
2675 kmem_zone_free(xfs_inode_zone, ip);
2676}
2677
2678
2679/*
2680 * Increment the pin count of the given buffer. 2453 * Increment the pin count of the given buffer.
2681 * This value is protected by ipinlock spinlock in the mount structure. 2454 * This value is protected by ipinlock spinlock in the mount structure.
2682 */ 2455 */
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
2880 ASSERT(ifp->if_broot_bytes <= 2653 ASSERT(ifp->if_broot_bytes <=
2881 (XFS_IFORK_SIZE(ip, whichfork) + 2654 (XFS_IFORK_SIZE(ip, whichfork) +
2882 XFS_BROOT_SIZE_ADJ)); 2655 XFS_BROOT_SIZE_ADJ));
2883 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2656 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2884 (xfs_bmdr_block_t *)cp, 2657 (xfs_bmdr_block_t *)cp,
2885 XFS_DFORK_SIZE(dip, mp, whichfork)); 2658 XFS_DFORK_SIZE(dip, mp, whichfork));
2886 } 2659 }
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
2889 case XFS_DINODE_FMT_DEV: 2662 case XFS_DINODE_FMT_DEV:
2890 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2663 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2891 ASSERT(whichfork == XFS_DATA_FORK); 2664 ASSERT(whichfork == XFS_DATA_FORK);
2892 dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); 2665 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2893 } 2666 }
2894 break; 2667 break;
2895 2668
2896 case XFS_DINODE_FMT_UUID: 2669 case XFS_DINODE_FMT_UUID:
2897 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2670 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2898 ASSERT(whichfork == XFS_DATA_FORK); 2671 ASSERT(whichfork == XFS_DATA_FORK);
2899 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2672 memcpy(XFS_DFORK_DPTR(dip),
2900 sizeof(uuid_t)); 2673 &ip->i_df.if_u2.if_uuid,
2674 sizeof(uuid_t));
2901 } 2675 }
2902 break; 2676 break;
2903 2677
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
3030 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 2804 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3031 XFS_BUF_UNDONE(bp); 2805 XFS_BUF_UNDONE(bp);
3032 XFS_BUF_STALE(bp); 2806 XFS_BUF_STALE(bp);
3033 XFS_BUF_SHUT(bp);
3034 XFS_BUF_ERROR(bp,EIO); 2807 XFS_BUF_ERROR(bp,EIO);
3035 xfs_biodone(bp); 2808 xfs_biodone(bp);
3036 } else { 2809 } else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
3172 /* 2945 /*
3173 * Get the buffer containing the on-disk inode. 2946 * Get the buffer containing the on-disk inode.
3174 */ 2947 */
3175 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, 2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
3176 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2949 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3177 if (error || !bp) { 2950 if (error || !bp) {
3178 xfs_ifunlock(ip); 2951 xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
3253 } 3026 }
3254 3027
3255 /* set *dip = inode's place in the buffer */ 3028 /* set *dip = inode's place in the buffer */
3256 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3257 3030
3258 /* 3031 /*
3259 * Clear i_update_core before copying out the data. 3032 * Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
3275 */ 3048 */
3276 xfs_synchronize_atime(ip); 3049 xfs_synchronize_atime(ip);
3277 3050
3278 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, 3051 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3279 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3052 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3280 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3053 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3281 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3054 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3282 ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); 3055 ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3283 goto corrupt_out; 3056 goto corrupt_out;
3284 } 3057 }
3285 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3058 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
3342 * because if the inode is dirty at all the core must 3115 * because if the inode is dirty at all the core must
3343 * be. 3116 * be.
3344 */ 3117 */
3345 xfs_dinode_to_disk(&dip->di_core, &ip->i_d); 3118 xfs_dinode_to_disk(dip, &ip->i_d);
3346 3119
3347 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3120 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3348 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3121 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
3354 * convert back to the old inode format. If the superblock version 3127 * convert back to the old inode format. If the superblock version
3355 * has been updated, then make the conversion permanent. 3128 * has been updated, then make the conversion permanent.
3356 */ 3129 */
3357 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3130 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3358 xfs_sb_version_hasnlink(&mp->m_sb)); 3131 if (ip->i_d.di_version == 1) {
3359 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3360 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3132 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3361 /* 3133 /*
3362 * Convert it back. 3134 * Convert it back.
3363 */ 3135 */
3364 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3136 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3365 dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3137 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3366 } else { 3138 } else {
3367 /* 3139 /*
3368 * The superblock version has already been bumped, 3140 * The superblock version has already been bumped,
3369 * so just make the conversion to the new inode 3141 * so just make the conversion to the new inode
3370 * format permanent. 3142 * format permanent.
3371 */ 3143 */
3372 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3144 ip->i_d.di_version = 2;
3373 dip->di_core.di_version = XFS_DINODE_VERSION_2; 3145 dip->di_version = 2;
3374 ip->i_d.di_onlink = 0; 3146 ip->i_d.di_onlink = 0;
3375 dip->di_core.di_onlink = 0; 3147 dip->di_onlink = 0;
3376 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3148 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3377 memset(&(dip->di_core.di_pad[0]), 0, 3149 memset(&(dip->di_pad[0]), 0,
3378 sizeof(dip->di_core.di_pad)); 3150 sizeof(dip->di_pad));
3379 ASSERT(ip->i_d.di_projid == 0); 3151 ASSERT(ip->i_d.di_projid == 0);
3380 } 3152 }
3381 } 3153 }
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
3418 iip->ili_format.ilf_fields = 0; 3190 iip->ili_format.ilf_fields = 0;
3419 iip->ili_logged = 1; 3191 iip->ili_logged = 1;
3420 3192
3421 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3193 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3422 spin_lock(&mp->m_ail_lock); 3194 &iip->ili_item.li_lsn);
3423 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3424 spin_unlock(&mp->m_ail_lock);
3425 3195
3426 /* 3196 /*
3427 * Attach the function xfs_iflush_done to the inode's 3197 * Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
3459} 3229}
3460 3230
3461 3231
3462/*
3463 * Flush all inactive inodes in mp.
3464 */
3465void
3466xfs_iflush_all(
3467 xfs_mount_t *mp)
3468{
3469 xfs_inode_t *ip;
3470
3471 again:
3472 XFS_MOUNT_ILOCK(mp);
3473 ip = mp->m_inodes;
3474 if (ip == NULL)
3475 goto out;
3476
3477 do {
3478 /* Make sure we skip markers inserted by sync */
3479 if (ip->i_mount == NULL) {
3480 ip = ip->i_mnext;
3481 continue;
3482 }
3483
3484 if (!VFS_I(ip)) {
3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again;
3488 }
3489
3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491
3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes);
3494 out:
3495 XFS_MOUNT_IUNLOCK(mp);
3496}
3497 3232
3498#ifdef XFS_ILOCK_TRACE 3233#ifdef XFS_ILOCK_TRACE
3499ktrace_t *xfs_ilock_trace_buf;
3500
3501void 3234void
3502xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3235xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3503{ 3236{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1420c49674d..1f175fa34b2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -19,8 +19,7 @@
19#define __XFS_INODE_H__ 19#define __XFS_INODE_H__
20 20
21struct xfs_dinode; 21struct xfs_dinode;
22struct xfs_dinode_core; 22struct xfs_inode;
23
24 23
25/* 24/*
26 * Fork identifiers. 25 * Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
63typedef struct xfs_ifork { 62typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */ 63 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */ 64 int if_real_bytes; /* bytes allocated in if_u1 */
66 xfs_bmbt_block_t *if_broot; /* file's incore btree root */ 65 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 66 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 67 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 68 unsigned char if_ext_max; /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
84} xfs_ifork_t; 83} xfs_ifork_t;
85 84
86/* 85/*
87 * Flags for xfs_ichgtime(). 86 * Inode location information. Stored in the inode and passed to
87 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89struct xfs_imap {
90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ 90 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
91 91 ushort im_len; /* length in BBs of inode chunk */
92/* 92 ushort im_boffset; /* inode offset in block in bytes */
93 * Per-fork incore inode flags. 93};
94 */
95#define XFS_IFINLINE 0x01 /* Inline data is read in */
96#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
97#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
98#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
99
100/*
101 * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
102 */
103#define XFS_IMAP_LOOKUP 0x1
104#define XFS_IMAP_BULKSTAT 0x2
105
106#ifdef __KERNEL__
107struct bhv_desc;
108struct cred;
109struct ktrace;
110struct xfs_buf;
111struct xfs_bmap_free;
112struct xfs_bmbt_irec;
113struct xfs_bmbt_block;
114struct xfs_inode;
115struct xfs_inode_log_item;
116struct xfs_mount;
117struct xfs_trans;
118struct xfs_dquot;
119
120#if defined(XFS_ILOCK_TRACE)
121#define XFS_ILOCK_KTRACE_SIZE 32
122extern ktrace_t *xfs_ilock_trace_buf;
123extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
124#else
125#define xfs_ilock_trace(i,n,f,ra)
126#endif
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133 94
134/* 95/*
135 * This is the xfs in-core inode structure. 96 * This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
160} xfs_ictimestamp_t; 121} xfs_ictimestamp_t;
161 122
162/* 123/*
163 * NOTE: This structure must be kept identical to struct xfs_dinode_core 124 * NOTE: This structure must be kept identical to struct xfs_dinode
164 * in xfs_dinode.h except for the endianess annotations. 125 * in xfs_dinode.h except for the endianess annotations.
165 */ 126 */
166typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
191 __uint32_t di_gen; /* generation number */ 152 __uint32_t di_gen; /* generation number */
192} xfs_icdinode_t; 153} xfs_icdinode_t;
193 154
194typedef struct { 155/*
195 struct xfs_inode *ip_mnext; /* next inode in mount list */ 156 * Flags for xfs_ichgtime().
196 struct xfs_inode *ip_mprev; /* ptr to prev inode */ 157 */
197 struct xfs_mount *ip_mount; /* fs mount struct ptr */ 158#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
198} xfs_iptr_t; 159#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
160
161/*
162 * Per-fork incore inode flags.
163 */
164#define XFS_IFINLINE 0x01 /* Inline data is read in */
165#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
166#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
167#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
168
169/*
170 * Fork handling.
171 */
172
173#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
174#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
175
176#define XFS_IFORK_PTR(ip,w) \
177 ((w) == XFS_DATA_FORK ? \
178 &(ip)->i_df : \
179 (ip)->i_afp)
180#define XFS_IFORK_DSIZE(ip) \
181 (XFS_IFORK_Q(ip) ? \
182 XFS_IFORK_BOFF(ip) : \
183 XFS_LITINO((ip)->i_mount))
184#define XFS_IFORK_ASIZE(ip) \
185 (XFS_IFORK_Q(ip) ? \
186 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
187 0)
188#define XFS_IFORK_SIZE(ip,w) \
189 ((w) == XFS_DATA_FORK ? \
190 XFS_IFORK_DSIZE(ip) : \
191 XFS_IFORK_ASIZE(ip))
192#define XFS_IFORK_FORMAT(ip,w) \
193 ((w) == XFS_DATA_FORK ? \
194 (ip)->i_d.di_format : \
195 (ip)->i_d.di_aformat)
196#define XFS_IFORK_FMT_SET(ip,w,n) \
197 ((w) == XFS_DATA_FORK ? \
198 ((ip)->i_d.di_format = (n)) : \
199 ((ip)->i_d.di_aformat = (n)))
200#define XFS_IFORK_NEXTENTS(ip,w) \
201 ((w) == XFS_DATA_FORK ? \
202 (ip)->i_d.di_nextents : \
203 (ip)->i_d.di_anextents)
204#define XFS_IFORK_NEXT_SET(ip,w,n) \
205 ((w) == XFS_DATA_FORK ? \
206 ((ip)->i_d.di_nextents = (n)) : \
207 ((ip)->i_d.di_anextents = (n)))
208
209
210
211#ifdef __KERNEL__
212
213struct bhv_desc;
214struct cred;
215struct ktrace;
216struct xfs_buf;
217struct xfs_bmap_free;
218struct xfs_bmbt_irec;
219struct xfs_inode_log_item;
220struct xfs_mount;
221struct xfs_trans;
222struct xfs_dquot;
223
224#if defined(XFS_ILOCK_TRACE)
225#define XFS_ILOCK_KTRACE_SIZE 32
226extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
227#else
228#define xfs_ilock_trace(i,n,f,ra)
229#endif
230
231typedef struct dm_attrs_s {
232 __uint32_t da_dmevmask; /* DMIG event mask */
233 __uint16_t da_dmstate; /* DMIG state info */
234 __uint16_t da_pad; /* DMIG extra padding */
235} dm_attrs_t;
199 236
200typedef struct xfs_inode { 237typedef struct xfs_inode {
201 /* Inode linking and identification information. */ 238 /* Inode linking and identification information. */
202 struct xfs_inode *i_mnext; /* next inode in mount list */
203 struct xfs_inode *i_mprev; /* ptr to prev inode */
204 struct xfs_mount *i_mount; /* fs mount struct ptr */ 239 struct xfs_mount *i_mount; /* fs mount struct ptr */
205 struct list_head i_reclaim; /* reclaim list */
206 struct inode *i_vnode; /* vnode backpointer */
207 struct xfs_dquot *i_udquot; /* user dquot */ 240 struct xfs_dquot *i_udquot; /* user dquot */
208 struct xfs_dquot *i_gdquot; /* group dquot */ 241 struct xfs_dquot *i_gdquot; /* group dquot */
209 242
210 /* Inode location stuff */ 243 /* Inode location stuff */
211 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 244 xfs_ino_t i_ino; /* inode number (agno/agino)*/
212 xfs_daddr_t i_blkno; /* blkno of inode buffer */ 245 struct xfs_imap i_imap; /* location for xfs_imap() */
213 ushort i_len; /* len of inode buffer */
214 ushort i_boffset; /* off of inode in buffer */
215 246
216 /* Extent information. */ 247 /* Extent information. */
217 xfs_ifork_t *i_afp; /* attribute fork pointer */ 248 xfs_ifork_t *i_afp; /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
230 unsigned short i_flags; /* see defined flags below */ 261 unsigned short i_flags; /* see defined flags below */
231 unsigned char i_update_core; /* timestamps/size is dirty */ 262 unsigned char i_update_core; /* timestamps/size is dirty */
232 unsigned char i_update_size; /* di_size field is dirty */ 263 unsigned char i_update_size; /* di_size field is dirty */
233 unsigned int i_gen; /* generation count */
234 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
235 265
236 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
238 xfs_fsize_t i_size; /* in-memory size */ 268 xfs_fsize_t i_size; /* in-memory size */
239 xfs_fsize_t i_new_size; /* size when write completes */ 269 xfs_fsize_t i_new_size; /* size when write completes */
240 atomic_t i_iocount; /* outstanding I/O count */ 270 atomic_t i_iocount; /* outstanding I/O count */
271
272 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */
274
241 /* Trace buffers per inode. */ 275 /* Trace buffers per inode. */
242#ifdef XFS_INODE_TRACE 276#ifdef XFS_INODE_TRACE
243 struct ktrace *i_trace; /* general inode trace */ 277 struct ktrace *i_trace; /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
245#ifdef XFS_BMAP_TRACE 279#ifdef XFS_BMAP_TRACE
246 struct ktrace *i_xtrace; /* inode extent list trace */ 280 struct ktrace *i_xtrace; /* inode extent list trace */
247#endif 281#endif
248#ifdef XFS_BMBT_TRACE 282#ifdef XFS_BTREE_TRACE
249 struct ktrace *i_btrace; /* inode bmap btree trace */ 283 struct ktrace *i_btrace; /* inode bmap btree trace */
250#endif 284#endif
251#ifdef XFS_RW_TRACE 285#ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
265/* Convert from vfs inode to xfs inode */ 299/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode) 300static inline struct xfs_inode *XFS_I(struct inode *inode)
267{ 301{
268 return (struct xfs_inode *)inode->i_private; 302 return container_of(inode, struct xfs_inode, i_vnode);
269} 303}
270 304
271/* convert from xfs inode to vfs inode */ 305/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip) 306static inline struct inode *VFS_I(struct xfs_inode *ip)
273{ 307{
274 return (struct inode *)ip->i_vnode; 308 return &ip->i_vnode;
309}
310
311/*
312 * Get rid of a partially initialized inode.
313 *
314 * We have to go through destroy_inode to make sure allocations
315 * from init_inode_always like the security data are undone.
316 *
317 * We mark the inode bad so that it takes the short cut in
318 * the reclaim path instead of going through the flush path
319 * which doesn't make sense for an inode that has never seen the
320 * light of day.
321 */
322static inline void xfs_destroy_inode(struct xfs_inode *ip)
323{
324 make_bad_inode(VFS_I(ip));
325 return destroy_inode(VFS_I(ip));
275} 326}
276 327
277/* 328/*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
327 spin_unlock(&ip->i_flags_lock); 378 spin_unlock(&ip->i_flags_lock);
328 return ret; 379 return ret;
329} 380}
330#endif /* __KERNEL__ */
331
332 381
333/* 382/*
334 * Fork handling. 383 * Manage the i_flush queue embedded in the inode. This completion
384 * queue synchronizes processes attempting to flush the in-core
385 * inode back to disk.
335 */ 386 */
387static inline void xfs_iflock(xfs_inode_t *ip)
388{
389 wait_for_completion(&ip->i_flush);
390}
336 391
337#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) 392static inline int xfs_iflock_nowait(xfs_inode_t *ip)
338#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) 393{
339 394 return try_wait_for_completion(&ip->i_flush);
340#define XFS_IFORK_PTR(ip,w) \ 395}
341 ((w) == XFS_DATA_FORK ? \
342 &(ip)->i_df : \
343 (ip)->i_afp)
344#define XFS_IFORK_DSIZE(ip) \
345 (XFS_IFORK_Q(ip) ? \
346 XFS_IFORK_BOFF(ip) : \
347 XFS_LITINO((ip)->i_mount))
348#define XFS_IFORK_ASIZE(ip) \
349 (XFS_IFORK_Q(ip) ? \
350 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
351 0)
352#define XFS_IFORK_SIZE(ip,w) \
353 ((w) == XFS_DATA_FORK ? \
354 XFS_IFORK_DSIZE(ip) : \
355 XFS_IFORK_ASIZE(ip))
356#define XFS_IFORK_FORMAT(ip,w) \
357 ((w) == XFS_DATA_FORK ? \
358 (ip)->i_d.di_format : \
359 (ip)->i_d.di_aformat)
360#define XFS_IFORK_FMT_SET(ip,w,n) \
361 ((w) == XFS_DATA_FORK ? \
362 ((ip)->i_d.di_format = (n)) : \
363 ((ip)->i_d.di_aformat = (n)))
364#define XFS_IFORK_NEXTENTS(ip,w) \
365 ((w) == XFS_DATA_FORK ? \
366 (ip)->i_d.di_nextents : \
367 (ip)->i_d.di_anextents)
368#define XFS_IFORK_NEXT_SET(ip,w,n) \
369 ((w) == XFS_DATA_FORK ? \
370 ((ip)->i_d.di_nextents = (n)) : \
371 ((ip)->i_d.di_anextents = (n)))
372 396
373#ifdef __KERNEL__ 397static inline void xfs_ifunlock(xfs_inode_t *ip)
398{
399 complete(&ip->i_flush);
400}
374 401
375/* 402/*
376 * In-core inode flags. 403 * In-core inode flags.
377 */ 404 */
378#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ 405#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
379#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ 406#define XFS_ISTALE 0x0002 /* inode has been staled */
380#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ 407#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
381#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ 408#define XFS_INEW 0x0008 /* inode has just been allocated */
382#define XFS_ISTALE 0x0010 /* inode has been staled */ 409#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
383#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 410#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
384#define XFS_INEW 0x0040
385#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
386#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */
387 /* to the Linux inode state. */
388#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */
389 411
390/* 412/*
391 * Flags for inode locking. 413 * Flags for inode locking.
@@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
460 ((pip)->i_d.di_mode & S_ISGID)) 482 ((pip)->i_d.di_mode & S_ISGID))
461 483
462/* 484/*
463 * Flags for xfs_iget()
464 */
465#define XFS_IGET_CREATE 0x1
466#define XFS_IGET_BULKSTAT 0x2
467
468/*
469 * xfs_iget.c prototypes. 485 * xfs_iget.c prototypes.
470 */ 486 */
471void xfs_ihash_init(struct xfs_mount *);
472void xfs_ihash_free(struct xfs_mount *);
473xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, 487xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
474 struct xfs_trans *); 488 struct xfs_trans *);
475int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 489int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint);
484uint xfs_ilock_map_shared(xfs_inode_t *); 498uint xfs_ilock_map_shared(xfs_inode_t *);
485void xfs_iunlock_map_shared(xfs_inode_t *, uint); 499void xfs_iunlock_map_shared(xfs_inode_t *, uint);
486void xfs_ireclaim(xfs_inode_t *); 500void xfs_ireclaim(xfs_inode_t *);
487int xfs_finish_reclaim(xfs_inode_t *, int, int);
488int xfs_finish_reclaim_all(struct xfs_mount *, int);
489 501
490/* 502/*
491 * xfs_inode.c prototypes. 503 * xfs_inode.c prototypes.
492 */ 504 */
493int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
494 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
495 xfs_daddr_t, uint, uint);
496int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
497 xfs_inode_t **, xfs_daddr_t, uint);
498int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
499int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 505int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
500 xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, 506 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
501 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 507 int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
502void xfs_dinode_from_disk(struct xfs_icdinode *,
503 struct xfs_dinode_core *);
504void xfs_dinode_to_disk(struct xfs_dinode_core *,
505 struct xfs_icdinode *);
506 508
507uint xfs_ip2xflags(struct xfs_inode *); 509uint xfs_ip2xflags(struct xfs_inode *);
508uint xfs_dic2xflags(struct xfs_dinode *); 510uint xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
513 xfs_fsize_t, int, int); 515 xfs_fsize_t, int, int);
514int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
515 517
516void xfs_idestroy_fork(xfs_inode_t *, int);
517void xfs_idestroy(xfs_inode_t *);
518void xfs_idata_realloc(xfs_inode_t *, int, int);
519void xfs_iextract(xfs_inode_t *);
520void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
521void xfs_iroot_realloc(xfs_inode_t *, int, int);
522void xfs_ipin(xfs_inode_t *); 519void xfs_ipin(xfs_inode_t *);
523void xfs_iunpin(xfs_inode_t *); 520void xfs_iunpin(xfs_inode_t *);
524int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
525int xfs_iflush(xfs_inode_t *, uint); 521int xfs_iflush(xfs_inode_t *, uint);
526void xfs_iflush_all(struct xfs_mount *);
527void xfs_ichgtime(xfs_inode_t *, int); 522void xfs_ichgtime(xfs_inode_t *, int);
528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
529void xfs_lock_inodes(xfs_inode_t **, int, uint); 524void xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
532void xfs_synchronize_atime(xfs_inode_t *); 527void xfs_synchronize_atime(xfs_inode_t *);
533void xfs_mark_inode_dirty_sync(xfs_inode_t *); 528void xfs_mark_inode_dirty_sync(xfs_inode_t *);
534 529
530#if defined(XFS_INODE_TRACE)
531
532#define INODE_TRACE_SIZE 16 /* number of trace entries */
533#define INODE_KTRACE_ENTRY 1
534#define INODE_KTRACE_EXIT 2
535#define INODE_KTRACE_HOLD 3
536#define INODE_KTRACE_REF 4
537#define INODE_KTRACE_RELE 5
538
539extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
540extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
541extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
542extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
543extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
544#define xfs_itrace_entry(ip) \
545 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
546#define xfs_itrace_exit(ip) \
547 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
548#define xfs_itrace_exit_tag(ip, tag) \
549 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
550#define xfs_itrace_ref(ip) \
551 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
552
553#else
554#define xfs_itrace_entry(a)
555#define xfs_itrace_exit(a)
556#define xfs_itrace_exit_tag(a, b)
557#define xfs_itrace_hold(a, b, c, d)
558#define xfs_itrace_ref(a)
559#define xfs_itrace_rele(a, b, c, d)
560#endif
561
562#define IHOLD(ip) \
563do { \
564 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
565 atomic_inc(&(VFS_I(ip)->i_count)); \
566 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
567} while (0)
568
569#define IRELE(ip) \
570do { \
571 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
572 iput(VFS_I(ip)); \
573} while (0)
574
575#endif /* __KERNEL__ */
576
577/*
578 * Flags for xfs_iget()
579 */
580#define XFS_IGET_CREATE 0x1
581#define XFS_IGET_BULKSTAT 0x2
582
583int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
584 xfs_ino_t, struct xfs_dinode **,
585 struct xfs_buf **, int *, uint);
586int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
587 struct xfs_inode *, struct xfs_dinode **,
588 struct xfs_buf **, uint);
589int xfs_iread(struct xfs_mount *, struct xfs_trans *,
590 struct xfs_inode *, xfs_daddr_t, uint);
591void xfs_dinode_from_disk(struct xfs_icdinode *,
592 struct xfs_dinode *);
593void xfs_dinode_to_disk(struct xfs_dinode *,
594 struct xfs_icdinode *);
595void xfs_idestroy_fork(struct xfs_inode *, int);
596void xfs_idata_realloc(struct xfs_inode *, int, int);
597void xfs_iroot_realloc(struct xfs_inode *, int, int);
598int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
599int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
600
535xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 601xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
536void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 602void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
537 xfs_bmbt_irec_t *); 603 xfs_bmbt_irec_t *);
@@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
561#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 627#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
562 628
563#ifdef DEBUG 629#ifdef DEBUG
564void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); 630void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
631 xfs_fsize_t);
565#else /* DEBUG */ 632#else /* DEBUG */
566#define xfs_isize_check(mp, ip, isize) 633#define xfs_isize_check(mp, ip, isize)
567#endif /* DEBUG */ 634#endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone;
576extern struct kmem_zone *xfs_inode_zone; 643extern struct kmem_zone *xfs_inode_zone;
577extern struct kmem_zone *xfs_ili_zone; 644extern struct kmem_zone *xfs_ili_zone;
578 645
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
599#endif /* __KERNEL__ */
600
601#endif /* __XFS_INODE_H__ */ 646#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 97c7452e262..977c4aec587 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -281,7 +281,7 @@ xfs_inode_item_format(
281 xfs_mark_inode_dirty_sync(ip); 281 xfs_mark_inode_dirty_sync(ip);
282 282
283 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 283 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
284 vecp->i_len = sizeof(xfs_dinode_core_t); 284 vecp->i_len = sizeof(struct xfs_icdinode);
285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
286 vecp++; 286 vecp++;
287 nvecs++; 287 nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
296 * has a new version number, then we don't bother converting back. 296 * has a new version number, then we don't bother converting back.
297 */ 297 */
298 mp = ip->i_mount; 298 mp = ip->i_mount;
299 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
300 xfs_sb_version_hasnlink(&mp->m_sb)); 300 if (ip->i_d.di_version == 1) {
301 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
302 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
303 /* 302 /*
304 * Convert it back. 303 * Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
311 * so just make the conversion to the new inode 310 * so just make the conversion to the new inode
312 * format permanent. 311 * format permanent.
313 */ 312 */
314 ip->i_d.di_version = XFS_DINODE_VERSION_2; 313 ip->i_d.di_version = 2;
315 ip->i_d.di_onlink = 0; 314 ip->i_d.di_onlink = 0;
316 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
317 } 316 }
@@ -932,6 +931,7 @@ xfs_inode_item_init(
932 iip->ili_item.li_type = XFS_LI_INODE; 931 iip->ili_item.li_type = XFS_LI_INODE;
933 iip->ili_item.li_ops = &xfs_inode_item_ops; 932 iip->ili_item.li_ops = &xfs_inode_item_ops;
934 iip->ili_item.li_mountp = mp; 933 iip->ili_item.li_mountp = mp;
934 iip->ili_item.li_ailp = mp->m_ail;
935 iip->ili_inode = ip; 935 iip->ili_inode = ip;
936 936
937 /* 937 /*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
942 942
943 iip->ili_format.ilf_type = XFS_LI_INODE; 943 iip->ili_format.ilf_type = XFS_LI_INODE;
944 iip->ili_format.ilf_ino = ip->i_ino; 944 iip->ili_format.ilf_ino = ip->i_ino;
945 iip->ili_format.ilf_blkno = ip->i_blkno; 945 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
946 iip->ili_format.ilf_len = ip->i_len; 946 iip->ili_format.ilf_len = ip->i_imap.im_len;
947 iip->ili_format.ilf_boffset = ip->i_boffset; 947 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
948} 948}
949 949
950/* 950/*
@@ -976,9 +976,8 @@ xfs_iflush_done(
976 xfs_buf_t *bp, 976 xfs_buf_t *bp,
977 xfs_inode_log_item_t *iip) 977 xfs_inode_log_item_t *iip)
978{ 978{
979 xfs_inode_t *ip; 979 xfs_inode_t *ip = iip->ili_inode;
980 980 struct xfs_ail *ailp = iip->ili_item.li_ailp;
981 ip = iip->ili_inode;
982 981
983 /* 982 /*
984 * We only want to pull the item from the AIL if it is 983 * We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
991 */ 990 */
992 if (iip->ili_logged && 991 if (iip->ili_logged &&
993 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { 992 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
994 spin_lock(&ip->i_mount->m_ail_lock); 993 spin_lock(&ailp->xa_lock);
995 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 994 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
996 /* 995 /* xfs_trans_ail_delete() drops the AIL lock. */
997 * xfs_trans_delete_ail() drops the AIL lock. 996 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
998 */
999 xfs_trans_delete_ail(ip->i_mount,
1000 (xfs_log_item_t*)iip);
1001 } else { 997 } else {
1002 spin_unlock(&ip->i_mount->m_ail_lock); 998 spin_unlock(&ailp->xa_lock);
1003 } 999 }
1004 } 1000 }
1005 1001
@@ -1031,21 +1027,20 @@ void
1031xfs_iflush_abort( 1027xfs_iflush_abort(
1032 xfs_inode_t *ip) 1028 xfs_inode_t *ip)
1033{ 1029{
1034 xfs_inode_log_item_t *iip; 1030 xfs_inode_log_item_t *iip = ip->i_itemp;
1035 xfs_mount_t *mp; 1031 xfs_mount_t *mp;
1036 1032
1037 iip = ip->i_itemp; 1033 iip = ip->i_itemp;
1038 mp = ip->i_mount; 1034 mp = ip->i_mount;
1039 if (iip) { 1035 if (iip) {
1036 struct xfs_ail *ailp = iip->ili_item.li_ailp;
1040 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1041 spin_lock(&mp->m_ail_lock); 1038 spin_lock(&ailp->xa_lock);
1042 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1043 /* 1040 /* xfs_trans_ail_delete() drops the AIL lock. */
1044 * xfs_trans_delete_ail() drops the AIL lock. 1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
1045 */
1046 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
1047 } else 1042 } else
1048 spin_unlock(&mp->m_ail_lock); 1043 spin_unlock(&ailp->xa_lock);
1049 } 1044 }
1050 iip->ili_logged = 0; 1045 iip->ili_logged = 0;
1051 /* 1046 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 40513077ab3..1ff04cc323a 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114 114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w)
117{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119}
120
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w)
123{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125}
126
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w)
129{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
131}
132
115#ifdef __KERNEL__ 133#ifdef __KERNEL__
116 134
117struct xfs_buf; 135struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
148} xfs_inode_log_item_t; 166} xfs_inode_log_item_t;
149 167
150 168
151#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
152static inline int xfs_ilog_fdata(int w)
153{
154 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
155}
156
157#endif /* __KERNEL__ */
158
159#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
160static inline int xfs_ilog_fbroot(int w)
161{
162 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
163}
164
165#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
166static inline int xfs_ilog_fext(int w)
167{
168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
169}
170
171static inline int xfs_inode_clean(xfs_inode_t *ip) 169static inline int xfs_inode_clean(xfs_inode_t *ip)
172{ 170{
173 return (!ip->i_itemp || 171 return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
175 !ip->i_update_core; 173 !ip->i_update_core;
176} 174}
177 175
178
179#ifdef __KERNEL__
180
181extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 176extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
182extern void xfs_inode_item_destroy(struct xfs_inode *); 177extern void xfs_inode_item_destroy(struct xfs_inode *);
183extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); 178extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67f22b2b44b..911062cf73a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -290,7 +290,6 @@ STATIC int
290xfs_iomap_eof_align_last_fsb( 290xfs_iomap_eof_align_last_fsb(
291 xfs_mount_t *mp, 291 xfs_mount_t *mp,
292 xfs_inode_t *ip, 292 xfs_inode_t *ip,
293 xfs_fsize_t isize,
294 xfs_extlen_t extsize, 293 xfs_extlen_t extsize,
295 xfs_fileoff_t *last_fsb) 294 xfs_fileoff_t *last_fsb)
296{ 295{
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
306 * stripe width and we are allocating past the allocation eof. 305 * stripe width and we are allocating past the allocation eof.
307 */ 306 */
308 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 307 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
309 (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) 308 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
310 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 309 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
311 /* 310 /*
312 * Roundup the allocation request to a stripe unit (m_dalign) boundary 311 * Roundup the allocation request to a stripe unit (m_dalign) boundary
313 * if the file size is >= stripe unit size, and we are allocating past 312 * if the file size is >= stripe unit size, and we are allocating past
314 * the allocation eof. 313 * the allocation eof.
315 */ 314 */
316 else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) 315 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
317 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 316 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
318 317
319 /* 318 /*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
403 xfs_filblks_t count_fsb, resaligned; 402 xfs_filblks_t count_fsb, resaligned;
404 xfs_fsblock_t firstfsb; 403 xfs_fsblock_t firstfsb;
405 xfs_extlen_t extsz, temp; 404 xfs_extlen_t extsz, temp;
406 xfs_fsize_t isize;
407 int nimaps; 405 int nimaps;
408 int bmapi_flag; 406 int bmapi_flag;
409 int quota_flag; 407 int quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
426 rt = XFS_IS_REALTIME_INODE(ip); 424 rt = XFS_IS_REALTIME_INODE(ip);
427 extsz = xfs_get_extsz_hint(ip); 425 extsz = xfs_get_extsz_hint(ip);
428 426
429 isize = ip->i_size;
430 if (ip->i_new_size > isize)
431 isize = ip->i_new_size;
432
433 offset_fsb = XFS_B_TO_FSBT(mp, offset); 427 offset_fsb = XFS_B_TO_FSBT(mp, offset);
434 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 428 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
435 if ((offset + count) > isize) { 429 if ((offset + count) > ip->i_size) {
436 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 430 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
437 &last_fsb);
438 if (error) 431 if (error)
439 goto error_out; 432 goto error_out;
440 } else { 433 } else {
@@ -559,7 +552,6 @@ STATIC int
559xfs_iomap_eof_want_preallocate( 552xfs_iomap_eof_want_preallocate(
560 xfs_mount_t *mp, 553 xfs_mount_t *mp,
561 xfs_inode_t *ip, 554 xfs_inode_t *ip,
562 xfs_fsize_t isize,
563 xfs_off_t offset, 555 xfs_off_t offset,
564 size_t count, 556 size_t count,
565 int ioflag, 557 int ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
573 int n, error, imaps; 565 int n, error, imaps;
574 566
575 *prealloc = 0; 567 *prealloc = 0;
576 if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) 568 if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
577 return 0; 569 return 0;
578 570
579 /* 571 /*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
617 xfs_fileoff_t ioalign; 609 xfs_fileoff_t ioalign;
618 xfs_fsblock_t firstblock; 610 xfs_fsblock_t firstblock;
619 xfs_extlen_t extsz; 611 xfs_extlen_t extsz;
620 xfs_fsize_t isize;
621 int nimaps; 612 int nimaps;
622 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 613 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
623 int prealloc, fsynced = 0; 614 int prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
637 offset_fsb = XFS_B_TO_FSBT(mp, offset); 628 offset_fsb = XFS_B_TO_FSBT(mp, offset);
638 629
639retry: 630retry:
640 isize = ip->i_size; 631 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
641 if (ip->i_new_size > isize)
642 isize = ip->i_new_size;
643
644 error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
645 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 632 ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
646 if (error) 633 if (error)
647 return error; 634 return error;
@@ -655,8 +642,7 @@ retry:
655 } 642 }
656 643
657 if (prealloc || extsz) { 644 if (prealloc || extsz) {
658 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 645 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
659 &last_fsb);
660 if (error) 646 if (error)
661 return error; 647 return error;
662 } 648 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf6754a3c5b..e19d0a8d561 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
69 } 69 }
70 70
71 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
72 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_imap.im_blkno != 0);
73 73
74 dic = &ip->i_d; 74 dic = &ip->i_d;
75 75
@@ -125,13 +125,9 @@ STATIC void
125xfs_bulkstat_one_dinode( 125xfs_bulkstat_one_dinode(
126 xfs_mount_t *mp, /* mount point for filesystem */ 126 xfs_mount_t *mp, /* mount point for filesystem */
127 xfs_ino_t ino, /* inode number to get data for */ 127 xfs_ino_t ino, /* inode number to get data for */
128 xfs_dinode_t *dip, /* dinode inode pointer */ 128 xfs_dinode_t *dic, /* dinode inode pointer */
129 xfs_bstat_t *buf) /* return buffer */ 129 xfs_bstat_t *buf) /* return buffer */
130{ 130{
131 xfs_dinode_core_t *dic; /* dinode core info pointer */
132
133 dic = &dip->di_core;
134
135 /* 131 /*
136 * The inode format changed when we moved the link count and 132 * The inode format changed when we moved the link count and
137 * made it 32 bits long. If this is an old format inode, 133 * made it 32 bits long. If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
143 * the new format. We don't change the version number so that we 139 * the new format. We don't change the version number so that we
144 * can distinguish this from a real new format inode. 140 * can distinguish this from a real new format inode.
145 */ 141 */
146 if (dic->di_version == XFS_DINODE_VERSION_1) { 142 if (dic->di_version == 1) {
147 buf->bs_nlink = be16_to_cpu(dic->di_onlink); 143 buf->bs_nlink = be16_to_cpu(dic->di_onlink);
148 buf->bs_projid = 0; 144 buf->bs_projid = 0;
149 } else { 145 } else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
162 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); 158 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
163 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); 159 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
164 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); 160 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
165 buf->bs_xflags = xfs_dic2xflags(dip); 161 buf->bs_xflags = xfs_dic2xflags(dic);
166 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; 162 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
167 buf->bs_extents = be32_to_cpu(dic->di_nextents); 163 buf->bs_extents = be32_to_cpu(dic->di_nextents);
168 buf->bs_gen = be32_to_cpu(dic->di_gen); 164 buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
173 169
174 switch (dic->di_format) { 170 switch (dic->di_format) {
175 case XFS_DINODE_FMT_DEV: 171 case XFS_DINODE_FMT_DEV:
176 buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); 172 buf->bs_rdev = xfs_dinode_get_rdev(dic);
177 buf->bs_blksize = BLKDEV_IOSIZE; 173 buf->bs_blksize = BLKDEV_IOSIZE;
178 buf->bs_blocks = 0; 174 buf->bs_blocks = 0;
179 break; 175 break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
192 } 188 }
193} 189}
194 190
191/* Return 0 on success or positive error */
195STATIC int 192STATIC int
196xfs_bulkstat_one_fmt( 193xfs_bulkstat_one_fmt(
197 void __user *ubuffer, 194 void __user *ubuffer,
195 int ubsize,
196 int *ubused,
198 const xfs_bstat_t *buffer) 197 const xfs_bstat_t *buffer)
199{ 198{
199 if (ubsize < sizeof(*buffer))
200 return XFS_ERROR(ENOMEM);
200 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 201 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
201 return -EFAULT; 202 return XFS_ERROR(EFAULT);
202 return sizeof(*buffer); 203 if (ubused)
204 *ubused = sizeof(*buffer);
205 return 0;
203} 206}
204 207
205/* 208/*
206 * Return stat information for one inode. 209 * Return stat information for one inode.
207 * Return 0 if ok, else errno. 210 * Return 0 if ok, else errno.
208 */ 211 */
209int /* error status */ 212int /* error status */
210xfs_bulkstat_one( 213xfs_bulkstat_one_int(
211 xfs_mount_t *mp, /* mount point for filesystem */ 214 xfs_mount_t *mp, /* mount point for filesystem */
212 xfs_ino_t ino, /* inode number to get data for */ 215 xfs_ino_t ino, /* inode number to get data for */
213 void __user *buffer, /* buffer to place output in */ 216 void __user *buffer, /* buffer to place output in */
214 int ubsize, /* size of buffer */ 217 int ubsize, /* size of buffer */
215 void *private_data, /* my private data */ 218 bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
216 xfs_daddr_t bno, /* starting bno of inode cluster */ 219 xfs_daddr_t bno, /* starting bno of inode cluster */
217 int *ubused, /* bytes used by me */ 220 int *ubused, /* bytes used by me */
218 void *dibuff, /* on-disk inode buffer */ 221 void *dibuff, /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
221 xfs_bstat_t *buf; /* return buffer */ 224 xfs_bstat_t *buf; /* return buffer */
222 int error = 0; /* error value */ 225 int error = 0; /* error value */
223 xfs_dinode_t *dip; /* dinode inode pointer */ 226 xfs_dinode_t *dip; /* dinode inode pointer */
224 bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
225 227
226 dip = (xfs_dinode_t *)dibuff; 228 dip = (xfs_dinode_t *)dibuff;
227 *stat = BULKSTAT_RV_NOTHING; 229 *stat = BULKSTAT_RV_NOTHING;
228 230
229 if (!buffer || xfs_internal_inum(mp, ino)) 231 if (!buffer || xfs_internal_inum(mp, ino))
230 return XFS_ERROR(EINVAL); 232 return XFS_ERROR(EINVAL);
231 if (ubsize < sizeof(*buf))
232 return XFS_ERROR(ENOMEM);
233 233
234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP); 234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
235 235
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
244 xfs_bulkstat_one_dinode(mp, ino, dip, buf); 244 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
245 } 245 }
246 246
247 error = formatter(buffer, buf); 247 error = formatter(buffer, ubsize, ubused, buf);
248 if (error < 0) { 248 if (error)
249 error = EFAULT;
250 goto out_free; 249 goto out_free;
251 }
252 250
253 *stat = BULKSTAT_RV_DIDONE; 251 *stat = BULKSTAT_RV_DIDONE;
254 if (ubused)
255 *ubused = error;
256 252
257 out_free: 253 out_free:
258 kmem_free(buf); 254 kmem_free(buf);
259 return error; 255 return error;
260} 256}
261 257
258int
259xfs_bulkstat_one(
260 xfs_mount_t *mp, /* mount point for filesystem */
261 xfs_ino_t ino, /* inode number to get data for */
262 void __user *buffer, /* buffer to place output in */
263 int ubsize, /* size of buffer */
264 void *private_data, /* my private data */
265 xfs_daddr_t bno, /* starting bno of inode cluster */
266 int *ubused, /* bytes used by me */
267 void *dibuff, /* on-disk inode buffer */
268 int *stat) /* BULKSTAT_RV_... */
269{
270 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
271 xfs_bulkstat_one_fmt, bno,
272 ubused, dibuff, stat);
273}
274
262/* 275/*
263 * Test to see whether we can use the ondisk inode directly, based 276 * Test to see whether we can use the ondisk inode directly, based
264 * on the given bulkstat flags, filling in dipp accordingly. 277 * on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
287 * to disk yet. This is a temporary hack that would require a proper 300 * to disk yet. This is a temporary hack that would require a proper
288 * fix in the future. 301 * fix in the future.
289 */ 302 */
290 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || 303 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
291 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || 304 !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
292 !dip->di_core.di_mode) 305 !dip->di_mode)
293 return 0; 306 return 0;
294 if (flags & BULKSTAT_FG_QUICK) { 307 if (flags & BULKSTAT_FG_QUICK) {
295 *dipp = dip; 308 *dipp = dip;
296 return 1; 309 return 1;
297 } 310 }
298 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ 311 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
299 aformat = dip->di_core.di_aformat; 312 aformat = dip->di_aformat;
300 if ((XFS_DFORK_Q(dip) == 0) || 313 if ((XFS_DFORK_Q(dip) == 0) ||
301 (aformat == XFS_DINODE_FMT_LOCAL) || 314 (aformat == XFS_DINODE_FMT_LOCAL) ||
302 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { 315 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
303 *dipp = dip; 316 *dipp = dip;
304 return 1; 317 return 1;
305 } 318 }
@@ -359,7 +372,6 @@ xfs_bulkstat(
359 int ubused; /* bytes used by formatter */ 372 int ubused; /* bytes used by formatter */
360 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ 373 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
361 xfs_dinode_t *dip; /* ptr into bp for specific inode */ 374 xfs_dinode_t *dip; /* ptr into bp for specific inode */
362 xfs_inode_t *ip; /* ptr to in-core inode struct */
363 375
364 /* 376 /*
365 * Get the last inode value, see if there's nothing to do. 377 * Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
416 /* 428 /*
417 * Allocate and initialize a btree cursor for ialloc btree. 429 * Allocate and initialize a btree cursor for ialloc btree.
418 */ 430 */
419 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, 431 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
420 (xfs_inode_t *)0, 0);
421 irbp = irbuf; 432 irbp = irbuf;
422 irbufend = irbuf + nirbuf; 433 irbufend = irbuf + nirbuf;
423 end_of_ag = 0; 434 end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
472 * In any case, increment to the next record. 483 * In any case, increment to the next record.
473 */ 484 */
474 if (!error) 485 if (!error)
475 error = xfs_inobt_increment(cur, 0, &tmp); 486 error = xfs_btree_increment(cur, 0, &tmp);
476 } else { 487 } else {
477 /* 488 /*
478 * Start of ag. Lookup the first inode chunk. 489 * Start of ag. Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
539 * Set agino to after this chunk and bump the cursor. 550 * Set agino to after this chunk and bump the cursor.
540 */ 551 */
541 agino = gino + XFS_INODES_PER_CHUNK; 552 agino = gino + XFS_INODES_PER_CHUNK;
542 error = xfs_inobt_increment(cur, 0, &tmp); 553 error = xfs_btree_increment(cur, 0, &tmp);
543 cond_resched(); 554 cond_resched();
544 } 555 }
545 /* 556 /*
@@ -586,6 +597,8 @@ xfs_bulkstat(
586 597
587 if (flags & (BULKSTAT_FG_QUICK | 598 if (flags & (BULKSTAT_FG_QUICK |
588 BULKSTAT_FG_INLINE)) { 599 BULKSTAT_FG_INLINE)) {
600 int offset;
601
589 ino = XFS_AGINO_TO_INO(mp, agno, 602 ino = XFS_AGINO_TO_INO(mp, agno,
590 agino); 603 agino);
591 bno = XFS_AGB_TO_DADDR(mp, agno, 604 bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
594 /* 607 /*
595 * Get the inode cluster buffer 608 * Get the inode cluster buffer
596 */ 609 */
597 ASSERT(xfs_inode_zone != NULL);
598 ip = kmem_zone_zalloc(xfs_inode_zone,
599 KM_SLEEP);
600 ip->i_ino = ino;
601 ip->i_mount = mp;
602 spin_lock_init(&ip->i_flags_lock);
603 if (bp) 610 if (bp)
604 xfs_buf_relse(bp); 611 xfs_buf_relse(bp);
605 error = xfs_itobp(mp, NULL, ip, 612
606 &dip, &bp, bno, 613 error = xfs_inotobp(mp, NULL, ino, &dip,
607 XFS_IMAP_BULKSTAT, 614 &bp, &offset,
608 XFS_BUF_LOCK); 615 XFS_IGET_BULKSTAT);
616
609 if (!error) 617 if (!error)
610 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; 618 clustidx = offset / mp->m_sb.sb_inodesize;
611 kmem_zone_free(xfs_inode_zone, ip);
612 if (XFS_TEST_ERROR(error != 0, 619 if (XFS_TEST_ERROR(error != 0,
613 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, 620 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
614 XFS_RANDOM_BULKSTAT_READ_CHUNK)) { 621 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
842 agino = 0; 849 agino = 0;
843 continue; 850 continue;
844 } 851 }
845 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, 852 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
846 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
847 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 853 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
848 if (error) { 854 if (error) {
849 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 855 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
887 bufidx = 0; 893 bufidx = 0;
888 } 894 }
889 if (left) { 895 if (left) {
890 error = xfs_inobt_increment(cur, 0, &tmp); 896 error = xfs_btree_increment(cur, 0, &tmp);
891 if (error) { 897 if (error) {
892 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 898 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
893 cur = NULL; 899 cur = NULL;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index a1f18fce9b7..1fb04e7deb6 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -71,9 +71,23 @@ xfs_bulkstat_single(
71 71
72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
73 void __user *ubuffer, /* buffer to write to */ 73 void __user *ubuffer, /* buffer to write to */
74 int ubsize, /* remaining user buffer sz */
75 int *ubused, /* bytes used by formatter */
74 const xfs_bstat_t *buffer); /* buffer to read from */ 76 const xfs_bstat_t *buffer); /* buffer to read from */
75 77
76int 78int
79xfs_bulkstat_one_int(
80 xfs_mount_t *mp,
81 xfs_ino_t ino,
82 void __user *buffer,
83 int ubsize,
84 bulkstat_one_fmt_pf formatter,
85 xfs_daddr_t bno,
86 int *ubused,
87 void *dibuff,
88 int *stat);
89
90int
77xfs_bulkstat_one( 91xfs_bulkstat_one(
78 xfs_mount_t *mp, 92 xfs_mount_t *mp,
79 xfs_ino_t ino, 93 xfs_ino_t ino,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3608a0f0a5f..f4726f702a9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
100 100
101 101
102/* local ticket functions */ 102/* local ticket functions */
103STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, 103STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
104 int unit_bytes, 104 int unit_bytes,
105 int count, 105 int count,
106 char clientid, 106 char clientid,
107 uint flags); 107 uint flags);
108STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
109 108
110#if defined(DEBUG) 109#if defined(DEBUG)
111STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 110STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp,
360 */ 359 */
361 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); 360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
362 xlog_ungrant_log_space(log, ticket); 361 xlog_ungrant_log_space(log, ticket);
363 xlog_ticket_put(log, ticket); 362 xfs_log_ticket_put(ticket);
364 } else { 363 } else {
365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
366 xlog_regrant_reserve_log_space(log, ticket); 365 xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp,
514 retval = xlog_regrant_write_log_space(log, internal_ticket); 513 retval = xlog_regrant_write_log_space(log, internal_ticket);
515 } else { 514 } else {
516 /* may sleep if need to allocate more tickets */ 515 /* may sleep if need to allocate more tickets */
517 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, 516 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
518 client, flags); 517 client, flags);
519 if (!internal_ticket) 518 if (!internal_ticket)
520 return XFS_ERROR(ENOMEM); 519 return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
572 /* 571 /*
573 * Initialize the AIL now we have a log. 572 * Initialize the AIL now we have a log.
574 */ 573 */
575 spin_lock_init(&mp->m_ail_lock);
576 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
577 if (error) { 575 if (error) {
578 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
579 goto error; 577 goto error;
580 } 578 }
579 mp->m_log->l_ailp = mp->m_ail;
581 580
582 /* 581 /*
583 * skip log recovery on a norecovery mount. pretend it all 582 * skip log recovery on a norecovery mount. pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
730 spin_lock(&log->l_icloglock); 729 spin_lock(&log->l_icloglock);
731 iclog = log->l_iclog; 730 iclog = log->l_iclog;
732 atomic_inc(&iclog->ic_refcnt); 731 atomic_inc(&iclog->ic_refcnt);
733 spin_unlock(&log->l_icloglock);
734 xlog_state_want_sync(log, iclog); 732 xlog_state_want_sync(log, iclog);
733 spin_unlock(&log->l_icloglock);
735 error = xlog_state_release_iclog(log, iclog); 734 error = xlog_state_release_iclog(log, iclog);
736 735
737 spin_lock(&log->l_icloglock); 736 spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
749 if (tic) { 748 if (tic) {
750 xlog_trace_loggrant(log, tic, "unmount rec"); 749 xlog_trace_loggrant(log, tic, "unmount rec");
751 xlog_ungrant_log_space(log, tic); 750 xlog_ungrant_log_space(log, tic);
752 xlog_ticket_put(log, tic); 751 xfs_log_ticket_put(tic);
753 } 752 }
754 } else { 753 } else {
755 /* 754 /*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
768 spin_lock(&log->l_icloglock); 767 spin_lock(&log->l_icloglock);
769 iclog = log->l_iclog; 768 iclog = log->l_iclog;
770 atomic_inc(&iclog->ic_refcnt); 769 atomic_inc(&iclog->ic_refcnt);
771 spin_unlock(&log->l_icloglock);
772 770
773 xlog_state_want_sync(log, iclog); 771 xlog_state_want_sync(log, iclog);
772 spin_unlock(&log->l_icloglock);
774 error = xlog_state_release_iclog(log, iclog); 773 error = xlog_state_release_iclog(log, iclog);
775 774
776 spin_lock(&log->l_icloglock); 775 spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
906int 905int
907xfs_log_need_covered(xfs_mount_t *mp) 906xfs_log_need_covered(xfs_mount_t *mp)
908{ 907{
909 int needed = 0, gen; 908 int needed = 0;
910 xlog_t *log = mp->m_log; 909 xlog_t *log = mp->m_log;
911 910
912 if (!xfs_fs_writable(mp)) 911 if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
915 spin_lock(&log->l_icloglock); 914 spin_lock(&log->l_icloglock);
916 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 915 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
917 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 916 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
918 && !xfs_trans_first_ail(mp, &gen) 917 && !xfs_trans_ail_tail(log->l_ailp)
919 && xlog_iclogs_empty(log)) { 918 && xlog_iclogs_empty(log)) {
920 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 919 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
921 log->l_covered_state = XLOG_STATE_COVER_DONE; 920 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
952 xfs_lsn_t tail_lsn; 951 xfs_lsn_t tail_lsn;
953 xlog_t *log = mp->m_log; 952 xlog_t *log = mp->m_log;
954 953
955 tail_lsn = xfs_trans_tail_ail(mp); 954 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
956 spin_lock(&log->l_grant_lock); 955 spin_lock(&log->l_grant_lock);
957 if (tail_lsn != 0) { 956 if (tail_lsn != 0) {
958 log->l_tail_lsn = tail_lsn; 957 log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); 1029 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
1031 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1030 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1032 aborted = 0; 1031 aborted = 0;
1033
1034 /*
1035 * Some versions of cpp barf on the recursive definition of
1036 * ic_log -> hic_fields.ic_log and expand ic_log twice when
1037 * it is passed through two macros. Workaround broken cpp.
1038 */
1039 l = iclog->ic_log; 1032 l = iclog->ic_log;
1040 1033
1041 /* 1034 /*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1302 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); 1295 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1303 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1296 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1304 iclog->ic_bp = bp; 1297 iclog->ic_bp = bp;
1305 iclog->hic_data = bp->b_addr; 1298 iclog->ic_data = bp->b_addr;
1306#ifdef DEBUG 1299#ifdef DEBUG
1307 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1300 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1308#endif 1301#endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1322 atomic_set(&iclog->ic_refcnt, 0); 1315 atomic_set(&iclog->ic_refcnt, 0);
1323 spin_lock_init(&iclog->ic_callback_lock); 1316 spin_lock_init(&iclog->ic_callback_lock);
1324 iclog->ic_callback_tail = &(iclog->ic_callback); 1317 iclog->ic_callback_tail = &(iclog->ic_callback);
1325 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; 1318 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1326 1319
1327 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1320 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1328 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1321 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1446 */ 1439 */
1447 if (threshold_lsn && 1440 if (threshold_lsn &&
1448 !XLOG_FORCED_SHUTDOWN(log)) 1441 !XLOG_FORCED_SHUTDOWN(log))
1449 xfs_trans_push_ail(mp, threshold_lsn); 1442 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1450} /* xlog_grant_push_ail */ 1443} /* xlog_grant_push_ail */
1451 1444
1452 1445
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp,
1991 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 1984 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1992 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1985 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1993 record_cnt = data_cnt = 0; 1986 record_cnt = data_cnt = 0;
1987 spin_lock(&log->l_icloglock);
1994 xlog_state_want_sync(log, iclog); 1988 xlog_state_want_sync(log, iclog);
1989 spin_unlock(&log->l_icloglock);
1995 if (commit_iclog) { 1990 if (commit_iclog) {
1996 ASSERT(flags & XLOG_COMMIT_TRANS); 1991 ASSERT(flags & XLOG_COMMIT_TRANS);
1997 *commit_iclog = iclog; 1992 *commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
3200STATIC void 3195STATIC void
3201xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) 3196xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3202{ 3197{
3203 spin_lock(&log->l_icloglock); 3198 ASSERT(spin_is_locked(&log->l_icloglock));
3204 3199
3205 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3200 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3206 xlog_state_switch_iclogs(log, iclog, 0); 3201 xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3208 ASSERT(iclog->ic_state & 3203 ASSERT(iclog->ic_state &
3209 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3204 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3210 } 3205 }
3211 3206}
3212 spin_unlock(&log->l_icloglock);
3213} /* xlog_state_want_sync */
3214
3215 3207
3216 3208
3217/***************************************************************************** 3209/*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3222 */ 3214 */
3223 3215
3224/* 3216/*
3225 * Free a used ticket. 3217 * Free a used ticket when it's refcount falls to zero.
3226 */ 3218 */
3227STATIC void 3219void
3228xlog_ticket_put(xlog_t *log, 3220xfs_log_ticket_put(
3229 xlog_ticket_t *ticket) 3221 xlog_ticket_t *ticket)
3230{ 3222{
3231 sv_destroy(&ticket->t_wait); 3223 ASSERT(atomic_read(&ticket->t_ref) > 0);
3232 kmem_zone_free(xfs_log_ticket_zone, ticket); 3224 if (atomic_dec_and_test(&ticket->t_ref)) {
3233} /* xlog_ticket_put */ 3225 sv_destroy(&ticket->t_wait);
3226 kmem_zone_free(xfs_log_ticket_zone, ticket);
3227 }
3228}
3234 3229
3230xlog_ticket_t *
3231xfs_log_ticket_get(
3232 xlog_ticket_t *ticket)
3233{
3234 ASSERT(atomic_read(&ticket->t_ref) > 0);
3235 atomic_inc(&ticket->t_ref);
3236 return ticket;
3237}
3235 3238
3236/* 3239/*
3237 * Allocate and initialise a new log ticket. 3240 * Allocate and initialise a new log ticket.
3238 */ 3241 */
3239STATIC xlog_ticket_t * 3242STATIC xlog_ticket_t *
3240xlog_ticket_get(xlog_t *log, 3243xlog_ticket_alloc(xlog_t *log,
3241 int unit_bytes, 3244 int unit_bytes,
3242 int cnt, 3245 int cnt,
3243 char client, 3246 char client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log,
3308 unit_bytes += 2*BBSIZE; 3311 unit_bytes += 2*BBSIZE;
3309 } 3312 }
3310 3313
3314 atomic_set(&tic->t_ref, 1);
3311 tic->t_unit_res = unit_bytes; 3315 tic->t_unit_res = unit_bytes;
3312 tic->t_curr_res = unit_bytes; 3316 tic->t_curr_res = unit_bytes;
3313 tic->t_cnt = cnt; 3317 tic->t_cnt = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log,
3323 xlog_tic_reset_res(tic); 3327 xlog_tic_reset_res(tic);
3324 3328
3325 return tic; 3329 return tic;
3326} /* xlog_ticket_get */ 3330}
3327 3331
3328 3332
3329/****************************************************************************** 3333/******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log,
3452 ptr = iclog->ic_datap; 3456 ptr = iclog->ic_datap;
3453 base_ptr = ptr; 3457 base_ptr = ptr;
3454 ophead = (xlog_op_header_t *)ptr; 3458 ophead = (xlog_op_header_t *)ptr;
3455 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3459 xhdr = iclog->ic_data;
3456 for (i = 0; i < len; i++) { 3460 for (i = 0; i < len; i++) {
3457 ophead = (xlog_op_header_t *)ptr; 3461 ophead = (xlog_op_header_t *)ptr;
3458 3462
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
3558 if (!log || 3562 if (!log ||
3559 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3563 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3560 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3564 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3561 XFS_BUF_DONE(mp->m_sb_bp); 3565 if (mp->m_sb_bp)
3566 XFS_BUF_DONE(mp->m_sb_bp);
3562 return 0; 3567 return 0;
3563 } 3568 }
3564 3569
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
3579 spin_lock(&log->l_icloglock); 3584 spin_lock(&log->l_icloglock);
3580 spin_lock(&log->l_grant_lock); 3585 spin_lock(&log->l_grant_lock);
3581 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3586 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3582 XFS_BUF_DONE(mp->m_sb_bp); 3587 if (mp->m_sb_bp)
3588 XFS_BUF_DONE(mp->m_sb_bp);
3589
3583 /* 3590 /*
3584 * This flag is sort of redundant because of the mount flag, but 3591 * This flag is sort of redundant because of the mount flag, but
3585 * it's good to maintain the separation between the log and the rest 3592 * it's good to maintain the separation between the log and the rest
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d47b91f1082..8a3e84e900a 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 134#ifdef __KERNEL__
135/* Log manager interfaces */ 135/* Log manager interfaces */
136struct xfs_mount; 136struct xfs_mount;
137struct xlog_ticket;
137xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
138 xfs_log_ticket_t ticket, 139 xfs_log_ticket_t ticket,
139 void **iclog, 140 void **iclog,
@@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp);
177 178
178void xlog_iodone(struct xfs_buf *); 179void xlog_iodone(struct xfs_buf *);
179 180
181struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
182void xfs_log_ticket_put(struct xlog_ticket *ticket);
183
180#endif 184#endif
181 185
182 186
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e7d8f84443f..654167be0ef 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
248 atomic_t t_ref; /* ticket reference count : 4 */
248 int t_curr_res; /* current reservation in bytes : 4 */ 249 int t_curr_res; /* current reservation in bytes : 4 */
249 int t_unit_res; /* unit reservation in bytes : 4 */ 250 int t_unit_res; /* unit reservation in bytes : 4 */
250 char t_ocnt; /* original count : 1 */ 251 char t_ocnt; /* original count : 1 */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
309} xlog_rec_ext_header_t; 310} xlog_rec_ext_header_t;
310 311
311#ifdef __KERNEL__ 312#ifdef __KERNEL__
313
314/*
315 * Quite misnamed, because this union lays out the actual on-disk log buffer.
316 */
317typedef union xlog_in_core2 {
318 xlog_rec_header_t hic_header;
319 xlog_rec_ext_header_t hic_xheader;
320 char hic_sector[XLOG_HEADER_SIZE];
321} xlog_in_core_2_t;
322
312/* 323/*
313 * - A log record header is 512 bytes. There is plenty of room to grow the 324 * - A log record header is 512 bytes. There is plenty of room to grow the
314 * xlog_rec_header_t into the reserved space. 325 * xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
338 * We'll put all the read-only and l_icloglock fields in the first cacheline, 349 * We'll put all the read-only and l_icloglock fields in the first cacheline,
339 * and move everything else out to subsequent cachelines. 350 * and move everything else out to subsequent cachelines.
340 */ 351 */
341typedef struct xlog_iclog_fields { 352typedef struct xlog_in_core {
342 sv_t ic_force_wait; 353 sv_t ic_force_wait;
343 sv_t ic_write_wait; 354 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 355 struct xlog_in_core *ic_next;
@@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields {
361 372
362 /* reference counts need their own cacheline */ 373 /* reference counts need their own cacheline */
363 atomic_t ic_refcnt ____cacheline_aligned_in_smp; 374 atomic_t ic_refcnt ____cacheline_aligned_in_smp;
364} xlog_iclog_fields_t; 375 xlog_in_core_2_t *ic_data;
365 376#define ic_header ic_data->hic_header
366typedef union xlog_in_core2 {
367 xlog_rec_header_t hic_header;
368 xlog_rec_ext_header_t hic_xheader;
369 char hic_sector[XLOG_HEADER_SIZE];
370} xlog_in_core_2_t;
371
372typedef struct xlog_in_core {
373 xlog_iclog_fields_t hic_fields;
374 xlog_in_core_2_t *hic_data;
375} xlog_in_core_t; 377} xlog_in_core_t;
376 378
377/* 379/*
378 * Defines to save our code from this glop.
379 */
380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp
385#define ic_log hic_fields.ic_log
386#define ic_callback hic_fields.ic_callback
387#define ic_callback_lock hic_fields.ic_callback_lock
388#define ic_callback_tail hic_fields.ic_callback_tail
389#define ic_trace hic_fields.ic_trace
390#define ic_size hic_fields.ic_size
391#define ic_offset hic_fields.ic_offset
392#define ic_refcnt hic_fields.ic_refcnt
393#define ic_bwritecnt hic_fields.ic_bwritecnt
394#define ic_state hic_fields.ic_state
395#define ic_datap hic_fields.ic_datap
396#define ic_header hic_data->hic_header
397
398/*
399 * The reservation head lsn is not made up of a cycle number and block number. 380 * The reservation head lsn is not made up of a cycle number and block number.
400 * Instead, it uses a cycle number and byte number. Logs don't expect to 381 * Instead, it uses a cycle number and byte number. Logs don't expect to
401 * overflow 31 bits worth of byte offset, so using a byte number will mean 382 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
404typedef struct log { 385typedef struct log {
405 /* The following fields don't need locking */ 386 /* The following fields don't need locking */
406 struct xfs_mount *l_mp; /* mount point */ 387 struct xfs_mount *l_mp; /* mount point */
388 struct xfs_ail *l_ailp; /* AIL log is working with */
407 struct xfs_buf *l_xbuf; /* extra buffer for log 389 struct xfs_buf *l_xbuf; /* extra buffer for log
408 * wrapping */ 390 * wrapping */
409 struct xfs_buftarg *l_targ; /* buftarg of log */ 391 struct xfs_buftarg *l_targ; /* buftarg of log */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 70e3ba32e6b..35cca98bd94 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -36,7 +36,6 @@
36#include "xfs_dinode.h" 36#include "xfs_dinode.h"
37#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h" 38#include "xfs_inode_item.h"
39#include "xfs_imap.h"
40#include "xfs_alloc.h" 39#include "xfs_alloc.h"
41#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
42#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
55#if defined(DEBUG) 54#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 55STATIC void xlog_recover_check_summary(xlog_t *);
57STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
58#else 56#else
59#define xlog_recover_check_summary(log) 57#define xlog_recover_check_summary(log)
60#define xlog_recover_check_ail(mp, lip, gen)
61#endif 58#endif
62 59
63 60
@@ -270,21 +267,16 @@ STATIC void
270xlog_recover_iodone( 267xlog_recover_iodone(
271 struct xfs_buf *bp) 268 struct xfs_buf *bp)
272{ 269{
273 xfs_mount_t *mp;
274
275 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
276
277 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
278 /* 271 /*
279 * We're not going to bother about retrying 272 * We're not going to bother about retrying
280 * this during recovery. One strike! 273 * this during recovery. One strike!
281 */ 274 */
282 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
283 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
284 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
285 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
286 } 278 }
287 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
288 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
289 xfs_biodone(bp); 281 xfs_biodone(bp);
290} 282}
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
2228 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2229 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2230 } else { 2222 } else {
2231 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2232 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2233 XFS_BUF_SET_FSPRIVATE(bp, mp);
2234 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2235 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2236 } 2227 }
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
2247 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2248 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2249 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2250 xfs_imap_t imap;
2251 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2252 xfs_ino_t ino; 2242 xfs_ino_t ino;
2253 int len; 2243 int len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
2275 } 2265 }
2276 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2277 mp = log->l_mp; 2267 mp = log->l_mp;
2278 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2279 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2280 imap.im_len = in_f->ilf_len;
2281 imap.im_boffset = in_f->ilf_boffset;
2282 } else {
2283 /*
2284 * It's an old inode format record. We don't know where
2285 * its cluster is located on disk, and we can't allow
2286 * xfs_imap() to figure it out because the inode btrees
2287 * are not ready to be used. Therefore do not pass the
2288 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2289 * us only the single block in which the inode lives
2290 * rather than its cluster, so we must make sure to
2291 * invalidate the buffer when we write it out below.
2292 */
2293 imap.im_blkno = 0;
2294 error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2295 if (error)
2296 goto error;
2297 }
2298 2268
2299 /* 2269 /*
2300 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2301 * and do not replay the inode. 2271 * and do not replay the inode.
2302 */ 2272 */
2303 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2274 in_f->ilf_len, 0)) {
2304 error = 0; 2275 error = 0;
2305 goto error; 2276 goto error;
2306 } 2277 }
2307 2278
2308 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2309 XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2310 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2311 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2312 bp, imap.im_blkno); 2283 bp, in_f->ilf_blkno);
2313 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2314 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2315 goto error; 2286 goto error;
2316 } 2287 }
2317 error = 0; 2288 error = 0;
2318 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2319 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2320 2291
2321 /* 2292 /*
2322 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2323 * like an inode! 2294 * like an inode!
2324 */ 2295 */
2325 if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2326 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2327 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2328 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
2345 } 2316 }
2346 2317
2347 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2348 if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2349 /* 2320 /*
2350 * Deal with the wrap case, DI_MAX_FLUSH is less 2321 * Deal with the wrap case, DI_MAX_FLUSH is less
2351 * than smaller numbers 2322 * than smaller numbers
2352 */ 2323 */
2353 if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2354 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2355 /* do nothing */ 2326 /* do nothing */
2356 } else { 2327 } else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
2410 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2411 goto error; 2382 goto error;
2412 } 2383 }
2413 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2415 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
2422 } 2393 }
2423 2394
2424 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2425 xfs_dinode_to_disk(&dip->di_core, 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2426 (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2427 2397
2428 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2429 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2430 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2431 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2432 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2433 } 2403 }
2434 2404
2435 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2436 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2437 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2438 dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2439 break; 2409 break;
2440 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2441 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; 2411 memcpy(XFS_DFORK_DPTR(dip),
2412 &in_f->ilf_u.ilfu_uuid,
2413 sizeof(uuid_t));
2442 break; 2414 break;
2443 } 2415 }
2444 2416
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
2454 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2455 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2456 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2457 memcpy(&dip->di_u, src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2458 break; 2430 break;
2459 2431
2460 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2461 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2462 &(dip->di_u.di_bmbt), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2463 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2464 break; 2436 break;
2465 2437
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
2496 2468
2497 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2498 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2499 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2500 (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2501 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2502 break; 2474 break;
2503 2475
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
2512 2484
2513write_inode_buffer: 2485write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2515 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2516 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2517 XFS_BUF_SET_FSPRIVATE(bp, mp);
2518 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2519 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2520 } else { 2491 } else {
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
2645 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2646 2617
2647 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2648 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2649 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2650 XFS_BUF_SET_FSPRIVATE(bp, mp);
2651 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2652 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2653 2623
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
2689 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2690 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2691 2661
2692 spin_lock(&mp->m_ail_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2693 /* 2663 /*
2694 * xfs_trans_update_ail() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2695 */ 2665 */
2696 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2697 return 0; 2667 return 0;
2698} 2668}
2699 2669
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
2712 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2713 int pass) 2683 int pass)
2714{ 2684{
2715 xfs_mount_t *mp;
2716 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2717 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2718 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2719 int gen;
2720 __uint64_t efi_id; 2688 __uint64_t efi_id;
2689 struct xfs_ail_cursor cur;
2690 struct xfs_ail *ailp = log->l_ailp;
2721 2691
2722 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2723 return; 2693 return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
2734 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2735 * in the AIL. 2705 * in the AIL.
2736 */ 2706 */
2737 mp = log->l_mp; 2707 spin_lock(&ailp->xa_lock);
2738 spin_lock(&mp->m_ail_lock); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2739 lip = xfs_trans_first_ail(mp, &gen);
2740 while (lip != NULL) { 2709 while (lip != NULL) {
2741 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2742 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2743 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2744 /* 2713 /*
2745 * xfs_trans_delete_ail() drops the 2714 * xfs_trans_ail_delete() drops the
2746 * AIL lock. 2715 * AIL lock.
2747 */ 2716 */
2748 xfs_trans_delete_ail(mp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2749 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2750 return; 2719 spin_lock(&ailp->xa_lock);
2720 break;
2751 } 2721 }
2752 } 2722 }
2753 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2754 } 2724 }
2755 spin_unlock(&mp->m_ail_lock); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2726 spin_unlock(&ailp->xa_lock);
2756} 2727}
2757 2728
2758/* 2729/*
@@ -3036,33 +3007,6 @@ abort_error:
3036} 3007}
3037 3008
3038/* 3009/*
3039 * Verify that once we've encountered something other than an EFI
3040 * in the AIL that there are no more EFIs in the AIL.
3041 */
3042#if defined(DEBUG)
3043STATIC void
3044xlog_recover_check_ail(
3045 xfs_mount_t *mp,
3046 xfs_log_item_t *lip,
3047 int gen)
3048{
3049 int orig_gen = gen;
3050
3051 do {
3052 ASSERT(lip->li_type != XFS_LI_EFI);
3053 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3054 /*
3055 * The check will be bogus if we restart from the
3056 * beginning of the AIL, so ASSERT that we don't.
3057 * We never should since we're holding the AIL lock
3058 * the entire time.
3059 */
3060 ASSERT(gen == orig_gen);
3061 } while (lip != NULL);
3062}
3063#endif /* DEBUG */
3064
3065/*
3066 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3067 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3068 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
3086{ 3030{
3087 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3088 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3089 int gen;
3090 xfs_mount_t *mp;
3091 int error = 0; 3033 int error = 0;
3034 struct xfs_ail_cursor cur;
3035 struct xfs_ail *ailp;
3092 3036
3093 mp = log->l_mp; 3037 ailp = log->l_ailp;
3094 spin_lock(&mp->m_ail_lock); 3038 spin_lock(&ailp->xa_lock);
3095 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3096 lip = xfs_trans_first_ail(mp, &gen);
3097 while (lip != NULL) { 3040 while (lip != NULL) {
3098 /* 3041 /*
3099 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3043 * There should be no EFIs left in the AIL now.
3100 */ 3044 */
3101 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3102 xlog_recover_check_ail(mp, lip, gen); 3046#ifdef DEBUG
3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3048 ASSERT(lip->li_type != XFS_LI_EFI);
3049#endif
3103 break; 3050 break;
3104 } 3051 }
3105 3052
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
3108 */ 3055 */
3109 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3111 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 3059 continue;
3113 } 3060 }
3114 3061
3115 spin_unlock(&mp->m_ail_lock); 3062 spin_unlock(&ailp->xa_lock);
3116 error = xlog_recover_process_efi(mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3064 spin_lock(&ailp->xa_lock);
3117 if (error) 3065 if (error)
3118 return error; 3066 goto out;
3119 spin_lock(&mp->m_ail_lock); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3120 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3121 } 3068 }
3122 spin_unlock(&mp->m_ail_lock); 3069out:
3070 xfs_trans_ail_cursor_done(ailp, &cur);
3071 spin_unlock(&ailp->xa_lock);
3123 return error; 3072 return error;
3124} 3073}
3125 3074
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
3140 int error; 3089 int error;
3141 3090
3142 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3143 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3144 if (!error) 3093 0, 0, 0);
3145 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3146 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3147 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3148 if (error) 3094 if (error)
3149 goto out_abort; 3095 goto out_abort;
3150 3096
3151 error = EINVAL; 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3152 agi = XFS_BUF_TO_AGI(agibp); 3098 if (error)
3153 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3154 goto out_abort; 3099 goto out_abort;
3155 3100
3101 agi = XFS_BUF_TO_AGI(agibp);
3156 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3157 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3158 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
3172 return; 3118 return;
3173} 3119}
3174 3120
3121STATIC xfs_agino_t
3122xlog_recover_process_one_iunlink(
3123 struct xfs_mount *mp,
3124 xfs_agnumber_t agno,
3125 xfs_agino_t agino,
3126 int bucket)
3127{
3128 struct xfs_buf *ibp;
3129 struct xfs_dinode *dip;
3130 struct xfs_inode *ip;
3131 xfs_ino_t ino;
3132 int error;
3133
3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3136 if (error)
3137 goto fail;
3138
3139 /*
3140 * Get the on disk inode to find the next inode in the bucket.
3141 */
3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3143 if (error)
3144 goto fail_iput;
3145
3146 ASSERT(ip->i_d.di_nlink == 0);
3147 ASSERT(ip->i_d.di_mode != 0);
3148
3149 /* setup for the next pass */
3150 agino = be32_to_cpu(dip->di_next_unlinked);
3151 xfs_buf_relse(ibp);
3152
3153 /*
3154 * Prevent any DMAPI event from being sent when the reference on
3155 * the inode is dropped.
3156 */
3157 ip->i_d.di_dmevmask = 0;
3158
3159 IRELE(ip);
3160 return agino;
3161
3162 fail_iput:
3163 IRELE(ip);
3164 fail:
3165 /*
3166 * We can't read in the inode this bucket points to, or this inode
3167 * is messed up. Just ditch this bucket of inodes. We will lose
3168 * some inodes and space, but at least we won't hang.
3169 *
3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3171 * clear the inode pointer in the bucket.
3172 */
3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3174 return NULLAGINO;
3175}
3176
3175/* 3177/*
3176 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3177 * 3179 *
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
3192 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3193 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3194 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3195 xfs_buf_t *ibp;
3196 xfs_dinode_t *dip;
3197 xfs_inode_t *ip;
3198 xfs_agino_t agino; 3197 xfs_agino_t agino;
3199 xfs_ino_t ino;
3200 int bucket; 3198 int bucket;
3201 int error; 3199 int error;
3202 uint mp_dmevmask; 3200 uint mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
3213 /* 3211 /*
3214 * Find the agi for this ag. 3212 * Find the agi for this ag.
3215 */ 3213 */
3216 agibp = xfs_buf_read(mp->m_ddev_targp, 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3217 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 3215 if (error) {
3218 XFS_FSS_TO_BB(mp, 1), 0); 3216 /*
3219 if (XFS_BUF_ISERROR(agibp)) { 3217 * AGI is b0rked. Don't process it.
3220 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", 3218 *
3221 log->l_mp, agibp, 3219 * We should probably mark the filesystem as corrupt
3222 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); 3220 * after we've recovered all the ag's we can....
3221 */
3222 continue;
3223 } 3223 }
3224 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3225 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3226 3225
3227 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3228
3229 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3230 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3231
3232 /* 3229 /*
3233 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3234 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
3236 */ 3233 */
3237 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3238 3235
3239 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3236 agino = xlog_recover_process_one_iunlink(mp,
3240 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3237 agno, agino, bucket);
3241 ASSERT(error || (ip != NULL));
3242
3243 if (!error) {
3244 /*
3245 * Get the on disk inode to find the
3246 * next inode in the bucket.
3247 */
3248 error = xfs_itobp(mp, NULL, ip, &dip,
3249 &ibp, 0, 0,
3250 XFS_BUF_LOCK);
3251 ASSERT(error || (dip != NULL));
3252 }
3253
3254 if (!error) {
3255 ASSERT(ip->i_d.di_nlink == 0);
3256
3257 /* setup for the next pass */
3258 agino = be32_to_cpu(
3259 dip->di_next_unlinked);
3260 xfs_buf_relse(ibp);
3261 /*
3262 * Prevent any DMAPI event from
3263 * being sent when the
3264 * reference on the inode is
3265 * dropped.
3266 */
3267 ip->i_d.di_dmevmask = 0;
3268
3269 /*
3270 * If this is a new inode, handle
3271 * it specially. Otherwise,
3272 * just drop our reference to the
3273 * inode. If there are no
3274 * other references, this will
3275 * send the inode to
3276 * xfs_inactive() which will
3277 * truncate the file and free
3278 * the inode.
3279 */
3280 if (ip->i_d.di_mode == 0)
3281 xfs_iput_new(ip, 0);
3282 else
3283 IRELE(ip);
3284 } else {
3285 /*
3286 * We can't read in the inode
3287 * this bucket points to, or
3288 * this inode is messed up. Just
3289 * ditch this bucket of inodes. We
3290 * will lose some inodes and space,
3291 * but at least we won't hang. Call
3292 * xlog_recover_clear_agi_bucket()
3293 * to perform a transaction to clear
3294 * the inode pointer in the bucket.
3295 */
3296 xlog_recover_clear_agi_bucket(mp, agno,
3297 bucket);
3298
3299 agino = NULLAGINO;
3300 }
3301 3238
3302 /* 3239 /*
3303 * Reacquire the agibuffer and continue around 3240 * Reacquire the agibuffer and continue around
3304 * the loop. 3241 * the loop. This should never fail as we know
3242 * the buffer was good earlier on.
3305 */ 3243 */
3306 agibp = xfs_buf_read(mp->m_ddev_targp, 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3307 XFS_AG_DADDR(mp, agno, 3245 ASSERT(error == 0);
3308 XFS_AGI_DADDR(mp)),
3309 XFS_FSS_TO_BB(mp, 1), 0);
3310 if (XFS_BUF_ISERROR(agibp)) {
3311 xfs_ioerror_alert(
3312 "xlog_recover_process_iunlinks(#2)",
3313 log->l_mp, agibp,
3314 XFS_AG_DADDR(mp, agno,
3315 XFS_AGI_DADDR(mp)));
3316 }
3317 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3318 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3319 agi->agi_magicnum));
3320 } 3247 }
3321 } 3248 }
3322 3249
@@ -3367,7 +3294,6 @@ xlog_pack_data(
3367 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3368 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3369 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3370 xlog_in_core_2_t *xhdr;
3371 3297
3372 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3373 3299
@@ -3382,7 +3308,8 @@ xlog_pack_data(
3382 } 3308 }
3383 3309
3384 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3385 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3312
3386 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3387 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3388 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
3440 xlog_t *log) 3367 xlog_t *log)
3441{ 3368{
3442 int i, j, k; 3369 int i, j, k;
3443 xlog_in_core_2_t *xhdr;
3444 3370
3445 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3446 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
3449 } 3375 }
3450 3376
3451 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3452 xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3453 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3454 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3455 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
4003{ 3929{
4004 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
4005 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
4006 xfs_agi_t *agip;
4007 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
4008 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
4009 xfs_daddr_t agfdaddr;
4010 xfs_daddr_t agidaddr;
4011 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
4012#ifdef XFS_LOUD_RECOVERY 3935#ifdef XFS_LOUD_RECOVERY
4013 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
4016 __uint64_t freeblks; 3939 __uint64_t freeblks;
4017 __uint64_t itotal; 3940 __uint64_t itotal;
4018 __uint64_t ifree; 3941 __uint64_t ifree;
3942 int error;
4019 3943
4020 mp = log->l_mp; 3944 mp = log->l_mp;
4021 3945
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
4023 itotal = 0LL; 3947 itotal = 0LL;
4024 ifree = 0LL; 3948 ifree = 0LL;
4025 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4026 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4027 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 3951 if (error) {
4028 XFS_FSS_TO_BB(mp, 1), 0); 3952 xfs_fs_cmn_err(CE_ALERT, mp,
4029 if (XFS_BUF_ISERROR(agfbp)) { 3953 "xlog_recover_check_summary(agf)"
4030 xfs_ioerror_alert("xlog_recover_check_summary(agf)", 3954 "agf read failed agno %d error %d",
4031 mp, agfbp, agfdaddr); 3955 agno, error);
4032 } 3956 } else {
4033 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
4034 ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4035 ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); 3959 be32_to_cpu(agfp->agf_flcount);
4036 ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); 3960 xfs_buf_relse(agfbp);
4037
4038 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4039 be32_to_cpu(agfp->agf_flcount);
4040 xfs_buf_relse(agfbp);
4041
4042 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4043 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4044 XFS_FSS_TO_BB(mp, 1), 0);
4045 if (XFS_BUF_ISERROR(agibp)) {
4046 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4047 mp, agibp, agidaddr);
4048 } 3961 }
4049 agip = XFS_BUF_TO_AGI(agibp);
4050 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4051 ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4052 ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4053 3962
4054 itotal += be32_to_cpu(agip->agi_count); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
4055 ifree += be32_to_cpu(agip->agi_freecount); 3964 if (!error) {
4056 xfs_buf_relse(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3966
3967 itotal += be32_to_cpu(agi->agi_count);
3968 ifree += be32_to_cpu(agi->agi_freecount);
3969 xfs_buf_relse(agibp);
3970 }
4057 } 3971 }
4058 3972
4059 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 15f5dd22fbb..3c97c6463a4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
567STATIC void 567STATIC void
568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) 568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
569{ 569{
570 int i;
571
572 mp->m_agfrotor = mp->m_agirotor = 0; 570 mp->m_agfrotor = mp->m_agirotor = 0;
573 spin_lock_init(&mp->m_agirotor_lock); 571 spin_lock_init(&mp->m_agirotor_lock);
574 mp->m_maxagi = mp->m_sb.sb_agcount; 572 mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
577 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 575 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
578 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 576 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
579 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 577 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
580 mp->m_litino = sbp->sb_inodesize - 578 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
581 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
582 mp->m_blockmask = sbp->sb_blocksize - 1; 579 mp->m_blockmask = sbp->sb_blocksize - 1;
583 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 580 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
584 mp->m_blockwmask = mp->m_blockwsize - 1; 581 mp->m_blockwmask = mp->m_blockwsize - 1;
585 INIT_LIST_HEAD(&mp->m_del_inodes);
586 582
587 /* 583 /*
588 * Setup for attributes, in case they get created. 584 * Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
605 } 601 }
606 ASSERT(mp->m_attroffset < XFS_LITINO(mp)); 602 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
607 603
608 for (i = 0; i < 2; i++) { 604 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
609 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 605 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
610 xfs_alloc, i == 0); 606 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
611 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 607 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
612 xfs_alloc, i == 0); 608
613 } 609 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
614 for (i = 0; i < 2; i++) { 610 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
615 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 611 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
616 xfs_bmbt, i == 0); 612 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
617 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 613
618 xfs_bmbt, i == 0); 614 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
619 } 615 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
620 for (i = 0; i < 2; i++) { 616 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
621 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 617 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
622 xfs_inobt, i == 0);
623 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
624 xfs_inobt, i == 0);
625 }
626 618
627 mp->m_bsize = XFS_FSB_TO_BB(mp, 1); 619 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
628 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 620 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
1228 __uint64_t resblks; 1220 __uint64_t resblks;
1229 int error; 1221 int error;
1230 1222
1223 /*
1224 * Release dquot that rootinode, rbmino and rsumino might be holding,
1225 * and release the quota inodes.
1226 */
1227 XFS_QM_UNMOUNT(mp);
1228
1229 if (mp->m_rbmip)
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1231 IRELE(mp->m_rootip); 1233 IRELE(mp->m_rootip);
1232 1234
1233 /* 1235 /*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
1241 * need to force the log first. 1243 * need to force the log first.
1242 */ 1244 */
1243 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1245 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1244 xfs_iflush_all(mp); 1246 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
1245 1247
1246 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 1248 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
1247 1249
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
1288 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1290 xfs_unmountfs_wait(mp); /* wait for async bufs */
1289 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1291 xfs_log_unmount(mp); /* Done! No more fs ops. */
1290 1292
1291 /*
1292 * All inodes from this mount point should be freed.
1293 */
1294 ASSERT(mp->m_inodes == NULL);
1295
1296 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1297 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1298 1295
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
1365 return error; 1362 return error;
1366} 1363}
1367 1364
1368STATIC void
1369xfs_mark_shared_ro(
1370 xfs_mount_t *mp,
1371 xfs_buf_t *bp)
1372{
1373 xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp);
1374 __uint16_t version;
1375
1376 if (!(sb->sb_flags & XFS_SBF_READONLY))
1377 sb->sb_flags |= XFS_SBF_READONLY;
1378
1379 version = be16_to_cpu(sb->sb_versionnum);
1380 if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
1381 !(version & XFS_SB_VERSION_SHAREDBIT))
1382 version |= XFS_SB_VERSION_SHAREDBIT;
1383 sb->sb_versionnum = cpu_to_be16(version);
1384}
1385
1386int 1365int
1387xfs_unmountfs_writesb(xfs_mount_t *mp) 1366xfs_unmountfs_writesb(xfs_mount_t *mp)
1388{ 1367{
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1398 1377
1399 sbp = xfs_getsb(mp, 0); 1378 sbp = xfs_getsb(mp, 0);
1400 1379
1401 /*
1402 * mark shared-readonly if desired
1403 */
1404 if (mp->m_mk_sharedro)
1405 xfs_mark_shared_ro(mp, sbp);
1406
1407 XFS_BUF_UNDONE(sbp); 1380 XFS_BUF_UNDONE(sbp);
1408 XFS_BUF_UNREAD(sbp); 1381 XFS_BUF_UNREAD(sbp);
1409 XFS_BUF_UNDELAYWRITE(sbp); 1382 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1415 if (error) 1388 if (error)
1416 xfs_ioerror_alert("xfs_unmountfs_writesb", 1389 xfs_ioerror_alert("xfs_unmountfs_writesb",
1417 mp, sbp, XFS_BUF_ADDR(sbp)); 1390 mp, sbp, XFS_BUF_ADDR(sbp));
1418 if (error && mp->m_mk_sharedro)
1419 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1420 xfs_buf_relse(sbp); 1391 xfs_buf_relse(sbp);
1421 } 1392 }
1422 return error; 1393 return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f3c1024b124..c1e02846732 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21
22typedef struct xfs_trans_reservations { 21typedef struct xfs_trans_reservations {
23 uint tr_write; /* extent alloc trans */ 22 uint tr_write; /* extent alloc trans */
24 uint tr_itruncate; /* truncate trans */ 23 uint tr_itruncate; /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
44} xfs_trans_reservations_t; 43} xfs_trans_reservations_t;
45 44
46#ifndef __KERNEL__ 45#ifndef __KERNEL__
47/* 46
48 * Moved here from xfs_ag.h to avoid reordering header files
49 */
50#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define XFS_DADDR_TO_AGNO(mp,d) \
51 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
52#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define XFS_DADDR_TO_AGBNO(mp,d) \
53 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
54#else 51
52#else /* __KERNEL__ */
53
54#include "xfs_sync.h"
55
55struct cred; 56struct cred;
56struct log; 57struct log;
57struct xfs_mount_args; 58struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
62struct xfs_swapext; 63struct xfs_swapext;
63struct xfs_mru_cache; 64struct xfs_mru_cache;
64struct xfs_nameops; 65struct xfs_nameops;
66struct xfs_ail;
65 67
66/* 68/*
67 * Prototypes and functions for the Data Migration subsystem. 69 * Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
115 117
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 118typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); 119typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 120typedef void (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 121typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 122typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
121typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); 123typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
132 struct xfs_dquot **, struct xfs_dquot *); 134 struct xfs_dquot **, struct xfs_dquot *);
133typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, 135typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
134 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
135typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
136typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
137typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); 139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
138 140
@@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
223#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 225#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
224#endif 226#endif
225 227
226typedef struct xfs_ail {
227 struct list_head xa_ail;
228 uint xa_gen;
229 struct task_struct *xa_task;
230 xfs_lsn_t xa_target;
231} xfs_ail_t;
232
233typedef struct xfs_mount { 228typedef struct xfs_mount {
234 struct super_block *m_super; 229 struct super_block *m_super;
235 xfs_tid_t m_tid; /* next unused tid for fs */ 230 xfs_tid_t m_tid; /* next unused tid for fs */
236 spinlock_t m_ail_lock; /* fs AIL mutex */ 231 struct xfs_ail *m_ail; /* fs active log item list */
237 xfs_ail_t m_ail; /* fs active log item list */
238 xfs_sb_t m_sb; /* copy of fs superblock */ 232 xfs_sb_t m_sb; /* copy of fs superblock */
239 spinlock_t m_sb_lock; /* sb counter lock */ 233 spinlock_t m_sb_lock; /* sb counter lock */
240 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 234 struct xfs_buf *m_sb_bp; /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
247 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ 241 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
248 spinlock_t m_agirotor_lock;/* .. and lock protecting it */ 242 spinlock_t m_agirotor_lock;/* .. and lock protecting it */
249 xfs_agnumber_t m_maxagi; /* highest inode alloc group */ 243 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
250 struct xfs_inode *m_inodes; /* active inode list */
251 struct list_head m_del_inodes; /* inodes to reclaim */
252 mutex_t m_ilock; /* inode list mutex */
253 uint m_ireclaims; /* count of calls to reclaim*/
254 uint m_readio_log; /* min read size log bytes */ 244 uint m_readio_log; /* min read size log bytes */
255 uint m_readio_blocks; /* min read size blocks */ 245 uint m_readio_blocks; /* min read size blocks */
256 uint m_writeio_log; /* min write size log bytes */ 246 uint m_writeio_log; /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
267 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ 257 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
268 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ 258 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
269 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ 259 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
270 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
271 __uint8_t m_blkbit_log; /* blocklog + NBBY */ 260 __uint8_t m_blkbit_log; /* blocklog + NBBY */
272 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ 261 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
273 __uint8_t m_agno_log; /* log #ag's */ 262 __uint8_t m_agno_log; /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
276 uint m_blockmask; /* sb_blocksize-1 */ 265 uint m_blockmask; /* sb_blocksize-1 */
277 uint m_blockwsize; /* sb_blocksize in words */ 266 uint m_blockwsize; /* sb_blocksize in words */
278 uint m_blockwmask; /* blockwsize-1 */ 267 uint m_blockwmask; /* blockwsize-1 */
279 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ 268 uint m_alloc_mxr[2]; /* max alloc btree records */
280 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ 269 uint m_alloc_mnr[2]; /* min alloc btree records */
281 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ 270 uint m_bmap_dmxr[2]; /* max bmap btree records */
282 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ 271 uint m_bmap_dmnr[2]; /* min bmap btree records */
283 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ 272 uint m_inobt_mxr[2]; /* max inobt btree records */
284 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ 273 uint m_inobt_mnr[2]; /* min inobt btree records */
285 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
286 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
287 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
312 int m_sinoalign; /* stripe unit inode alignment */ 301 int m_sinoalign; /* stripe unit inode alignment */
313 int m_attr_magicpct;/* 37% of the blocksize */ 302 int m_attr_magicpct;/* 37% of the blocksize */
314 int m_dir_magicpct; /* 37% of the dir blocksize */ 303 int m_dir_magicpct; /* 37% of the dir blocksize */
315 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
316 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
317 field governed by m_ilock */
318 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ 304 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
319 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ 305 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
320 int m_dirblksize; /* directory block sz--bytes */ 306 int m_dirblksize; /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
362#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 348#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
363#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 349#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
364#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 350#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
365#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
366#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 351#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
367#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ 352#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
368 /* osyncisdsync is now default*/ 353 /* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439#define xfs_force_shutdown(m,f) \ 424#define xfs_force_shutdown(m,f) \
440 xfs_do_force_shutdown(m, f, __FILE__, __LINE__) 425 xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
441 426
427#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
428#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
429#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
430#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
431#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
432#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
433
434#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
435#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
436
442/* 437/*
443 * Flags for xfs_mountfs 438 * Flags for xfs_mountfs
444 */ 439 */
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
508#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) 503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
509#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) 504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
510 505
511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
512extern int xfs_log_sbcount(xfs_mount_t *, uint); 506extern int xfs_log_sbcount(xfs_mount_t *, uint);
513extern int xfs_mountfs(xfs_mount_t *mp); 507extern int xfs_mountfs(xfs_mount_t *mp);
514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
515 509
516extern void xfs_unmountfs(xfs_mount_t *); 510extern void xfs_unmountfs(xfs_mount_t *);
517extern int xfs_unmountfs_writesb(xfs_mount_t *); 511extern int xfs_unmountfs_writesb(xfs_mount_t *);
518extern int xfs_unmount_flush(xfs_mount_t *, int);
519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 512extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
520extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, 513extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
521 int64_t, int); 514 int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
525extern int xfs_readsb(xfs_mount_t *, int); 518extern int xfs_readsb(xfs_mount_t *, int);
526extern void xfs_freesb(xfs_mount_t *); 519extern void xfs_freesb(xfs_mount_t *);
527extern int xfs_fs_writable(xfs_mount_t *); 520extern int xfs_fs_writable(xfs_mount_t *);
528extern int xfs_syncsub(xfs_mount_t *, int, int *);
529extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
530extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
531extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
532extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
533extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 521extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
534 522
535extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); 523extern int xfs_dmops_get(struct xfs_mount *);
536extern void xfs_dmops_put(struct xfs_mount *); 524extern void xfs_dmops_put(struct xfs_mount *);
537extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); 525extern int xfs_qmops_get(struct xfs_mount *);
538extern void xfs_qmops_put(struct xfs_mount *); 526extern void xfs_qmops_put(struct xfs_mount *);
539 527
540extern struct xfs_dmops xfs_dmcore_xfs; 528extern struct xfs_dmops xfs_dmcore_xfs;
541 529
542#endif /* __KERNEL__ */ 530#endif /* __KERNEL__ */
543 531
532extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
533extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
534extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
535extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
536
544#endif /* __XFS_MOUNT_H__ */ 537#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index a294e58db8d..27f80581520 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -28,7 +28,6 @@
28#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_quota.h" 29#include "xfs_quota.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_clnt.h"
32 31
33 32
34STATIC struct xfs_dquot * 33STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
131}; 130};
132 131
133int 132int
134xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 133xfs_qmops_get(struct xfs_mount *mp)
135{ 134{
136 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { 135 if (XFS_IS_QUOTA_RUNNING(mp)) {
137#ifdef CONFIG_XFS_QUOTA 136#ifdef CONFIG_XFS_QUOTA
138 mp->m_qm_ops = &xfs_qmcore_xfs; 137 mp->m_qm_ops = &xfs_qmcore_xfs;
139#else 138#else
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 12c4ec775af..48965ecaa15 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
84#define XFS_DQ_USER 0x0001 /* a user quota */ 84#define XFS_DQ_USER 0x0001 /* a user quota */
85#define XFS_DQ_PROJ 0x0002 /* project quota */ 85#define XFS_DQ_PROJ 0x0002 /* project quota */
86#define XFS_DQ_GROUP 0x0004 /* a group quota */ 86#define XFS_DQ_GROUP 0x0004 /* a group quota */
87#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ 87#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
88#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ 88#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
89#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ 89#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
90#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
91#define XFS_DQ_MARKER 0x0080 /* sentinel */
92 90
93#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 91#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
94 92
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c903130be7f..86471bb40fd 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -42,31 +42,6 @@
42 42
43 43
44/* 44/*
45 * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
46 * If there are fewer than 4 entries in the array, the empty entries will
47 * be at the end and will have NULL pointers in them.
48 */
49STATIC void
50xfs_rename_unlock4(
51 xfs_inode_t **i_tab,
52 uint lock_mode)
53{
54 int i;
55
56 xfs_iunlock(i_tab[0], lock_mode);
57 for (i = 1; i < 4; i++) {
58 if (i_tab[i] == NULL)
59 break;
60
61 /*
62 * Watch out for duplicate entries in the table.
63 */
64 if (i_tab[i] != i_tab[i-1])
65 xfs_iunlock(i_tab[i], lock_mode);
66 }
67}
68
69/*
70 * Enter all inodes for a rename transaction into a sorted array. 45 * Enter all inodes for a rename transaction into a sorted array.
71 */ 46 */
72STATIC void 47STATIC void
@@ -205,19 +180,6 @@ xfs_rename(
205 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 180 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
206 181
207 /* 182 /*
208 * If we are using project inheritance, we only allow renames
209 * into our tree when the project IDs are the same; else the
210 * tree quota mechanism would be circumvented.
211 */
212 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
213 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
214 error = XFS_ERROR(EXDEV);
215 xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
216 xfs_trans_cancel(tp, cancel_flags);
217 goto std_return;
218 }
219
220 /*
221 * Join all the inodes to the transaction. From this point on, 183 * Join all the inodes to the transaction. From this point on,
222 * we can rely on either trans_commit or trans_cancel to unlock 184 * we can rely on either trans_commit or trans_cancel to unlock
223 * them. Note that we need to add a vnode reference to the 185 * them. Note that we need to add a vnode reference to the
@@ -242,6 +204,17 @@ xfs_rename(
242 } 204 }
243 205
244 /* 206 /*
207 * If we are using project inheritance, we only allow renames
208 * into our tree when the project IDs are the same; else the
209 * tree quota mechanism would be circumvented.
210 */
211 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
212 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
213 error = XFS_ERROR(EXDEV);
214 goto error_return;
215 }
216
217 /*
245 * Set up the target. 218 * Set up the target.
246 */ 219 */
247 if (target_ip == NULL) { 220 if (target_ip == NULL) {
@@ -367,19 +340,11 @@ xfs_rename(
367 &first_block, &free_list, spaceres); 340 &first_block, &free_list, spaceres);
368 if (error) 341 if (error)
369 goto abort_return; 342 goto abort_return;
370 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
371 343
372 /* 344 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
373 * Update the generation counts on all the directory inodes
374 * that we're modifying.
375 */
376 src_dp->i_gen++;
377 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 345 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
378 346 if (new_parent)
379 if (new_parent) {
380 target_dp->i_gen++;
381 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 347 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
382 }
383 348
384 /* 349 /*
385 * If this is a synchronous mount, make sure that the 350 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e2f68de1615..edf12c7b834 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
85{ 85{
86 xfs_fileoff_t bno; /* block number in file */ 86 xfs_fileoff_t bno; /* block number in file */
87 xfs_buf_t *bp; /* temporary buffer for zeroing */ 87 xfs_buf_t *bp; /* temporary buffer for zeroing */
88 int cancelflags; /* flags for xfs_trans_cancel */
89 int committed; /* transaction committed flag */ 88 int committed; /* transaction committed flag */
90 xfs_daddr_t d; /* disk block address */ 89 xfs_daddr_t d; /* disk block address */
91 int error; /* error return value */ 90 int error; /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
96 xfs_bmbt_irec_t map; /* block map output */ 95 xfs_bmbt_irec_t map; /* block map output */
97 int nmap; /* number of block maps */ 96 int nmap; /* number of block maps */
98 int resblks; /* space reservation */ 97 int resblks; /* space reservation */
99 xfs_trans_t *tp; /* transaction pointer */
100 98
101 /* 99 /*
102 * Allocate space to the file, as necessary. 100 * Allocate space to the file, as necessary.
103 */ 101 */
104 while (oblocks < nblocks) { 102 while (oblocks < nblocks) {
103 int cancelflags = 0;
104 xfs_trans_t *tp;
105
105 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); 106 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
106 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); 107 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
107 cancelflags = 0;
108 /* 108 /*
109 * Reserve space & log for one extent added to the file. 109 * Reserve space & log for one extent added to the file.
110 */ 110 */
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
171 mp->m_bsize, 0); 171 mp->m_bsize, 0);
172 if (bp == NULL) { 172 if (bp == NULL) {
173 error = XFS_ERROR(EIO); 173 error = XFS_ERROR(EIO);
174 goto error_cancel; 174error_cancel:
175 xfs_trans_cancel(tp, cancelflags);
176 goto error;
175 } 177 }
176 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 178 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
177 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 179 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
188 oblocks = map.br_startoff + map.br_blockcount; 190 oblocks = map.br_startoff + map.br_blockcount;
189 } 191 }
190 return 0; 192 return 0;
191error_cancel:
192 xfs_trans_cancel(tp, cancelflags);
193error: 193error:
194 return error; 194 return error;
195} 195}
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
1856{ 1856{
1857 xfs_rtblock_t bmbno; /* bitmap block number */ 1857 xfs_rtblock_t bmbno; /* bitmap block number */
1858 xfs_buf_t *bp; /* temporary buffer */ 1858 xfs_buf_t *bp; /* temporary buffer */
1859 int cancelflags; /* flags for xfs_trans_cancel */
1860 int error; /* error return value */ 1859 int error; /* error return value */
1861 xfs_inode_t *ip; /* bitmap inode, used as lock */ 1860 xfs_inode_t *ip; /* bitmap inode, used as lock */
1862 xfs_mount_t *nmp; /* new (fake) mount structure */ 1861 xfs_mount_t *nmp; /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
1872 xfs_extlen_t rsumblocks; /* current number of rt summary blks */ 1871 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1873 xfs_sb_t *sbp; /* old superblock */ 1872 xfs_sb_t *sbp; /* old superblock */
1874 xfs_fsblock_t sumbno; /* summary block number */ 1873 xfs_fsblock_t sumbno; /* summary block number */
1875 xfs_trans_t *tp; /* transaction pointer */
1876 1874
1877 sbp = &mp->m_sb; 1875 sbp = &mp->m_sb;
1878 cancelflags = 0;
1879 /* 1876 /*
1880 * Initial error checking. 1877 * Initial error checking.
1881 */ 1878 */
1879 if (!capable(CAP_SYS_ADMIN))
1880 return XFS_ERROR(EPERM);
1882 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || 1881 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1883 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1884 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
1942 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); 1941 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
1943 bmbno < nrbmblocks; 1942 bmbno < nrbmblocks;
1944 bmbno++) { 1943 bmbno++) {
1944 xfs_trans_t *tp;
1945 int cancelflags = 0;
1946
1945 *nmp = *mp; 1947 *nmp = *mp;
1946 nsbp = &nmp->m_sb; 1948 nsbp = &nmp->m_sb;
1947 /* 1949 /*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
1967 * Start a transaction, get the log reservation. 1969 * Start a transaction, get the log reservation.
1968 */ 1970 */
1969 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); 1971 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
1970 cancelflags = 0;
1971 if ((error = xfs_trans_reserve(tp, 0, 1972 if ((error = xfs_trans_reserve(tp, 0,
1972 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 1973 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
1973 break; 1974 goto error_cancel;
1974 /* 1975 /*
1975 * Lock out other callers by grabbing the bitmap inode lock. 1976 * Lock out other callers by grabbing the bitmap inode lock.
1976 */ 1977 */
1977 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1978 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
1978 XFS_ILOCK_EXCL, &ip))) 1979 XFS_ILOCK_EXCL, &ip)))
1979 break; 1980 goto error_cancel;
1980 ASSERT(ip == mp->m_rbmip); 1981 ASSERT(ip == mp->m_rbmip);
1981 /* 1982 /*
1982 * Update the bitmap inode's size. 1983 * Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
1990 */ 1991 */
1991 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1992 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
1992 XFS_ILOCK_EXCL, &ip))) 1993 XFS_ILOCK_EXCL, &ip)))
1993 break; 1994 goto error_cancel;
1994 ASSERT(ip == mp->m_rsumip); 1995 ASSERT(ip == mp->m_rsumip);
1995 /* 1996 /*
1996 * Update the summary inode's size. 1997 * Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
2005 mp->m_rsumlevels != nmp->m_rsumlevels) { 2006 mp->m_rsumlevels != nmp->m_rsumlevels) {
2006 error = xfs_rtcopy_summary(mp, nmp, tp); 2007 error = xfs_rtcopy_summary(mp, nmp, tp);
2007 if (error) 2008 if (error)
2008 break; 2009 goto error_cancel;
2009 } 2010 }
2010 /* 2011 /*
2011 * Update superblock fields. 2012 * Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
2031 bp = NULL; 2032 bp = NULL;
2032 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, 2033 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2033 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 2034 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2034 if (error) 2035 if (error) {
2036error_cancel:
2037 xfs_trans_cancel(tp, cancelflags);
2035 break; 2038 break;
2039 }
2036 /* 2040 /*
2037 * Mark more blocks free in the superblock. 2041 * Mark more blocks free in the superblock.
2038 */ 2042 */
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
2045 mp->m_rsumsize = nrsumsize; 2049 mp->m_rsumsize = nrsumsize;
2046 2050
2047 error = xfs_trans_commit(tp, 0); 2051 error = xfs_trans_commit(tp, 0);
2048 if (error) { 2052 if (error)
2049 tp = NULL;
2050 break; 2053 break;
2051 }
2052 } 2054 }
2053 2055
2054 if (error && tp)
2055 xfs_trans_cancel(tp, cancelflags);
2056
2057 /* 2056 /*
2058 * Free the fake mp structure. 2057 * Free the fake mp structure.
2059 */ 2058 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3a82576dde9..36f3a21c54d 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -406,7 +406,7 @@ xfs_bwrite(
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 3f8cf1587f4..1ed71916e4c 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -79,6 +79,7 @@ struct xfs_mount;
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ 79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
82 83
83#define XFS_SB_VERSION2_OKREALFBITS \ 84#define XFS_SB_VERSION2_OKREALFBITS \
84 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
296 297
297#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) 298#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
298 299
299#ifdef __KERNEL__
300static inline int xfs_sb_good_version(xfs_sb_t *sbp) 300static inline int xfs_sb_good_version(xfs_sb_t *sbp)
301{ 301{
302 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 302 /* We always support version 1-3 */
303 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 303 if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
304 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 304 sbp->sb_versionnum <= XFS_SB_VERSION_3)
305 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 305 return 1;
306 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 306
307 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 307 /* We support version 4 if all feature bits are supported */
308 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); 308 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
309} 309 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
310 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
311 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
312 return 0;
313
314#ifdef __KERNEL__
315 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
316 return 0;
310#else 317#else
311static inline int xfs_sb_good_version(xfs_sb_t *sbp) 318 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
312{ 319 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
313 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 320 return 0;
314 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 321#endif
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 322
316 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 323 return 1;
317 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 324 }
318 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 325
319 (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ 326 return 0;
320 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
321} 327}
322#endif /* __KERNEL__ */
323 328
324/* 329/*
325 * Detect a mismatched features2 field. Older kernels read/wrote 330 * Detect a mismatched features2 field. Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
332 337
333static inline unsigned xfs_sb_version_tonew(unsigned v) 338static inline unsigned xfs_sb_version_tonew(unsigned v)
334{ 339{
335 return ((((v) == XFS_SB_VERSION_1) ? \ 340 if (v == XFS_SB_VERSION_1)
336 0 : \ 341 return XFS_SB_VERSION_4;
337 (((v) == XFS_SB_VERSION_2) ? \ 342
338 XFS_SB_VERSION_ATTRBIT : \ 343 if (v == XFS_SB_VERSION_2)
339 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ 344 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
340 XFS_SB_VERSION_4); 345
346 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
347 XFS_SB_VERSION_NLINKBIT;
341} 348}
342 349
343static inline unsigned xfs_sb_version_toold(unsigned v) 350static inline unsigned xfs_sb_version_toold(unsigned v)
344{ 351{
345 return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ 352 if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
346 0 : \ 353 return 0;
347 (((v) & XFS_SB_VERSION_NLINKBIT) ? \ 354 if (v & XFS_SB_VERSION_NLINKBIT)
348 XFS_SB_VERSION_3 : \ 355 return XFS_SB_VERSION_3;
349 (((v) & XFS_SB_VERSION_ATTRBIT) ? \ 356 if (v & XFS_SB_VERSION_ATTRBIT)
350 XFS_SB_VERSION_2 : \ 357 return XFS_SB_VERSION_2;
351 XFS_SB_VERSION_1))); 358 return XFS_SB_VERSION_1;
352} 359}
353 360
354static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) 361static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
355{ 362{
356 return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ 363 return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
357 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 364 sbp->sb_versionnum == XFS_SB_VERSION_3 ||
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 365 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); 366 (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
360} 367}
361 368
362static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) 369static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
363{ 370{
364 (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ 371 if (sbp->sb_versionnum == XFS_SB_VERSION_1)
365 XFS_SB_VERSION_2 : \ 372 sbp->sb_versionnum = XFS_SB_VERSION_2;
366 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ 373 else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
367 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ 374 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
368 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); 375 else
376 sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
369} 377}
370 378
371static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) 379static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
372{ 380{
373 return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 381 return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
374 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 382 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
375 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); 383 (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
376} 384}
377 385
378static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) 386static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
379{ 387{
380 (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ 388 if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
381 XFS_SB_VERSION_3 : \ 389 sbp->sb_versionnum = XFS_SB_VERSION_3;
382 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); 390 else
391 sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
383} 392}
384 393
385static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) 394static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
386{ 395{
387 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 396 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
388 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); 397 (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389} 398}
390 399
391static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) 400static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
392{ 401{
393 (sbp)->sb_versionnum = \ 402 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
394 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ 403 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
395 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ 404 else
396 (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ 405 sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
397 XFS_SB_VERSION_QUOTABIT)); 406 XFS_SB_VERSION_QUOTABIT;
398} 407}
399 408
400static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) 409static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
401{ 410{
402 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 411 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
403 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); 412 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
404} 413}
405 414
406static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) 415static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
407{ 416{
408 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 417 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
409 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); 418 (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
410} 419}
411 420
412static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) 421static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
413{ 422{
414 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 423 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
415 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); 424 (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
416} 425}
417 426
418static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) 427static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
419{ 428{
420 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 429 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
421 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); 430 (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
422} 431}
423 432
424static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) 433static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
425{ 434{
426 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 435 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
427 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); 436 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
428} 437}
429 438
430static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) 439static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
431{ 440{
432 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 441 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
433 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); 442 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
434} 443}
435 444
436static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) 445static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
437{ 446{
438 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 447 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
439 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); 448 (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
440} 449}
441 450
442static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) 451static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
443{ 452{
444 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 453 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
445 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); 454 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
446} 455}
447 456
448static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) 457static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
449{ 458{
450 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 459 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
451 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); 460 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
452} 461}
453 462
454/* 463/*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
463 472
464static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) 473static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
465{ 474{
466 return (xfs_sb_version_hasmorebits(sbp) && \ 475 return xfs_sb_version_hasmorebits(sbp) &&
467 ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); 476 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
468} 477}
469 478
470static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) 479static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
471{ 480{
472 return (xfs_sb_version_hasmorebits(sbp)) && \ 481 return xfs_sb_version_hasmorebits(sbp) &&
473 ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); 482 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
474} 483}
475 484
476static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) 485static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
477{ 486{
478 ((sbp)->sb_versionnum = \ 487 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
479 ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ 488 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
480 ((sbp)->sb_features2 = \
481 ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
482} 489}
483 490
484static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) 491static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4e1c22a23be..8570b826fed 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,7 @@ xfs_trans_dup(
290 ASSERT(tp->t_ticket != NULL); 290 ASSERT(tp->t_ticket != NULL);
291 291
292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
293 ntp->t_ticket = tp->t_ticket; 293 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
295 tp->t_blk_res = tp->t_blk_res_used; 295 tp->t_blk_res = tp->t_blk_res_used;
296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; 296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1260,6 +1260,13 @@ xfs_trans_roll(
1260 trans = *tpp; 1260 trans = *tpp;
1261 1261
1262 /* 1262 /*
1263 * transaction commit worked ok so we can drop the extra ticket
1264 * reference that we gained in xfs_trans_dup()
1265 */
1266 xfs_log_ticket_put(trans->t_ticket);
1267
1268
1269 /*
1263 * Reserve space in the log for th next transaction. 1270 * Reserve space in the log for th next transaction.
1264 * This also pushes items in the "AIL", the list of logged items, 1271 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log 1272 * out to disk if they are taking up space at the tail of the log
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
1383 xfs_log_item_desc_t *lidp; 1390 xfs_log_item_desc_t *lidp;
1384 xfs_log_item_t *lip; 1391 xfs_log_item_t *lip;
1385 xfs_lsn_t item_lsn; 1392 xfs_lsn_t item_lsn;
1386 struct xfs_mount *mp;
1387 int i; 1393 int i;
1388 1394
1389 lidp = licp->lic_descs; 1395 lidp = licp->lic_descs;
1390 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1396 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1397 struct xfs_ail *ailp;
1398
1391 if (xfs_lic_isfree(licp, i)) { 1399 if (xfs_lic_isfree(licp, i)) {
1392 continue; 1400 continue;
1393 } 1401 }
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
1424 * This would cause the earlier transaction to fail 1432 * This would cause the earlier transaction to fail
1425 * the test below. 1433 * the test below.
1426 */ 1434 */
1427 mp = lip->li_mountp; 1435 ailp = lip->li_ailp;
1428 spin_lock(&mp->m_ail_lock); 1436 spin_lock(&ailp->xa_lock);
1429 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { 1437 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1430 /* 1438 /*
1431 * This will set the item's lsn to item_lsn 1439 * This will set the item's lsn to item_lsn
1432 * and update the position of the item in 1440 * and update the position of the item in
1433 * the AIL. 1441 * the AIL.
1434 * 1442 *
1435 * xfs_trans_update_ail() drops the AIL lock. 1443 * xfs_trans_ail_update() drops the AIL lock.
1436 */ 1444 */
1437 xfs_trans_update_ail(mp, lip, item_lsn); 1445 xfs_trans_ail_update(ailp, lip, item_lsn);
1438 } else { 1446 } else {
1439 spin_unlock(&mp->m_ail_lock); 1447 spin_unlock(&ailp->xa_lock);
1440 } 1448 }
1441 1449
1442 /* 1450 /*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 74c80bd2b0e..d6fe4a88d79 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_TRANS_H__ 18#ifndef __XFS_TRANS_H__
19#define __XFS_TRANS_H__ 19#define __XFS_TRANS_H__
20 20
21struct xfs_log_item;
22
21/* 23/*
22 * This is the structure written in the log at the head of 24 * This is the structure written in the log at the head of
23 * every transaction. It identifies the type and id of the 25 * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
98#define XFS_TRANS_TYPE_MAX 41 100#define XFS_TRANS_TYPE_MAX 41
99/* new transaction types need to be reflected in xfs_logprint(8) */ 101/* new transaction types need to be reflected in xfs_logprint(8) */
100 102
101
102#ifdef __KERNEL__
103struct xfs_buf;
104struct xfs_buftarg;
105struct xfs_efd_log_item;
106struct xfs_efi_log_item;
107struct xfs_inode;
108struct xfs_item_ops;
109struct xfs_log_iovec;
110struct xfs_log_item;
111struct xfs_log_item_desc;
112struct xfs_mount;
113struct xfs_trans;
114struct xfs_dquot_acct;
115
116typedef struct xfs_log_item {
117 struct list_head li_ail; /* AIL pointers */
118 xfs_lsn_t li_lsn; /* last on-disk lsn */
119 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
120 struct xfs_mount *li_mountp; /* ptr to fs mount */
121 uint li_type; /* item type */
122 uint li_flags; /* misc flags */
123 struct xfs_log_item *li_bio_list; /* buffer item list */
124 void (*li_cb)(struct xfs_buf *,
125 struct xfs_log_item *);
126 /* buffer item iodone */
127 /* callback func */
128 struct xfs_item_ops *li_ops; /* function list */
129} xfs_log_item_t;
130
131#define XFS_LI_IN_AIL 0x1
132#define XFS_LI_ABORTED 0x2
133
134typedef struct xfs_item_ops {
135 uint (*iop_size)(xfs_log_item_t *);
136 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
137 void (*iop_pin)(xfs_log_item_t *);
138 void (*iop_unpin)(xfs_log_item_t *, int);
139 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
140 uint (*iop_trylock)(xfs_log_item_t *);
141 void (*iop_unlock)(xfs_log_item_t *);
142 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
143 void (*iop_push)(xfs_log_item_t *);
144 void (*iop_pushbuf)(xfs_log_item_t *);
145 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
146} xfs_item_ops_t;
147
148#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
149#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
150#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
151#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
152#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
153#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
154#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
155#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
156#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
157#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
158#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
159
160/*
161 * Return values for the IOP_TRYLOCK() routines.
162 */
163#define XFS_ITEM_SUCCESS 0
164#define XFS_ITEM_PINNED 1
165#define XFS_ITEM_LOCKED 2
166#define XFS_ITEM_FLUSHING 3
167#define XFS_ITEM_PUSHBUF 4
168
169#endif /* __KERNEL__ */
170
171/* 103/*
172 * This structure is used to track log items associated with 104 * This structure is used to track log items associated with
173 * a transaction. It points to the log item and keeps some 105 * a transaction. It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
176 * once we get to commit processing (see xfs_trans_commit()). 108 * once we get to commit processing (see xfs_trans_commit()).
177 */ 109 */
178typedef struct xfs_log_item_desc { 110typedef struct xfs_log_item_desc {
179 xfs_log_item_t *lid_item; 111 struct xfs_log_item *lid_item;
180 ushort lid_size; 112 ushort lid_size;
181 unsigned char lid_flags; 113 unsigned char lid_flags;
182 unsigned char lid_index; 114 unsigned char lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
276 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); 208 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
277} 209}
278 210
279#ifdef __KERNEL__
280/*
281 * This structure is used to maintain a list of block ranges that have been
282 * freed in the transaction. The ranges are listed in the perag[] busy list
283 * between when they're freed and the transaction is committed to disk.
284 */
285
286typedef struct xfs_log_busy_slot {
287 xfs_agnumber_t lbc_ag;
288 ushort lbc_idx; /* index in perag.busy[] */
289} xfs_log_busy_slot_t;
290
291#define XFS_LBC_NUM_SLOTS 31
292typedef struct xfs_log_busy_chunk {
293 struct xfs_log_busy_chunk *lbc_next;
294 uint lbc_free; /* free slots bitmask */
295 ushort lbc_unused; /* first unused */
296 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
297} xfs_log_busy_chunk_t;
298
299#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
300#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
301
302#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
303#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
304#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
305#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
306#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
307
308/*
309 * This is the type of function which can be given to xfs_trans_callback()
310 * to be called upon the transaction's commit to disk.
311 */
312typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
313
314/*
315 * This is the structure maintained for every active transaction.
316 */
317typedef struct xfs_trans {
318 unsigned int t_magic; /* magic number */
319 xfs_log_callback_t t_logcb; /* log callback struct */
320 unsigned int t_type; /* transaction type */
321 unsigned int t_log_res; /* amt of log space resvd */
322 unsigned int t_log_count; /* count for perm log res */
323 unsigned int t_blk_res; /* # of blocks resvd */
324 unsigned int t_blk_res_used; /* # of resvd blocks used */
325 unsigned int t_rtx_res; /* # of rt extents resvd */
326 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
327 xfs_log_ticket_t t_ticket; /* log mgr ticket */
328 xfs_lsn_t t_lsn; /* log seq num of start of
329 * transaction. */
330 xfs_lsn_t t_commit_lsn; /* log seq num of end of
331 * transaction. */
332 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
333 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
334 xfs_trans_callback_t t_callback; /* transaction callback */
335 void *t_callarg; /* callback arg */
336 unsigned int t_flags; /* misc flags */
337 int64_t t_icount_delta; /* superblock icount change */
338 int64_t t_ifree_delta; /* superblock ifree change */
339 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
340 int64_t t_res_fdblocks_delta; /* on-disk only chg */
341 int64_t t_frextents_delta;/* superblock freextents chg*/
342 int64_t t_res_frextents_delta; /* on-disk only chg */
343#ifdef DEBUG
344 int64_t t_ag_freeblks_delta; /* debugging counter */
345 int64_t t_ag_flist_delta; /* debugging counter */
346 int64_t t_ag_btree_delta; /* debugging counter */
347#endif
348 int64_t t_dblocks_delta;/* superblock dblocks change */
349 int64_t t_agcount_delta;/* superblock agcount change */
350 int64_t t_imaxpct_delta;/* superblock imaxpct change */
351 int64_t t_rextsize_delta;/* superblock rextsize chg */
352 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
353 int64_t t_rblocks_delta;/* superblock rblocks change */
354 int64_t t_rextents_delta;/* superblocks rextents chg */
355 int64_t t_rextslog_delta;/* superblocks rextslog chg */
356 unsigned int t_items_free; /* log item descs free */
357 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
358 xfs_trans_header_t t_header; /* header for in-log trans */
359 unsigned int t_busy_free; /* busy descs free */
360 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
361 unsigned long t_pflags; /* saved process flags state */
362} xfs_trans_t;
363
364#endif /* __KERNEL__ */
365
366
367#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ 211#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
368/* 212/*
369 * Values for t_flags. 213 * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
906#define XFS_DQUOT_REF 1 750#define XFS_DQUOT_REF 1
907 751
908#ifdef __KERNEL__ 752#ifdef __KERNEL__
753
754struct xfs_buf;
755struct xfs_buftarg;
756struct xfs_efd_log_item;
757struct xfs_efi_log_item;
758struct xfs_inode;
759struct xfs_item_ops;
760struct xfs_log_iovec;
761struct xfs_log_item_desc;
762struct xfs_mount;
763struct xfs_trans;
764struct xfs_dquot_acct;
765
766typedef struct xfs_log_item {
767 struct list_head li_ail; /* AIL pointers */
768 xfs_lsn_t li_lsn; /* last on-disk lsn */
769 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
770 struct xfs_mount *li_mountp; /* ptr to fs mount */
771 struct xfs_ail *li_ailp; /* ptr to AIL */
772 uint li_type; /* item type */
773 uint li_flags; /* misc flags */
774 struct xfs_log_item *li_bio_list; /* buffer item list */
775 void (*li_cb)(struct xfs_buf *,
776 struct xfs_log_item *);
777 /* buffer item iodone */
778 /* callback func */
779 struct xfs_item_ops *li_ops; /* function list */
780} xfs_log_item_t;
781
782#define XFS_LI_IN_AIL 0x1
783#define XFS_LI_ABORTED 0x2
784
785typedef struct xfs_item_ops {
786 uint (*iop_size)(xfs_log_item_t *);
787 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
788 void (*iop_pin)(xfs_log_item_t *);
789 void (*iop_unpin)(xfs_log_item_t *, int);
790 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
791 uint (*iop_trylock)(xfs_log_item_t *);
792 void (*iop_unlock)(xfs_log_item_t *);
793 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
794 void (*iop_push)(xfs_log_item_t *);
795 void (*iop_pushbuf)(xfs_log_item_t *);
796 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
797} xfs_item_ops_t;
798
799#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
800#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
801#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
802#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
803#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
804#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
805#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
806#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
807#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
808#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
809#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
810
811/*
812 * Return values for the IOP_TRYLOCK() routines.
813 */
814#define XFS_ITEM_SUCCESS 0
815#define XFS_ITEM_PINNED 1
816#define XFS_ITEM_LOCKED 2
817#define XFS_ITEM_FLUSHING 3
818#define XFS_ITEM_PUSHBUF 4
819
820/*
821 * This structure is used to maintain a list of block ranges that have been
822 * freed in the transaction. The ranges are listed in the perag[] busy list
823 * between when they're freed and the transaction is committed to disk.
824 */
825
826typedef struct xfs_log_busy_slot {
827 xfs_agnumber_t lbc_ag;
828 ushort lbc_idx; /* index in perag.busy[] */
829} xfs_log_busy_slot_t;
830
831#define XFS_LBC_NUM_SLOTS 31
832typedef struct xfs_log_busy_chunk {
833 struct xfs_log_busy_chunk *lbc_next;
834 uint lbc_free; /* free slots bitmask */
835 ushort lbc_unused; /* first unused */
836 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
837} xfs_log_busy_chunk_t;
838
839#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
840#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
841
842#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
843#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
844#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
845#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
846#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
847
848/*
849 * This is the type of function which can be given to xfs_trans_callback()
850 * to be called upon the transaction's commit to disk.
851 */
852typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
853
854/*
855 * This is the structure maintained for every active transaction.
856 */
857typedef struct xfs_trans {
858 unsigned int t_magic; /* magic number */
859 xfs_log_callback_t t_logcb; /* log callback struct */
860 unsigned int t_type; /* transaction type */
861 unsigned int t_log_res; /* amt of log space resvd */
862 unsigned int t_log_count; /* count for perm log res */
863 unsigned int t_blk_res; /* # of blocks resvd */
864 unsigned int t_blk_res_used; /* # of resvd blocks used */
865 unsigned int t_rtx_res; /* # of rt extents resvd */
866 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
867 xfs_log_ticket_t t_ticket; /* log mgr ticket */
868 xfs_lsn_t t_lsn; /* log seq num of start of
869 * transaction. */
870 xfs_lsn_t t_commit_lsn; /* log seq num of end of
871 * transaction. */
872 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
873 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
874 xfs_trans_callback_t t_callback; /* transaction callback */
875 void *t_callarg; /* callback arg */
876 unsigned int t_flags; /* misc flags */
877 int64_t t_icount_delta; /* superblock icount change */
878 int64_t t_ifree_delta; /* superblock ifree change */
879 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
880 int64_t t_res_fdblocks_delta; /* on-disk only chg */
881 int64_t t_frextents_delta;/* superblock freextents chg*/
882 int64_t t_res_frextents_delta; /* on-disk only chg */
883#ifdef DEBUG
884 int64_t t_ag_freeblks_delta; /* debugging counter */
885 int64_t t_ag_flist_delta; /* debugging counter */
886 int64_t t_ag_btree_delta; /* debugging counter */
887#endif
888 int64_t t_dblocks_delta;/* superblock dblocks change */
889 int64_t t_agcount_delta;/* superblock agcount change */
890 int64_t t_imaxpct_delta;/* superblock imaxpct change */
891 int64_t t_rextsize_delta;/* superblock rextsize chg */
892 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
893 int64_t t_rblocks_delta;/* superblock rblocks change */
894 int64_t t_rextents_delta;/* superblocks rextents chg */
895 int64_t t_rextslog_delta;/* superblocks rextslog chg */
896 unsigned int t_items_free; /* log item descs free */
897 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
898 xfs_trans_header_t t_header; /* header for in-log trans */
899 unsigned int t_busy_free; /* busy descs free */
900 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
901 unsigned long t_pflags; /* saved process flags state */
902} xfs_trans_t;
903
909/* 904/*
910 * XFS transaction mechanism exported interfaces that are 905 * XFS transaction mechanism exported interfaces that are
911 * actually macros. 906 * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
928/* 923/*
929 * XFS transaction mechanism exported interfaces. 924 * XFS transaction mechanism exported interfaces.
930 */ 925 */
931void xfs_trans_init(struct xfs_mount *);
932xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 926xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
933xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); 927xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
934xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 928xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *,
975 int *); 969 int *);
976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 970#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
977void xfs_trans_cancel(xfs_trans_t *, int); 971void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
979int xfs_trans_ail_init(struct xfs_mount *); 972int xfs_trans_ail_init(struct xfs_mount *);
980void xfs_trans_ail_destroy(struct xfs_mount *); 973void xfs_trans_ail_destroy(struct xfs_mount *);
981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
982xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
983void xfs_trans_unlocked_item(struct xfs_mount *,
984 xfs_log_item_t *);
985xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 974xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
986 xfs_agnumber_t ag, 975 xfs_agnumber_t ag,
987 xfs_extlen_t idx); 976 xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone;
990 979
991#endif /* __KERNEL__ */ 980#endif /* __KERNEL__ */
992 981
982void xfs_trans_init(struct xfs_mount *);
983int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
984
993#endif /* __XFS_TRANS_H__ */ 985#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1f77c00af56..2d47f10f8be 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2008 Dave Chinner
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -28,13 +29,13 @@
28#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); 32STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); 33STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); 34STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); 35STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 36
36#ifdef DEBUG 37#ifdef DEBUG
37STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); 38STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
38#else 39#else
39#define xfs_ail_check(a,l) 40#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 41#endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
50 * lsn of the last item in the AIL. 51 * lsn of the last item in the AIL.
51 */ 52 */
52xfs_lsn_t 53xfs_lsn_t
53xfs_trans_tail_ail( 54xfs_trans_ail_tail(
54 xfs_mount_t *mp) 55 struct xfs_ail *ailp)
55{ 56{
56 xfs_lsn_t lsn; 57 xfs_lsn_t lsn;
57 xfs_log_item_t *lip; 58 xfs_log_item_t *lip;
58 59
59 spin_lock(&mp->m_ail_lock); 60 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(&mp->m_ail); 61 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 62 if (lip == NULL) {
62 lsn = (xfs_lsn_t)0; 63 lsn = (xfs_lsn_t)0;
63 } else { 64 } else {
64 lsn = lip->li_lsn; 65 lsn = lip->li_lsn;
65 } 66 }
66 spin_unlock(&mp->m_ail_lock); 67 spin_unlock(&ailp->xa_lock);
67 68
68 return lsn; 69 return lsn;
69} 70}
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
85 * any of the objects, so the lock is not needed. 86 * any of the objects, so the lock is not needed.
86 */ 87 */
87void 88void
88xfs_trans_push_ail( 89xfs_trans_ail_push(
89 xfs_mount_t *mp, 90 struct xfs_ail *ailp,
90 xfs_lsn_t threshold_lsn) 91 xfs_lsn_t threshold_lsn)
91{ 92{
92 xfs_log_item_t *lip; 93 xfs_log_item_t *lip;
94
95 lip = xfs_ail_min(ailp);
96 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
97 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
98 xfsaild_wakeup(ailp, threshold_lsn);
99 }
100}
101
102/*
103 * AIL traversal cursor initialisation.
104 *
105 * The cursor keeps track of where our current traversal is up
106 * to by tracking the next ƣtem in the list for us. However, for
107 * this to be safe, removing an object from the AIL needs to invalidate
108 * any cursor that points to it. hence the traversal cursor needs to
109 * be linked to the struct xfs_ail so that deletion can search all the
110 * active cursors for invalidation.
111 *
112 * We don't link the push cursor because it is embedded in the struct
113 * xfs_ail and hence easily findable.
114 */
115STATIC void
116xfs_trans_ail_cursor_init(
117 struct xfs_ail *ailp,
118 struct xfs_ail_cursor *cur)
119{
120 cur->item = NULL;
121 if (cur == &ailp->xa_cursors)
122 return;
123
124 cur->next = ailp->xa_cursors.next;
125 ailp->xa_cursors.next = cur;
126}
127
128/*
129 * Set the cursor to the next item, because when we look
130 * up the cursor the current item may have been freed.
131 */
132STATIC void
133xfs_trans_ail_cursor_set(
134 struct xfs_ail *ailp,
135 struct xfs_ail_cursor *cur,
136 struct xfs_log_item *lip)
137{
138 if (lip)
139 cur->item = xfs_ail_next(ailp, lip);
140}
141
142/*
143 * Get the next item in the traversal and advance the cursor.
144 * If the cursor was invalidated (inidicated by a lip of 1),
145 * restart the traversal.
146 */
147struct xfs_log_item *
148xfs_trans_ail_cursor_next(
149 struct xfs_ail *ailp,
150 struct xfs_ail_cursor *cur)
151{
152 struct xfs_log_item *lip = cur->item;
153
154 if ((__psint_t)lip & 1)
155 lip = xfs_ail_min(ailp);
156 xfs_trans_ail_cursor_set(ailp, cur, lip);
157 return lip;
158}
159
160/*
161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded
163 * push cursor, but use the fact it is alway present to make the
164 * list deletion simple.
165 */
166void
167xfs_trans_ail_cursor_done(
168 struct xfs_ail *ailp,
169 struct xfs_ail_cursor *done)
170{
171 struct xfs_ail_cursor *prev = NULL;
172 struct xfs_ail_cursor *cur;
173
174 done->item = NULL;
175 if (done == &ailp->xa_cursors)
176 return;
177 prev = &ailp->xa_cursors;
178 for (cur = prev->next; cur; prev = cur, cur = prev->next) {
179 if (cur == done) {
180 prev->next = cur->next;
181 break;
182 }
183 }
184 ASSERT(cur);
185}
186
187/*
188 * Invalidate any cursor that is pointing to this item. This is
189 * called when an item is removed from the AIL. Any cursor pointing
190 * to this object is now invalid and the traversal needs to be
191 * terminated so it doesn't reference a freed object. We set the
192 * cursor item to a value of 1 so we can distinguish between an
193 * invalidation and the end of the list when getting the next item
194 * from the cursor.
195 */
196STATIC void
197xfs_trans_ail_cursor_clear(
198 struct xfs_ail *ailp,
199 struct xfs_log_item *lip)
200{
201 struct xfs_ail_cursor *cur;
93 202
94 lip = xfs_ail_min(&mp->m_ail); 203 /* need to search all cursors */
95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) { 204 for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) 205 if (cur->item == lip)
97 xfsaild_wakeup(mp, threshold_lsn); 206 cur->item = (struct xfs_log_item *)
207 ((__psint_t)cur->item | 1);
98 } 208 }
99} 209}
100 210
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
103 * Return the current tree generation number for use 213 * Return the current tree generation number for use
104 * in calls to xfs_trans_next_ail(). 214 * in calls to xfs_trans_next_ail().
105 */ 215 */
106STATIC xfs_log_item_t * 216xfs_log_item_t *
107xfs_trans_first_push_ail( 217xfs_trans_ail_cursor_first(
108 xfs_mount_t *mp, 218 struct xfs_ail *ailp,
109 int *gen, 219 struct xfs_ail_cursor *cur,
110 xfs_lsn_t lsn) 220 xfs_lsn_t lsn)
111{ 221{
112 xfs_log_item_t *lip; 222 xfs_log_item_t *lip;
113 223
114 lip = xfs_ail_min(&mp->m_ail); 224 xfs_trans_ail_cursor_init(ailp, cur);
115 *gen = (int)mp->m_ail.xa_gen; 225 lip = xfs_ail_min(ailp);
116 if (lsn == 0) 226 if (lsn == 0)
117 return lip; 227 goto out;
118 228
119 list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { 229 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
120 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) 230 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
121 return lip; 231 goto out;
122 } 232 }
123 233 lip = NULL;
124 return NULL; 234out:
235 xfs_trans_ail_cursor_set(ailp, cur, lip);
236 return lip;
125} 237}
126 238
127/* 239/*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
129 */ 241 */
130long 242long
131xfsaild_push( 243xfsaild_push(
132 xfs_mount_t *mp, 244 struct xfs_ail *ailp,
133 xfs_lsn_t *last_lsn) 245 xfs_lsn_t *last_lsn)
134{ 246{
135 long tout = 1000; /* milliseconds */ 247 long tout = 1000; /* milliseconds */
136 xfs_lsn_t last_pushed_lsn = *last_lsn; 248 xfs_lsn_t last_pushed_lsn = *last_lsn;
137 xfs_lsn_t target = mp->m_ail.xa_target; 249 xfs_lsn_t target = ailp->xa_target;
138 xfs_lsn_t lsn; 250 xfs_lsn_t lsn;
139 xfs_log_item_t *lip; 251 xfs_log_item_t *lip;
140 int gen;
141 int restarts;
142 int flush_log, count, stuck; 252 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
143 255
144#define XFS_TRANS_PUSH_AIL_RESTARTS 10 256 spin_lock(&ailp->xa_lock);
145 257 xfs_trans_ail_cursor_init(ailp, cur);
146 spin_lock(&mp->m_ail_lock); 258 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
147 lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
148 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 259 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
149 /* 260 /*
150 * AIL is empty or our push has reached the end. 261 * AIL is empty or our push has reached the end.
151 */ 262 */
152 spin_unlock(&mp->m_ail_lock); 263 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock);
153 last_pushed_lsn = 0; 265 last_pushed_lsn = 0;
154 goto out; 266 return tout;
155 } 267 }
156 268
157 XFS_STATS_INC(xs_push_ail); 269 XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
169 */ 281 */
170 tout = 10; 282 tout = 10;
171 lsn = lip->li_lsn; 283 lsn = lip->li_lsn;
172 flush_log = stuck = count = restarts = 0; 284 flush_log = stuck = count = 0;
173 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
174 int lock_result; 286 int lock_result;
175 /* 287 /*
@@ -184,7 +296,7 @@ xfsaild_push(
184 * skip to the next item in the list. 296 * skip to the next item in the list.
185 */ 297 */
186 lock_result = IOP_TRYLOCK(lip); 298 lock_result = IOP_TRYLOCK(lip);
187 spin_unlock(&mp->m_ail_lock); 299 spin_unlock(&ailp->xa_lock);
188 switch (lock_result) { 300 switch (lock_result) {
189 case XFS_ITEM_SUCCESS: 301 case XFS_ITEM_SUCCESS:
190 XFS_STATS_INC(xs_push_ail_success); 302 XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
221 break; 333 break;
222 } 334 }
223 335
224 spin_lock(&mp->m_ail_lock); 336 spin_lock(&ailp->xa_lock);
225 /* should we bother continuing? */ 337 /* should we bother continuing? */
226 if (XFS_FORCED_SHUTDOWN(mp)) 338 if (XFS_FORCED_SHUTDOWN(mp))
227 break; 339 break;
@@ -244,14 +356,13 @@ xfsaild_push(
244 if (stuck > 100) 356 if (stuck > 100)
245 break; 357 break;
246 358
247 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); 359 lip = xfs_trans_ail_cursor_next(ailp, cur);
248 if (lip == NULL) 360 if (lip == NULL)
249 break; 361 break;
250 if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
251 break;
252 lsn = lip->li_lsn; 362 lsn = lip->li_lsn;
253 } 363 }
254 spin_unlock(&mp->m_ail_lock); 364 xfs_trans_ail_cursor_done(ailp, cur);
365 spin_unlock(&ailp->xa_lock);
255 366
256 if (flush_log) { 367 if (flush_log) {
257 /* 368 /*
@@ -274,8 +385,7 @@ xfsaild_push(
274 */ 385 */
275 tout += 20; 386 tout += 20;
276 last_pushed_lsn = 0; 387 last_pushed_lsn = 0;
277 } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || 388 } else if ((stuck * 100) / count > 90) {
278 ((stuck * 100) / count > 90)) {
279 /* 389 /*
280 * Either there is a lot of contention on the AIL or we 390 * Either there is a lot of contention on the AIL or we
281 * are stuck due to operations in progress. "Stuck" in this 391 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
287 */ 397 */
288 tout += 10; 398 tout += 10;
289 } 399 }
290out:
291 *last_lsn = last_pushed_lsn; 400 *last_lsn = last_pushed_lsn;
292 return tout; 401 return tout;
293} /* xfsaild_push */ 402} /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
303 */ 412 */
304void 413void
305xfs_trans_unlocked_item( 414xfs_trans_unlocked_item(
306 xfs_mount_t *mp, 415 struct xfs_ail *ailp,
307 xfs_log_item_t *lip) 416 xfs_log_item_t *lip)
308{ 417{
309 xfs_log_item_t *min_lip; 418 xfs_log_item_t *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
315 * over some potentially valid data. 424 * over some potentially valid data.
316 */ 425 */
317 if (!(lip->li_flags & XFS_LI_IN_AIL) || 426 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
318 XFS_FORCED_SHUTDOWN(mp)) { 427 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
319 return; 428 return;
320 } 429 }
321 430
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
331 * the call to xfs_log_move_tail() doesn't do anything if there's 440 * the call to xfs_log_move_tail() doesn't do anything if there's
332 * not enough free space to wake people up so we're safe calling it. 441 * not enough free space to wake people up so we're safe calling it.
333 */ 442 */
334 min_lip = xfs_ail_min(&mp->m_ail); 443 min_lip = xfs_ail_min(ailp);
335 444
336 if (min_lip == lip) 445 if (min_lip == lip)
337 xfs_log_move_tail(mp, 1); 446 xfs_log_move_tail(ailp->xa_mount, 1);
338} /* xfs_trans_unlocked_item */ 447} /* xfs_trans_unlocked_item */
339 448
340 449
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
347 * we move in the AIL is the minimum one, update the tail lsn in the 456 * we move in the AIL is the minimum one, update the tail lsn in the
348 * log manager. 457 * log manager.
349 * 458 *
350 * Increment the AIL's generation count to indicate that the tree
351 * has changed.
352 *
353 * This function must be called with the AIL lock held. The lock 459 * This function must be called with the AIL lock held. The lock
354 * is dropped before returning. 460 * is dropped before returning.
355 */ 461 */
356void 462void
357xfs_trans_update_ail( 463xfs_trans_ail_update(
358 xfs_mount_t *mp, 464 struct xfs_ail *ailp,
359 xfs_log_item_t *lip, 465 xfs_log_item_t *lip,
360 xfs_lsn_t lsn) __releases(mp->m_ail_lock) 466 xfs_lsn_t lsn) __releases(ailp->xa_lock)
361{ 467{
362 xfs_log_item_t *dlip=NULL; 468 xfs_log_item_t *dlip = NULL;
363 xfs_log_item_t *mlip; /* ptr to minimum lip */ 469 xfs_log_item_t *mlip; /* ptr to minimum lip */
364 470
365 mlip = xfs_ail_min(&mp->m_ail); 471 mlip = xfs_ail_min(ailp);
366 472
367 if (lip->li_flags & XFS_LI_IN_AIL) { 473 if (lip->li_flags & XFS_LI_IN_AIL) {
368 dlip = xfs_ail_delete(&mp->m_ail, lip); 474 dlip = xfs_ail_delete(ailp, lip);
369 ASSERT(dlip == lip); 475 ASSERT(dlip == lip);
476 xfs_trans_ail_cursor_clear(ailp, dlip);
370 } else { 477 } else {
371 lip->li_flags |= XFS_LI_IN_AIL; 478 lip->li_flags |= XFS_LI_IN_AIL;
372 } 479 }
373 480
374 lip->li_lsn = lsn; 481 lip->li_lsn = lsn;
375 482 xfs_ail_insert(ailp, lip);
376 xfs_ail_insert(&mp->m_ail, lip);
377 mp->m_ail.xa_gen++;
378 483
379 if (mlip == dlip) { 484 if (mlip == dlip) {
380 mlip = xfs_ail_min(&mp->m_ail); 485 mlip = xfs_ail_min(ailp);
381 spin_unlock(&mp->m_ail_lock); 486 spin_unlock(&ailp->xa_lock);
382 xfs_log_move_tail(mp, mlip->li_lsn); 487 xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
383 } else { 488 } else {
384 spin_unlock(&mp->m_ail_lock); 489 spin_unlock(&ailp->xa_lock);
385 } 490 }
386 491
387 492
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
403 * is dropped before returning. 508 * is dropped before returning.
404 */ 509 */
405void 510void
406xfs_trans_delete_ail( 511xfs_trans_ail_delete(
407 xfs_mount_t *mp, 512 struct xfs_ail *ailp,
408 xfs_log_item_t *lip) __releases(mp->m_ail_lock) 513 xfs_log_item_t *lip) __releases(ailp->xa_lock)
409{ 514{
410 xfs_log_item_t *dlip; 515 xfs_log_item_t *dlip;
411 xfs_log_item_t *mlip; 516 xfs_log_item_t *mlip;
412 517
413 if (lip->li_flags & XFS_LI_IN_AIL) { 518 if (lip->li_flags & XFS_LI_IN_AIL) {
414 mlip = xfs_ail_min(&mp->m_ail); 519 mlip = xfs_ail_min(ailp);
415 dlip = xfs_ail_delete(&mp->m_ail, lip); 520 dlip = xfs_ail_delete(ailp, lip);
416 ASSERT(dlip == lip); 521 ASSERT(dlip == lip);
522 xfs_trans_ail_cursor_clear(ailp, dlip);
417 523
418 524
419 lip->li_flags &= ~XFS_LI_IN_AIL; 525 lip->li_flags &= ~XFS_LI_IN_AIL;
420 lip->li_lsn = 0; 526 lip->li_lsn = 0;
421 mp->m_ail.xa_gen++;
422 527
423 if (mlip == dlip) { 528 if (mlip == dlip) {
424 mlip = xfs_ail_min(&mp->m_ail); 529 mlip = xfs_ail_min(ailp);
425 spin_unlock(&mp->m_ail_lock); 530 spin_unlock(&ailp->xa_lock);
426 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); 531 xfs_log_move_tail(ailp->xa_mount,
532 (mlip ? mlip->li_lsn : 0));
427 } else { 533 } else {
428 spin_unlock(&mp->m_ail_lock); 534 spin_unlock(&ailp->xa_lock);
429 } 535 }
430 } 536 }
431 else { 537 else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
433 * If the file system is not being shutdown, we are in 539 * If the file system is not being shutdown, we are in
434 * serious trouble if we get to this stage. 540 * serious trouble if we get to this stage.
435 */ 541 */
436 if (XFS_FORCED_SHUTDOWN(mp)) 542 struct xfs_mount *mp = ailp->xa_mount;
437 spin_unlock(&mp->m_ail_lock); 543
438 else { 544 spin_unlock(&ailp->xa_lock);
545 if (!XFS_FORCED_SHUTDOWN(mp)) {
439 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 546 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
440 "%s: attempting to delete a log item that is not in the AIL", 547 "%s: attempting to delete a log item that is not in the AIL",
441 __func__); 548 __func__);
442 spin_unlock(&mp->m_ail_lock);
443 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 549 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
444 } 550 }
445 } 551 }
@@ -448,56 +554,6 @@ xfs_trans_delete_ail(
448 554
449 555
450/* 556/*
451 * Return the item in the AIL with the smallest lsn.
452 * Return the current tree generation number for use
453 * in calls to xfs_trans_next_ail().
454 */
455xfs_log_item_t *
456xfs_trans_first_ail(
457 xfs_mount_t *mp,
458 int *gen)
459{
460 xfs_log_item_t *lip;
461
462 lip = xfs_ail_min(&mp->m_ail);
463 *gen = (int)mp->m_ail.xa_gen;
464
465 return lip;
466}
467
468/*
469 * If the generation count of the tree has not changed since the
470 * caller last took something from the AIL, then return the elmt
471 * in the tree which follows the one given. If the count has changed,
472 * then return the minimum elmt of the AIL and bump the restarts counter
473 * if one is given.
474 */
475xfs_log_item_t *
476xfs_trans_next_ail(
477 xfs_mount_t *mp,
478 xfs_log_item_t *lip,
479 int *gen,
480 int *restarts)
481{
482 xfs_log_item_t *nlip;
483
484 ASSERT(mp && lip && gen);
485 if (mp->m_ail.xa_gen == *gen) {
486 nlip = xfs_ail_next(&mp->m_ail, lip);
487 } else {
488 nlip = xfs_ail_min(&mp->m_ail);
489 *gen = (int)mp->m_ail.xa_gen;
490 if (restarts != NULL) {
491 XFS_STATS_INC(xs_push_ail_restarts);
492 (*restarts)++;
493 }
494 }
495
496 return (nlip);
497}
498
499
500/*
501 * The active item list (AIL) is a doubly linked list of log 557 * The active item list (AIL) is a doubly linked list of log
502 * items sorted by ascending lsn. The base of the list is 558 * items sorted by ascending lsn. The base of the list is
503 * a forw/back pointer pair embedded in the xfs mount structure. 559 * a forw/back pointer pair embedded in the xfs mount structure.
@@ -515,15 +571,35 @@ int
515xfs_trans_ail_init( 571xfs_trans_ail_init(
516 xfs_mount_t *mp) 572 xfs_mount_t *mp)
517{ 573{
518 INIT_LIST_HEAD(&mp->m_ail.xa_ail); 574 struct xfs_ail *ailp;
519 return xfsaild_start(mp); 575 int error;
576
577 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
578 if (!ailp)
579 return ENOMEM;
580
581 ailp->xa_mount = mp;
582 INIT_LIST_HEAD(&ailp->xa_ail);
583 spin_lock_init(&ailp->xa_lock);
584 error = xfsaild_start(ailp);
585 if (error)
586 goto out_free_ailp;
587 mp->m_ail = ailp;
588 return 0;
589
590out_free_ailp:
591 kmem_free(ailp);
592 return error;
520} 593}
521 594
522void 595void
523xfs_trans_ail_destroy( 596xfs_trans_ail_destroy(
524 xfs_mount_t *mp) 597 xfs_mount_t *mp)
525{ 598{
526 xfsaild_stop(mp); 599 struct xfs_ail *ailp = mp->m_ail;
600
601 xfsaild_stop(ailp);
602 kmem_free(ailp);
527} 603}
528 604
529/* 605/*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
534 */ 610 */
535STATIC void 611STATIC void
536xfs_ail_insert( 612xfs_ail_insert(
537 xfs_ail_t *ailp, 613 struct xfs_ail *ailp,
538 xfs_log_item_t *lip) 614 xfs_log_item_t *lip)
539/* ARGSUSED */ 615/* ARGSUSED */
540{ 616{
@@ -568,7 +644,7 @@ xfs_ail_insert(
568/*ARGSUSED*/ 644/*ARGSUSED*/
569STATIC xfs_log_item_t * 645STATIC xfs_log_item_t *
570xfs_ail_delete( 646xfs_ail_delete(
571 xfs_ail_t *ailp, 647 struct xfs_ail *ailp,
572 xfs_log_item_t *lip) 648 xfs_log_item_t *lip)
573/* ARGSUSED */ 649/* ARGSUSED */
574{ 650{
@@ -585,7 +661,7 @@ xfs_ail_delete(
585 */ 661 */
586STATIC xfs_log_item_t * 662STATIC xfs_log_item_t *
587xfs_ail_min( 663xfs_ail_min(
588 xfs_ail_t *ailp) 664 struct xfs_ail *ailp)
589/* ARGSUSED */ 665/* ARGSUSED */
590{ 666{
591 if (list_empty(&ailp->xa_ail)) 667 if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
601 */ 677 */
602STATIC xfs_log_item_t * 678STATIC xfs_log_item_t *
603xfs_ail_next( 679xfs_ail_next(
604 xfs_ail_t *ailp, 680 struct xfs_ail *ailp,
605 xfs_log_item_t *lip) 681 xfs_log_item_t *lip)
606/* ARGSUSED */ 682/* ARGSUSED */
607{ 683{
@@ -617,7 +693,7 @@ xfs_ail_next(
617 */ 693 */
618STATIC void 694STATIC void
619xfs_ail_check( 695xfs_ail_check(
620 xfs_ail_t *ailp, 696 struct xfs_ail *ailp,
621 xfs_log_item_t *lip) 697 xfs_log_item_t *lip)
622{ 698{
623 xfs_log_item_t *prev_lip; 699 xfs_log_item_t *prev_lip;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e855b5ced6..8ee2f8c8b0a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
528 if (lip->li_type == XFS_LI_BUF) { 528 if (lip->li_type == XFS_LI_BUF) {
529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); 529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
530 xfs_trans_unlocked_item( 530 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
531 bip->bli_item.li_mountp, 531 lip);
532 lip);
533 } 532 }
534 } 533 }
535 xfs_buf_relse(bp); 534 xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
626 * tell the AIL that the buffer is being unlocked. 625 * tell the AIL that the buffer is being unlocked.
627 */ 626 */
628 if (bip != NULL) { 627 if (bip != NULL) {
629 xfs_trans_unlocked_item(bip->bli_item.li_mountp, 628 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
630 (xfs_log_item_t*)bip); 629 (xfs_log_item_t*)bip);
631 } 630 }
632 631
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2a1c0f071f9..23d276af2e0 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -85,7 +85,6 @@ xfs_trans_iget(
85{ 85{
86 int error; 86 int error;
87 xfs_inode_t *ip; 87 xfs_inode_t *ip;
88 xfs_inode_log_item_t *iip;
89 88
90 /* 89 /*
91 * If the transaction pointer is NULL, just call the normal 90 * If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
138 } 137 }
139 ASSERT(ip != NULL); 138 ASSERT(ip != NULL);
140 139
141 /* 140 xfs_trans_ijoin(tp, ip, lock_flags);
142 * Get a log_item_desc to point at the new item.
143 */
144 if (ip->i_itemp == NULL)
145 xfs_inode_item_init(ip, mp);
146 iip = ip->i_itemp;
147 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
148
149 xfs_trans_inode_broot_debug(ip);
150
151 /*
152 * If the IO lock has been acquired, mark that in
153 * the inode log item so we'll know to unlock it
154 * when the transaction commits.
155 */
156 ASSERT(iip->ili_flags == 0);
157 if (lock_flags & XFS_IOLOCK_EXCL) {
158 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
159 } else if (lock_flags & XFS_IOLOCK_SHARED) {
160 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
161 }
162
163 /*
164 * Initialize i_transp so we can find it with xfs_inode_incore()
165 * above.
166 */
167 ip->i_transp = tp;
168
169 *ipp = ip; 141 *ipp = ip;
170 return 0; 142 return 0;
171} 143}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 3c666e8317f..e110bf57d7f 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,6 +22,14 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
25/* XXX: from here down needed until struct xfs_trans has it's own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
25 33
26STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, 34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
27 int, int, xfs_lsn_t); 35 int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
79 lidp->lid_size = 0; 87 lidp->lid_size = 0;
80 lip->li_desc = lidp; 88 lip->li_desc = lidp;
81 lip->li_mountp = tp->t_mountp; 89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
82 return lidp; 91 return lidp;
83 } 92 }
84 93
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
120 lidp->lid_size = 0; 129 lidp->lid_size = 0;
121 lip->li_desc = lidp; 130 lip->li_desc = lidp;
122 lip->li_mountp = tp->t_mountp; 131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
123 return lidp; 133 return lidp;
124} 134}
125 135
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3c748c456ed..73e2ad39743 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
44 xfs_extlen_t idx); 44 xfs_extlen_t idx);
45 45
46/* 46/*
47 * From xfs_trans_ail.c 47 * AIL traversal cursor.
48 *
49 * Rather than using a generation number for detecting changes in the ail, use
50 * a cursor that is protected by the ail lock. The aild cursor exists in the
51 * struct xfs_ail, but other traversals can declare it on the stack and link it
52 * to the ail list.
53 *
54 * When an object is deleted from or moved int the AIL, the cursor list is
55 * searched to see if the object is a designated cursor item. If it is, it is
56 * deleted from the cursor so that the next time the cursor is used traversal
57 * will return to the start.
58 *
59 * This means a traversal colliding with a removal will cause a restart of the
60 * list scan, rather than any insertion or deletion anywhere in the list. The
61 * low bit of the item pointer is set if the cursor has been invalidated so
62 * that we can tell the difference between invalidation and reaching the end
63 * of the list to trigger traversal restarts.
48 */ 64 */
49void xfs_trans_update_ail(struct xfs_mount *mp, 65struct xfs_ail_cursor {
50 struct xfs_log_item *lip, xfs_lsn_t lsn) 66 struct xfs_ail_cursor *next;
51 __releases(mp->m_ail_lock); 67 struct xfs_log_item *item;
52void xfs_trans_delete_ail(struct xfs_mount *mp, 68};
53 struct xfs_log_item *lip)
54 __releases(mp->m_ail_lock);
55struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
56struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
57 struct xfs_log_item *, int *, int *);
58 69
70/*
71 * Private AIL structures.
72 *
73 * Eventually we need to drive the locking in here as well.
74 */
75struct xfs_ail {
76 struct xfs_mount *xa_mount;
77 struct list_head xa_ail;
78 uint xa_gen;
79 struct task_struct *xa_task;
80 xfs_lsn_t xa_target;
81 struct xfs_ail_cursor xa_cursors;
82 spinlock_t xa_lock;
83};
59 84
60/* 85/*
61 * AIL push thread support 86 * From xfs_trans_ail.c
62 */ 87 */
63long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); 88void xfs_trans_ail_update(struct xfs_ail *ailp,
64void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); 89 struct xfs_log_item *lip, xfs_lsn_t lsn)
65int xfsaild_start(struct xfs_mount *); 90 __releases(ailp->xa_lock);
66void xfsaild_stop(struct xfs_mount *); 91void xfs_trans_ail_delete(struct xfs_ail *ailp,
92 struct xfs_log_item *lip)
93 __releases(ailp->xa_lock);
94void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
95void xfs_trans_unlocked_item(struct xfs_ail *,
96 xfs_log_item_t *);
97
98xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
99
100struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
101 struct xfs_ail_cursor *cur,
102 xfs_lsn_t lsn);
103struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
104 struct xfs_ail_cursor *cur);
105void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
106 struct xfs_ail_cursor *cur);
107
108long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
109void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
110int xfsaild_start(struct xfs_ail *);
111void xfsaild_stop(struct xfs_ail *);
67 112
113#if BITS_PER_LONG != 64
114static inline void
115xfs_trans_ail_copy_lsn(
116 struct xfs_ail *ailp,
117 xfs_lsn_t *dst,
118 xfs_lsn_t *src)
119{
120 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
121 spin_lock(&ailp->xa_lock);
122 *dst = *src;
123 spin_unlock(&ailp->xa_lock);
124}
125#else
126static inline void
127xfs_trans_ail_copy_lsn(
128 struct xfs_ail *ailp,
129 xfs_lsn_t *dst,
130 xfs_lsn_t *src)
131{
132 ASSERT(sizeof(xfs_lsn_t) == 8);
133 *dst = *src;
134}
135#endif
68#endif /* __XFS_TRANS_PRIV_H__ */ 136#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 35d4d414bcc..fcc2285d03e 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
172 *ipp = NULL; 172 *ipp = NULL;
173 return code; 173 return code;
174 } 174 }
175
176 /*
177 * transaction commit worked ok so we can drop the extra ticket
178 * reference that we gained in xfs_trans_dup()
179 */
180 xfs_log_ticket_put(tp->t_ticket);
175 code = xfs_trans_reserve(tp, 0, log_res, 0, 181 code = xfs_trans_reserve(tp, 0, log_res, 0,
176 XFS_TRANS_PERM_LOG_RES, log_count); 182 XFS_TRANS_PERM_LOG_RES, log_count);
177 /* 183 /*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
268 xfs_mount_t *mp; 274 xfs_mount_t *mp;
269 275
270 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 276 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
271 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); 277 ASSERT(ip->i_d.di_version == 1);
272 278
273 ip->i_d.di_version = XFS_DINODE_VERSION_2; 279 ip->i_d.di_version = 2;
274 ip->i_d.di_onlink = 0; 280 ip->i_d.di_onlink = 0;
275 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 281 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
276 mp = tp->t_mountp; 282 mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
302 ASSERT(ip->i_d.di_nlink > 0); 308 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 309 ip->i_d.di_nlink++;
304 inc_nlink(VFS_I(ip)); 310 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 311 if ((ip->i_d.di_version == 1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 312 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 313 /*
308 * The inode has increased its number of links beyond 314 * The inode has increased its number of links beyond
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644
index 439dd3939dd..00000000000
--- a/fs/xfs/xfs_vfsops.c
+++ /dev/null
@@ -1,757 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_quota.h"
43#include "xfs_error.h"
44#include "xfs_bmap.h"
45#include "xfs_rw.h"
46#include "xfs_buf_item.h"
47#include "xfs_log_priv.h"
48#include "xfs_dir2_trace.h"
49#include "xfs_extfree_item.h"
50#include "xfs_acl.h"
51#include "xfs_attr.h"
52#include "xfs_clnt.h"
53#include "xfs_mru_cache.h"
54#include "xfs_filestream.h"
55#include "xfs_fsops.h"
56#include "xfs_vnodeops.h"
57#include "xfs_vfsops.h"
58#include "xfs_utils.h"
59
60
61STATIC void
62xfs_quiesce_fs(
63 xfs_mount_t *mp)
64{
65 int count = 0, pincount;
66
67 xfs_flush_buftarg(mp->m_ddev_targp, 0);
68 xfs_finish_reclaim_all(mp, 0);
69
70 /* This loop must run at least twice.
71 * The first instance of the loop will flush
72 * most meta data but that will generate more
73 * meta data (typically directory updates).
74 * Which then must be flushed and logged before
75 * we can write the unmount record.
76 */
77 do {
78 xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
79 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
80 if (!pincount) {
81 delay(50);
82 count++;
83 }
84 } while (count < 2);
85}
86
87/*
88 * Second stage of a quiesce. The data is already synced, now we have to take
89 * care of the metadata. New transactions are already blocked, so we need to
90 * wait for any remaining transactions to drain out before proceding.
91 */
92void
93xfs_attr_quiesce(
94 xfs_mount_t *mp)
95{
96 int error = 0;
97
98 /* wait for all modifications to complete */
99 while (atomic_read(&mp->m_active_trans) > 0)
100 delay(100);
101
102 /* flush inodes and push all remaining buffers out to disk */
103 xfs_quiesce_fs(mp);
104
105 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
106
107 /* Push the superblock and write an unmount record */
108 error = xfs_log_sbcount(mp, 1);
109 if (error)
110 xfs_fs_cmn_err(CE_WARN, mp,
111 "xfs_attr_quiesce: failed to log sb changes. "
112 "Frozen image may not be consistent.");
113 xfs_log_unmount_write(mp);
114 xfs_unmountfs_writesb(mp);
115}
116
117/*
118 * xfs_unmount_flush implements a set of flush operation on special
119 * inodes, which are needed as a separate set of operations so that
120 * they can be called as part of relocation process.
121 */
122int
123xfs_unmount_flush(
124 xfs_mount_t *mp, /* Mount structure we are getting
125 rid of. */
126 int relocation) /* Called from vfs relocation. */
127{
128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL;
131 int error;
132
133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
134 xfs_iflock(rip);
135
136 /*
137 * Flush out the real time inodes.
138 */
139 if ((rbmip = mp->m_rbmip) != NULL) {
140 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
141 xfs_iflock(rbmip);
142 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
143 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
144
145 if (error == EFSCORRUPTED)
146 goto fscorrupt_out;
147
148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
149
150 rsumip = mp->m_rsumip;
151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
152 xfs_iflock(rsumip);
153 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
154 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
155
156 if (error == EFSCORRUPTED)
157 goto fscorrupt_out;
158
159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
160 }
161
162 /*
163 * Synchronously flush root inode to disk
164 */
165 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
166 if (error == EFSCORRUPTED)
167 goto fscorrupt_out2;
168
169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
171 return XFS_ERROR(EBUSY);
172 }
173
174 /*
175 * Release dquot that rootinode, rbmino and rsumino might be holding,
176 * flush and purge the quota inodes.
177 */
178 error = XFS_QM_UNMOUNT(mp);
179 if (error == EFSCORRUPTED)
180 goto fscorrupt_out2;
181
182 if (rbmip) {
183 IRELE(rbmip);
184 IRELE(rsumip);
185 }
186
187 xfs_iunlock(rip, XFS_ILOCK_EXCL);
188 return 0;
189
190fscorrupt_out:
191 xfs_ifunlock(rip);
192
193fscorrupt_out2:
194 xfs_iunlock(rip, XFS_ILOCK_EXCL);
195
196 return XFS_ERROR(EFSCORRUPTED);
197}
198
199/*
200 * xfs_sync flushes any pending I/O to file system vfsp.
201 *
202 * This routine is called by vfs_sync() to make sure that things make it
203 * out to disk eventually, on sync() system calls to flush out everything,
204 * and when the file system is unmounted. For the vfs_sync() case, all
205 * we really need to do is sync out the log to make all of our meta-data
206 * updates permanent (except for timestamps). For calls from pflushd(),
207 * dirty pages are kept moving by calling pdflush() on the inodes
208 * containing them. We also flush the inodes that we can lock without
209 * sleeping and the superblock if we can lock it without sleeping from
210 * vfs_sync() so that items at the tail of the log are always moving out.
211 *
212 * Flags:
213 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
214 * to sleep if we can help it. All we really need
215 * to do is ensure that the log is synced at least
216 * periodically. We also push the inodes and
217 * superblock if we can lock them without sleeping
218 * and they are not pinned.
219 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
220 * set, then we really want to lock each inode and flush
221 * it.
222 * SYNC_WAIT - All the flushes that take place in this call should
223 * be synchronous.
224 * SYNC_DELWRI - This tells us to push dirty pages associated with
225 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
226 * determine if they should be flushed sync, async, or
227 * delwri.
228 * SYNC_CLOSE - This flag is passed when the system is being
229 * unmounted. We should sync and invalidate everything.
230 * SYNC_FSDATA - This indicates that the caller would like to make
231 * sure the superblock is safe on disk. We can ensure
232 * this by simply making sure the log gets flushed
233 * if SYNC_BDFLUSH is set, and by actually writing it
234 * out otherwise.
235 * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
236 * before we return (including direct I/O). Forms the drain
237 * side of the write barrier needed to safely quiesce the
238 * filesystem.
239 *
240 */
241int
242xfs_sync(
243 xfs_mount_t *mp,
244 int flags)
245{
246 int error;
247
248 /*
249 * Get the Quota Manager to flush the dquots.
250 *
251 * If XFS quota support is not enabled or this filesystem
252 * instance does not use quotas XFS_QM_DQSYNC will always
253 * return zero.
254 */
255 error = XFS_QM_DQSYNC(mp, flags);
256 if (error) {
257 /*
258 * If we got an IO error, we will be shutting down.
259 * So, there's nothing more for us to do here.
260 */
261 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
262 if (XFS_FORCED_SHUTDOWN(mp))
263 return XFS_ERROR(error);
264 }
265
266 if (flags & SYNC_IOWAIT)
267 xfs_filestream_flush(mp);
268
269 return xfs_syncsub(mp, flags, NULL);
270}
271
272/*
273 * xfs sync routine for internal use
274 *
275 * This routine supports all of the flags defined for the generic vfs_sync
276 * interface as explained above under xfs_sync.
277 *
278 */
279int
280xfs_sync_inodes(
281 xfs_mount_t *mp,
282 int flags,
283 int *bypassed)
284{
285 xfs_inode_t *ip = NULL;
286 struct inode *vp = NULL;
287 int error;
288 int last_error;
289 uint64_t fflag;
290 uint lock_flags;
291 uint base_lock_flags;
292 boolean_t mount_locked;
293 boolean_t vnode_refed;
294 int preempt;
295 xfs_iptr_t *ipointer;
296#ifdef DEBUG
297 boolean_t ipointer_in = B_FALSE;
298
299#define IPOINTER_SET ipointer_in = B_TRUE
300#define IPOINTER_CLR ipointer_in = B_FALSE
301#else
302#define IPOINTER_SET
303#define IPOINTER_CLR
304#endif
305
306
307/* Insert a marker record into the inode list after inode ip. The list
308 * must be locked when this is called. After the call the list will no
309 * longer be locked.
310 */
311#define IPOINTER_INSERT(ip, mp) { \
312 ASSERT(ipointer_in == B_FALSE); \
313 ipointer->ip_mnext = ip->i_mnext; \
314 ipointer->ip_mprev = ip; \
315 ip->i_mnext = (xfs_inode_t *)ipointer; \
316 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
317 preempt = 0; \
318 XFS_MOUNT_IUNLOCK(mp); \
319 mount_locked = B_FALSE; \
320 IPOINTER_SET; \
321 }
322
323/* Remove the marker from the inode list. If the marker was the only item
324 * in the list then there are no remaining inodes and we should zero out
325 * the whole list. If we are the current head of the list then move the head
326 * past us.
327 */
328#define IPOINTER_REMOVE(ip, mp) { \
329 ASSERT(ipointer_in == B_TRUE); \
330 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
331 ip = ipointer->ip_mnext; \
332 ip->i_mprev = ipointer->ip_mprev; \
333 ipointer->ip_mprev->i_mnext = ip; \
334 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
335 mp->m_inodes = ip; \
336 } \
337 } else { \
338 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
339 mp->m_inodes = NULL; \
340 ip = NULL; \
341 } \
342 IPOINTER_CLR; \
343 }
344
345#define XFS_PREEMPT_MASK 0x7f
346
347 ASSERT(!(flags & SYNC_BDFLUSH));
348
349 if (bypassed)
350 *bypassed = 0;
351 if (mp->m_flags & XFS_MOUNT_RDONLY)
352 return 0;
353 error = 0;
354 last_error = 0;
355 preempt = 0;
356
357 /* Allocate a reference marker */
358 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
359
360 fflag = XFS_B_ASYNC; /* default is don't wait */
361 if (flags & SYNC_DELWRI)
362 fflag = XFS_B_DELWRI;
363 if (flags & SYNC_WAIT)
364 fflag = 0; /* synchronous overrides all */
365
366 base_lock_flags = XFS_ILOCK_SHARED;
367 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
368 /*
369 * We need the I/O lock if we're going to call any of
370 * the flush/inval routines.
371 */
372 base_lock_flags |= XFS_IOLOCK_SHARED;
373 }
374
375 XFS_MOUNT_ILOCK(mp);
376
377 ip = mp->m_inodes;
378
379 mount_locked = B_TRUE;
380 vnode_refed = B_FALSE;
381
382 IPOINTER_CLR;
383
384 do {
385 ASSERT(ipointer_in == B_FALSE);
386 ASSERT(vnode_refed == B_FALSE);
387
388 lock_flags = base_lock_flags;
389
390 /*
391 * There were no inodes in the list, just break out
392 * of the loop.
393 */
394 if (ip == NULL) {
395 break;
396 }
397
398 /*
399 * We found another sync thread marker - skip it
400 */
401 if (ip->i_mount == NULL) {
402 ip = ip->i_mnext;
403 continue;
404 }
405
406 vp = VFS_I(ip);
407
408 /*
409 * If the vnode is gone then this is being torn down,
410 * call reclaim if it is flushed, else let regular flush
411 * code deal with it later in the loop.
412 */
413
414 if (vp == NULL) {
415 /* Skip ones already in reclaim */
416 if (ip->i_flags & XFS_IRECLAIM) {
417 ip = ip->i_mnext;
418 continue;
419 }
420 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
421 ip = ip->i_mnext;
422 } else if ((xfs_ipincount(ip) == 0) &&
423 xfs_iflock_nowait(ip)) {
424 IPOINTER_INSERT(ip, mp);
425
426 xfs_finish_reclaim(ip, 1,
427 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
428
429 XFS_MOUNT_ILOCK(mp);
430 mount_locked = B_TRUE;
431 IPOINTER_REMOVE(ip, mp);
432 } else {
433 xfs_iunlock(ip, XFS_ILOCK_EXCL);
434 ip = ip->i_mnext;
435 }
436 continue;
437 }
438
439 if (VN_BAD(vp)) {
440 ip = ip->i_mnext;
441 continue;
442 }
443
444 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
445 XFS_MOUNT_IUNLOCK(mp);
446 kmem_free(ipointer);
447 return 0;
448 }
449
450 /*
451 * Try to lock without sleeping. We're out of order with
452 * the inode list lock here, so if we fail we need to drop
453 * the mount lock and try again. If we're called from
454 * bdflush() here, then don't bother.
455 *
456 * The inode lock here actually coordinates with the
457 * almost spurious inode lock in xfs_ireclaim() to prevent
458 * the vnode we handle here without a reference from
459 * being freed while we reference it. If we lock the inode
460 * while it's on the mount list here, then the spurious inode
461 * lock in xfs_ireclaim() after the inode is pulled from
462 * the mount list will sleep until we release it here.
463 * This keeps the vnode from being freed while we reference
464 * it.
465 */
466 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
467 if (vp == NULL) {
468 ip = ip->i_mnext;
469 continue;
470 }
471
472 vp = vn_grab(vp);
473 if (vp == NULL) {
474 ip = ip->i_mnext;
475 continue;
476 }
477
478 IPOINTER_INSERT(ip, mp);
479 xfs_ilock(ip, lock_flags);
480
481 ASSERT(vp == VFS_I(ip));
482 ASSERT(ip->i_mount == mp);
483
484 vnode_refed = B_TRUE;
485 }
486
487 /* From here on in the loop we may have a marker record
488 * in the inode list.
489 */
490
491 /*
492 * If we have to flush data or wait for I/O completion
493 * we need to drop the ilock that we currently hold.
494 * If we need to drop the lock, insert a marker if we
495 * have not already done so.
496 */
497 if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
498 ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
499 if (mount_locked) {
500 IPOINTER_INSERT(ip, mp);
501 }
502 xfs_iunlock(ip, XFS_ILOCK_SHARED);
503
504 if (flags & SYNC_CLOSE) {
505 /* Shutdown case. Flush and invalidate. */
506 if (XFS_FORCED_SHUTDOWN(mp))
507 xfs_tosspages(ip, 0, -1,
508 FI_REMAPF);
509 else
510 error = xfs_flushinval_pages(ip,
511 0, -1, FI_REMAPF);
512 } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
513 error = xfs_flush_pages(ip, 0,
514 -1, fflag, FI_NONE);
515 }
516
517		/*
518		 * When freezing, we need to wait to ensure that all I/O
519		 * (including direct I/O) is complete, so that no further data
520		 * modification can take place after this point.
521		 */
522 if (flags & SYNC_IOWAIT)
523 vn_iowait(ip);
524
525 xfs_ilock(ip, XFS_ILOCK_SHARED);
526 }
527
528 if ((flags & SYNC_ATTR) &&
529 (ip->i_update_core ||
530 (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
531 if (mount_locked)
532 IPOINTER_INSERT(ip, mp);
533
534 if (flags & SYNC_WAIT) {
535 xfs_iflock(ip);
536 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
537
538 /*
539 * If we can't acquire the flush lock, then the inode
540 * is already being flushed so don't bother waiting.
541 *
542 * If we can lock it then do a delwri flush so we can
543 * combine multiple inode flushes in each disk write.
544 */
545 } else if (xfs_iflock_nowait(ip)) {
546 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
547 } else if (bypassed) {
548 (*bypassed)++;
549 }
550 }
551
552 if (lock_flags != 0) {
553 xfs_iunlock(ip, lock_flags);
554 }
555
556 if (vnode_refed) {
557 /*
558 * If we had to take a reference on the vnode
559 * above, then wait until after we've unlocked
560 * the inode to release the reference. This is
561 * because we can be already holding the inode
562 * lock when IRELE() calls xfs_inactive().
563 *
564 * Make sure to drop the mount lock before calling
565 * IRELE() so that we don't trip over ourselves if
566 * we have to go for the mount lock again in the
567 * inactive code.
568 */
569 if (mount_locked) {
570 IPOINTER_INSERT(ip, mp);
571 }
572
573 IRELE(ip);
574
575 vnode_refed = B_FALSE;
576 }
577
578 if (error) {
579 last_error = error;
580 }
581
582 /*
583 * bail out if the filesystem is corrupted.
584 */
585 if (error == EFSCORRUPTED) {
586 if (!mount_locked) {
587 XFS_MOUNT_ILOCK(mp);
588 IPOINTER_REMOVE(ip, mp);
589 }
590 XFS_MOUNT_IUNLOCK(mp);
591 ASSERT(ipointer_in == B_FALSE);
592 kmem_free(ipointer);
593 return XFS_ERROR(error);
594 }
595
596 /* Let other threads have a chance at the mount lock
597 * if we have looped many times without dropping the
598 * lock.
599 */
600 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
601 if (mount_locked) {
602 IPOINTER_INSERT(ip, mp);
603 }
604 }
605
606 if (mount_locked == B_FALSE) {
607 XFS_MOUNT_ILOCK(mp);
608 mount_locked = B_TRUE;
609 IPOINTER_REMOVE(ip, mp);
610 continue;
611 }
612
613 ASSERT(ipointer_in == B_FALSE);
614 ip = ip->i_mnext;
615
616 } while (ip != mp->m_inodes);
617
618 XFS_MOUNT_IUNLOCK(mp);
619
620 ASSERT(ipointer_in == B_FALSE);
621
622 kmem_free(ipointer);
623 return XFS_ERROR(last_error);
624}
625
626/*
627 * xfs sync routine for internal use
628 *
629 * This routine supports all of the flags defined for the generic vfs_sync
630 * interface as explained above under xfs_sync.
631 *
632 */
/*
 * @mp:       mount being synced
 * @flags:    SYNC_* bits selecting what to sync and whether to wait
 * @bypassed: optional counter, bumped by xfs_sync_inodes() for inodes
 *            whose flush had to be skipped
 *
 * Returns the last error recorded, wrapped in XFS_ERROR() (0 if none).
 */
633int
634xfs_syncsub(
635	xfs_mount_t	*mp,
636	int		flags,
637	int		*bypassed)
638{
639	int		error = 0;
640	int		last_error = 0;
641	uint		log_flags = XFS_LOG_FORCE;
642	xfs_buf_t	*bp;
643	xfs_buf_log_item_t	*bip;
644
645	/*
646	 * Sync out the log. This ensures that the log is periodically
647	 * flushed even if there is not enough activity to fill it up.
648	 */
649	if (flags & SYNC_WAIT)
650		log_flags |= XFS_LOG_SYNC;
651
652	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
653
	/*
	 * Inode pass: for the vfs_sync() case (SYNC_BDFLUSH) just finish
	 * pending inode reclaims; otherwise write back dirty inodes.
	 */
654	if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
655		if (flags & SYNC_BDFLUSH)
656			xfs_finish_reclaim_all(mp, 1);
657		else
658			error = xfs_sync_inodes(mp, flags, bypassed);
659	}
660
	/*
	 * NOTE(review): the "error" from xfs_sync_inodes() above only
	 * reaches last_error through the SYNC_FSDATA block below; when
	 * SYNC_FSDATA is clear it is silently dropped and this function
	 * returns 0 -- confirm this is intentional.
	 */
661	/*
662	 * Flushing out dirty data above probably generated more
663	 * log activity, so if this isn't vfs_sync() then flush
664	 * the log again.
665	 */
666	if (flags & SYNC_DELWRI) {
667		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
668	}
669
670	if (flags & SYNC_FSDATA) {
671		/*
672		 * If this is vfs_sync() then only sync the superblock
673		 * if we can lock it without sleeping and it is not pinned.
674		 */
675		if (flags & SYNC_BDFLUSH) {
676			bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
677			if (bp != NULL) {
678				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
679				if ((bip != NULL) &&
680				    xfs_buf_item_dirty(bip)) {
681					if (!(XFS_BUF_ISPINNED(bp))) {
						/*
						 * Dirty and unpinned: start an
						 * async write.  bp ownership
						 * presumably passes to
						 * xfs_bwrite() here -- note
						 * every skip path below
						 * releases it explicitly,
						 * while this one does not.
						 */
682						XFS_BUF_ASYNC(bp);
683						error = xfs_bwrite(mp, bp);
684					} else {
685						xfs_buf_relse(bp);
686					}
687				} else {
688					xfs_buf_relse(bp);
689				}
690			}
691		} else {
			/* Blocking path: always write the superblock,
			 * sync or async depending on SYNC_WAIT. */
692			bp = xfs_getsb(mp, 0);
693			/*
694			 * If the buffer is pinned then push on the log so
695			 * we won't get stuck waiting in the write for
696			 * someone, maybe ourselves, to flush the log.
697			 * Even though we just pushed the log above, we
698			 * did not have the superblock buffer locked at
699			 * that point so it can become pinned in between
700			 * there and here.
701			 */
702			if (XFS_BUF_ISPINNED(bp))
703				xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
704			if (flags & SYNC_WAIT)
705				XFS_BUF_UNASYNC(bp);
706			else
707				XFS_BUF_ASYNC(bp);
708			error = xfs_bwrite(mp, bp);
709		}
710		if (error) {
711			last_error = error;
712		}
713	}
714
715	/*
716	 * Now check to see if the log needs a "dummy" transaction.
717	 */
718	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
719		xfs_trans_t *tp;
720		xfs_inode_t *ip;
721
722		/*
723		 * Put a dummy transaction in the log to tell
724		 * recovery that all others are OK.
725		 */
726		tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
727		if ((error = xfs_trans_reserve(tp, 0,
728				XFS_ICHANGE_LOG_RES(mp),
729				0, 0, 0))) {
730			xfs_trans_cancel(tp, 0);
			/*
			 * NOTE(review): returns the raw error (not wrapped in
			 * XFS_ERROR() like the normal exit) and discards any
			 * last_error accumulated above -- confirm intended.
			 */
731			return error;
732		}
733
		/* Log a no-op core change against the root inode so that
		 * recovery sees a covered log. */
734		ip = mp->m_rootip;
735		xfs_ilock(ip, XFS_ILOCK_EXCL);
736
737		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
738		xfs_trans_ihold(tp, ip);
739		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
740		error = xfs_trans_commit(tp, 0);
741		xfs_iunlock(ip, XFS_ILOCK_EXCL);
742		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
743	}
744
745	/*
746	 * When shutting down, we need to ensure that the AIL is pushed
747	 * to disk or the filesystem can appear corrupt from the PROM.
748	 */
749	if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
750		XFS_bflush(mp->m_ddev_targp);
751		if (mp->m_rtdev_targp) {
752			XFS_bflush(mp->m_rtdev_targp);
753		}
754	}
755
756	return XFS_ERROR(last_error);
757}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644
index a74b05087da..00000000000
--- a/fs/xfs/xfs_vfsops.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef _XFS_VFSOPS_H
2#define _XFS_VFSOPS_H 1
3
4struct cred;
5struct xfs_fid;
6struct inode;
7struct kstatfs;
8struct xfs_mount;
9struct xfs_mount_args;
10
11int xfs_sync(struct xfs_mount *mp, int flags);
12void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
13 int lnnum);
14void xfs_attr_quiesce(struct xfs_mount *mp);
15
16#endif /* _XFS_VFSOPS_H */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8b6812f66a1..f07bf8768c3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -54,33 +54,10 @@
54#include "xfs_vnodeops.h" 54#include "xfs_vnodeops.h"
55 55
56int 56int
57xfs_open(
58 xfs_inode_t *ip)
59{
60 int mode;
61
62 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63 return XFS_ERROR(EIO);
64
65 /*
66 * If it's a directory with any blocks, read-ahead block 0
67 * as we're almost certain to have the next operation be a read there.
68 */
69 if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70 mode = xfs_ilock_map_shared(ip);
71 if (ip->i_d.di_nextents > 0)
72 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73 xfs_iunlock(ip, mode);
74 }
75 return 0;
76}
77
78int
79xfs_setattr( 57xfs_setattr(
80 struct xfs_inode *ip, 58 struct xfs_inode *ip,
81 struct iattr *iattr, 59 struct iattr *iattr,
82 int flags, 60 int flags)
83 cred_t *credp)
84{ 61{
85 xfs_mount_t *mp = ip->i_mount; 62 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = VFS_I(ip); 63 struct inode *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
93 gid_t gid=0, igid=0; 70 gid_t gid=0, igid=0;
94 int timeflags = 0; 71 int timeflags = 0;
95 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 72 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
96 int file_owner;
97 int need_iolock = 1; 73 int need_iolock = 1;
98 74
99 xfs_itrace_entry(ip); 75 xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
104 if (XFS_FORCED_SHUTDOWN(mp)) 80 if (XFS_FORCED_SHUTDOWN(mp))
105 return XFS_ERROR(EIO); 81 return XFS_ERROR(EIO);
106 82
83 code = -inode_change_ok(inode, iattr);
84 if (code)
85 return code;
86
107 olddquot1 = olddquot2 = NULL; 87 olddquot1 = olddquot2 = NULL;
108 udqp = gdqp = NULL; 88 udqp = gdqp = NULL;
109 89
@@ -181,62 +161,8 @@ xfs_setattr(
181 161
182 xfs_ilock(ip, lock_flags); 162 xfs_ilock(ip, lock_flags);
183 163
184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186
187 /*
188 * Change various properties of a file.
189 * Only the owner or users with CAP_FOWNER
190 * capability may do these things.
191 */
192 if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
193 /*
194 * CAP_FOWNER overrides the following restrictions:
195 *
196 * The user ID of the calling process must be equal
197 * to the file owner ID, except in cases where the
198 * CAP_FSETID capability is applicable.
199 */
200 if (!file_owner && !capable(CAP_FOWNER)) {
201 code = XFS_ERROR(EPERM);
202 goto error_return;
203 }
204
205 /*
206 * CAP_FSETID overrides the following restrictions:
207 *
208 * The effective user ID of the calling process shall match
209 * the file owner when setting the set-user-ID and
210 * set-group-ID bits on that file.
211 *
212 * The effective group ID or one of the supplementary group
213 * IDs of the calling process shall match the group owner of
214 * the file when setting the set-group-ID bit on that file
215 */
216 if (mask & ATTR_MODE) {
217 mode_t m = 0;
218
219 if ((iattr->ia_mode & S_ISUID) && !file_owner)
220 m |= S_ISUID;
221 if ((iattr->ia_mode & S_ISGID) &&
222 !in_group_p((gid_t)ip->i_d.di_gid))
223 m |= S_ISGID;
224#if 0
225 /* Linux allows this, Irix doesn't. */
226 if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
227 m |= S_ISVTX;
228#endif
229 if (m && !capable(CAP_FSETID))
230 iattr->ia_mode &= ~m;
231 }
232 }
233
234 /* 164 /*
235 * Change file ownership. Must be the owner or privileged. 165 * Change file ownership. Must be the owner or privileged.
236 * If the system was configured with the "restricted_chown"
237 * option, the owner is not permitted to give away the file,
238 * and can change the group id only to a group of which he
239 * or she is a member.
240 */ 166 */
241 if (mask & (ATTR_UID|ATTR_GID)) { 167 if (mask & (ATTR_UID|ATTR_GID)) {
242 /* 168 /*
@@ -251,23 +177,6 @@ xfs_setattr(
251 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 177 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
252 178
253 /* 179 /*
254 * CAP_CHOWN overrides the following restrictions:
255 *
256 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
257 * shall override the restriction that a process cannot
258 * change the user ID of a file it owns and the restriction
259 * that the group ID supplied to the chown() function
260 * shall be equal to either the group ID or one of the
261 * supplementary group IDs of the calling process.
262 */
263 if (restricted_chown &&
264 (iuid != uid || (igid != gid &&
265 !in_group_p((gid_t)gid))) &&
266 !capable(CAP_CHOWN)) {
267 code = XFS_ERROR(EPERM);
268 goto error_return;
269 }
270 /*
271 * Do a quota reservation only if uid/gid is actually 180 * Do a quota reservation only if uid/gid is actually
272 * going to change. 181 * going to change.
273 */ 182 */
@@ -304,36 +213,22 @@ xfs_setattr(
304 code = XFS_ERROR(EINVAL); 213 code = XFS_ERROR(EINVAL);
305 goto error_return; 214 goto error_return;
306 } 215 }
216
307 /* 217 /*
308 * Make sure that the dquots are attached to the inode. 218 * Make sure that the dquots are attached to the inode.
309 */ 219 */
310 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) 220 code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
221 if (code)
311 goto error_return; 222 goto error_return;
312 }
313
314 /*
315 * Change file access or modified times.
316 */
317 if (mask & (ATTR_ATIME|ATTR_MTIME)) {
318 if (!file_owner) {
319 if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
320 !capable(CAP_FOWNER)) {
321 code = XFS_ERROR(EPERM);
322 goto error_return;
323 }
324 }
325 }
326 223
327 /* 224 /*
328 * Now we can make the changes. Before we join the inode 225 * Now we can make the changes. Before we join the inode
329 * to the transaction, if ATTR_SIZE is set then take care of 226 * to the transaction, if ATTR_SIZE is set then take care of
330 * the part of the truncation that must be done without the 227 * the part of the truncation that must be done without the
331 * inode lock. This needs to be done before joining the inode 228 * inode lock. This needs to be done before joining the inode
332 * to the transaction, because the inode cannot be unlocked 229 * to the transaction, because the inode cannot be unlocked
333 * once it is a part of the transaction. 230 * once it is a part of the transaction.
334 */ 231 */
335 if (mask & ATTR_SIZE) {
336 code = 0;
337 if (iattr->ia_size > ip->i_size) { 232 if (iattr->ia_size > ip->i_size) {
338 /* 233 /*
339 * Do the first part of growing a file: zero any data 234 * Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
366 } 261 }
367 262
368 /* wait for all I/O to complete */ 263 /* wait for all I/O to complete */
369 vn_iowait(ip); 264 xfs_ioend_wait(ip);
370 265
371 if (!code) 266 if (!code)
372 code = xfs_itruncate_data(ip, iattr->ia_size); 267 code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
388 } 283 }
389 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 284 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
390 xfs_ilock(ip, XFS_ILOCK_EXCL); 285 xfs_ilock(ip, XFS_ILOCK_EXCL);
391 }
392 286
393 if (tp) {
394 xfs_trans_ijoin(tp, ip, lock_flags); 287 xfs_trans_ijoin(tp, ip, lock_flags);
395 xfs_trans_ihold(tp, ip); 288 xfs_trans_ihold(tp, ip);
396 }
397 289
398 /*
399 * Truncate file. Must have write permission and not be a directory.
400 */
401 if (mask & ATTR_SIZE) {
402 /* 290 /*
403 * Only change the c/mtime if we are changing the size 291 * Only change the c/mtime if we are changing the size
404 * or we are explicitly asked to change it. This handles 292 * or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
438 */ 326 */
439 xfs_iflags_set(ip, XFS_ITRUNCATED); 327 xfs_iflags_set(ip, XFS_ITRUNCATED);
440 } 328 }
441 } 329 } else if (tp) {
442 330 xfs_trans_ijoin(tp, ip, lock_flags);
443 /* 331 xfs_trans_ihold(tp, ip);
444 * Change file access modes.
445 */
446 if (mask & ATTR_MODE) {
447 ip->i_d.di_mode &= S_IFMT;
448 ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
449
450 inode->i_mode &= S_IFMT;
451 inode->i_mode |= iattr->ia_mode & ~S_IFMT;
452
453 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
454 timeflags |= XFS_ICHGTIME_CHG;
455 } 332 }
456 333
457 /* 334 /*
458 * Change file ownership. Must be the owner or privileged. 335 * Change file ownership. Must be the owner or privileged.
459 * If the system was configured with the "restricted_chown"
460 * option, the owner is not permitted to give away the file,
461 * and can change the group id only to a group of which he
462 * or she is a member.
463 */ 336 */
464 if (mask & (ATTR_UID|ATTR_GID)) { 337 if (mask & (ATTR_UID|ATTR_GID)) {
465 /* 338 /*
@@ -503,6 +376,24 @@ xfs_setattr(
503 timeflags |= XFS_ICHGTIME_CHG; 376 timeflags |= XFS_ICHGTIME_CHG;
504 } 377 }
505 378
379 /*
380 * Change file access modes.
381 */
382 if (mask & ATTR_MODE) {
383 umode_t mode = iattr->ia_mode;
384
385 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
386 mode &= ~S_ISGID;
387
388 ip->i_d.di_mode &= S_IFMT;
389 ip->i_d.di_mode |= mode & ~S_IFMT;
390
391 inode->i_mode &= S_IFMT;
392 inode->i_mode |= mode & ~S_IFMT;
393
394 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
395 timeflags |= XFS_ICHGTIME_CHG;
396 }
506 397
507 /* 398 /*
508 * Change file access or modified times. 399 * Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
713 return XFS_ERROR(EIO); 604 return XFS_ERROR(EIO);
714 605
715 /* capture size updates in I/O completion before writing the inode. */ 606 /* capture size updates in I/O completion before writing the inode. */
716 error = filemap_fdatawait(VFS_I(ip)->i_mapping); 607 error = xfs_wait_on_pages(ip, 0, -1);
717 if (error) 608 if (error)
718 return XFS_ERROR(error); 609 return XFS_ERROR(error);
719 610
@@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt(
1029 goto error0; 920 goto error0;
1030 } 921 }
1031 /* 922 /*
923 * transaction commit worked ok so we can drop the extra ticket
924 * reference that we gained in xfs_trans_dup()
925 */
926 xfs_log_ticket_put(tp->t_ticket);
927
928 /*
1032 * Remove the memory for extent descriptions (just bookkeeping). 929 * Remove the memory for extent descriptions (just bookkeeping).
1033 */ 930 */
1034 if (ip->i_df.if_bytes) 931 if (ip->i_df.if_bytes)
@@ -1625,8 +1522,6 @@ xfs_create(
1625 xfs_trans_set_sync(tp); 1522 xfs_trans_set_sync(tp);
1626 } 1523 }
1627 1524
1628 dp->i_gen++;
1629
1630 /* 1525 /*
1631 * Attach the dquot(s) to the inodes and modify them incore. 1526 * Attach the dquot(s) to the inodes and modify them incore.
1632 * These ids of the inode couldn't have changed since the new 1527 * These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
1993 } 1888 }
1994 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1889 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1995 1890
1996 /*
1997 * Bump the in memory generation count on the parent
1998 * directory so that other can know that it has changed.
1999 */
2000 dp->i_gen++;
2001 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2002
2003 if (is_dir) { 1891 if (is_dir) {
2004 /* 1892 /*
2005 * Drop the link from ip's "..". 1893 * Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
2009 goto out_bmap_cancel; 1897 goto out_bmap_cancel;
2010 1898
2011 /* 1899 /*
2012 * Drop the link from dp to ip. 1900 * Drop the "." link from ip to self.
2013 */ 1901 */
2014 error = xfs_droplink(tp, ip); 1902 error = xfs_droplink(tp, ip);
2015 if (error) 1903 if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
2017 } else { 1905 } else {
2018 /* 1906 /*
2019 * When removing a non-directory we need to log the parent 1907 * When removing a non-directory we need to log the parent
2020 * inode here for the i_gen update. For a directory this is 1908 * inode here. For a directory this is done implicitly
2021 * done implicitly by the xfs_droplink call for the ".." entry. 1909 * by the xfs_droplink call for the ".." entry.
2022 */ 1910 */
2023 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1911 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2024 } 1912 }
2025 1913
2026 /* 1914 /*
2027 * Drop the "." link from ip to self. 1915 * Drop the link from dp to ip.
2028 */ 1916 */
2029 error = xfs_droplink(tp, ip); 1917 error = xfs_droplink(tp, ip);
2030 if (error) 1918 if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
2178 if (error) 2066 if (error)
2179 goto abort_return; 2067 goto abort_return;
2180 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2068 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2181 tdp->i_gen++;
2182 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 2069 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2183 2070
2184 error = xfs_bumplink(tp, sip); 2071 error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
2355 } 2242 }
2356 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2357 2244
2358 /*
2359 * Bump the in memory version number of the parent directory
2360 * so that other processes accessing it will recognize that
2361 * the directory has changed.
2362 */
2363 dp->i_gen++;
2364
2365 error = xfs_dir_init(tp, cdp, dp); 2245 error = xfs_dir_init(tp, cdp, dp);
2366 if (error) 2246 if (error)
2367 goto error2; 2247 goto error2;
2368 2248
2369 cdp->i_gen = 1;
2370 error = xfs_bumplink(tp, dp); 2249 error = xfs_bumplink(tp, dp);
2371 if (error) 2250 if (error)
2372 goto error2; 2251 goto error2;
@@ -2653,13 +2532,6 @@ xfs_symlink(
2653 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2654 2533
2655 /* 2534 /*
2656 * Bump the in memory version number of the parent directory
2657 * so that other processes accessing it will recognize that
2658 * the directory has changed.
2659 */
2660 dp->i_gen++;
2661
2662 /*
2663 * If this is a synchronous mount, make sure that the 2535 * If this is a synchronous mount, make sure that the
2664 * symlink transaction goes to disk before returning to 2536 * symlink transaction goes to disk before returning to
2665 * the user. 2537 * the user.
@@ -2809,7 +2681,7 @@ xfs_reclaim(
2809 return 0; 2681 return 0;
2810 } 2682 }
2811 2683
2812 vn_iowait(ip); 2684 xfs_ioend_wait(ip);
2813 2685
2814 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 2686 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2815 2687
@@ -2833,122 +2705,10 @@ xfs_reclaim(
2833 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 2705 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2834 xfs_ilock(ip, XFS_ILOCK_EXCL); 2706 xfs_ilock(ip, XFS_ILOCK_EXCL);
2835 xfs_iflock(ip); 2707 xfs_iflock(ip);
2836 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); 2708 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2837 } else { 2709 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2838 xfs_mount_t *mp = ip->i_mount;
2839
2840 /* Protect sync and unpin from us */
2841 XFS_MOUNT_ILOCK(mp);
2842 spin_lock(&ip->i_flags_lock);
2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2844 VFS_I(ip)->i_private = NULL;
2845 ip->i_vnode = NULL;
2846 spin_unlock(&ip->i_flags_lock);
2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2848 XFS_MOUNT_IUNLOCK(mp);
2849 }
2850 return 0;
2851}
2852
2853int
2854xfs_finish_reclaim(
2855 xfs_inode_t *ip,
2856 int locked,
2857 int sync_mode)
2858{
2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2860 struct inode *vp = VFS_I(ip);
2861
2862 if (vp && VN_BAD(vp))
2863 goto reclaim;
2864
2865 /* The hash lock here protects a thread in xfs_iget_core from
2866 * racing with us on linking the inode back with a vnode.
2867 * Once we have the XFS_IRECLAIM flag set it will not touch
2868 * us.
2869 */
2870 write_lock(&pag->pag_ici_lock);
2871 spin_lock(&ip->i_flags_lock);
2872 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2873 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2874 spin_unlock(&ip->i_flags_lock);
2875 write_unlock(&pag->pag_ici_lock);
2876 if (locked) {
2877 xfs_ifunlock(ip);
2878 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2879 }
2880 return 1;
2881 }
2882 __xfs_iflags_set(ip, XFS_IRECLAIM);
2883 spin_unlock(&ip->i_flags_lock);
2884 write_unlock(&pag->pag_ici_lock);
2885 xfs_put_perag(ip->i_mount, pag);
2886
2887 /*
2888 * If the inode is still dirty, then flush it out. If the inode
2889 * is not in the AIL, then it will be OK to flush it delwri as
2890 * long as xfs_iflush() does not keep any references to the inode.
2891 * We leave that decision up to xfs_iflush() since it has the
2892 * knowledge of whether it's OK to simply do a delwri flush of
2893 * the inode or whether we need to wait until the inode is
2894 * pulled from the AIL.
2895 * We get the flush lock regardless, though, just to make sure
2896 * we don't free it while it is being flushed.
2897 */
2898 if (!locked) {
2899 xfs_ilock(ip, XFS_ILOCK_EXCL);
2900 xfs_iflock(ip);
2901 } 2710 }
2902 2711 xfs_inode_set_reclaim_tag(ip);
2903 /*
2904 * In the case of a forced shutdown we rely on xfs_iflush() to
2905 * wait for the inode to be unpinned before returning an error.
2906 */
2907 if (xfs_iflush(ip, sync_mode) == 0) {
2908 /* synchronize with xfs_iflush_done */
2909 xfs_iflock(ip);
2910 xfs_ifunlock(ip);
2911 }
2912
2913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2914
2915 reclaim:
2916 xfs_ireclaim(ip);
2917 return 0;
2918}
2919
2920int
2921xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
2922{
2923 int purged;
2924 xfs_inode_t *ip, *n;
2925 int done = 0;
2926
2927 while (!done) {
2928 purged = 0;
2929 XFS_MOUNT_ILOCK(mp);
2930 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
2931 if (noblock) {
2932 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
2933 continue;
2934 if (xfs_ipincount(ip) ||
2935 !xfs_iflock_nowait(ip)) {
2936 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2937 continue;
2938 }
2939 }
2940 XFS_MOUNT_IUNLOCK(mp);
2941 if (xfs_finish_reclaim(ip, noblock,
2942 XFS_IFLUSH_DELWRI_ELSE_ASYNC))
2943 delay(1);
2944 purged = 1;
2945 break;
2946 }
2947
2948 done = !purged;
2949 }
2950
2951 XFS_MOUNT_IUNLOCK(mp);
2952 return 0; 2712 return 0;
2953} 2713}
2954 2714
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2957 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3198 XFS_IS_REALTIME_INODE(ip) ? 2958 XFS_IS_REALTIME_INODE(ip) ?
3199 mp->m_rtdev_targp : mp->m_ddev_targp); 2959 mp->m_rtdev_targp : mp->m_ddev_targp);
2960 if (!bp)
2961 return XFS_ERROR(ENOMEM);
3200 2962
3201 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 2963 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3202 offset_fsb = XFS_B_TO_FSBT(mp, offset); 2964 offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
3312 need_iolock = 0; 3074 need_iolock = 0;
3313 if (need_iolock) { 3075 if (need_iolock) {
3314 xfs_ilock(ip, XFS_IOLOCK_EXCL); 3076 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3315 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 3077 /* wait for the completion of any pending DIOs */
3078 xfs_ioend_wait(ip);
3316 } 3079 }
3317 3080
3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3081 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
3474 int cmd, 3237 int cmd,
3475 xfs_flock64_t *bf, 3238 xfs_flock64_t *bf,
3476 xfs_off_t offset, 3239 xfs_off_t offset,
3477 cred_t *credp,
3478 int attr_flags) 3240 int attr_flags)
3479{ 3241{
3480 xfs_mount_t *mp = ip->i_mount; 3242 xfs_mount_t *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
3562 iattr.ia_valid = ATTR_SIZE; 3324 iattr.ia_valid = ATTR_SIZE;
3563 iattr.ia_size = startoffset; 3325 iattr.ia_size = startoffset;
3564 3326
3565 error = xfs_setattr(ip, &iattr, attr_flags, credp); 3327 error = xfs_setattr(ip, &iattr, attr_flags);
3566 3328
3567 if (error) 3329 if (error)
3568 return error; 3330 return error;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index e932a96bec5..76df328c61b 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -14,9 +14,7 @@ struct xfs_inode;
14struct xfs_iomap; 14struct xfs_iomap;
15 15
16 16
17int xfs_open(struct xfs_inode *ip); 17int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
19 struct cred *credp);
20#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ 18#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
21#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
22#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
@@ -28,24 +26,23 @@ int xfs_inactive(struct xfs_inode *ip);
28int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
29 struct xfs_inode **ipp, struct xfs_name *ci_name); 27 struct xfs_inode **ipp, struct xfs_name *ci_name);
30int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode, 28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
31 xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp); 29 xfs_dev_t rdev, struct xfs_inode **ipp, cred_t *credp);
32int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
33 struct xfs_inode *ip); 31 struct xfs_inode *ip);
34int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
35 struct xfs_name *target_name); 33 struct xfs_name *target_name);
36int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name, 34int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
37 mode_t mode, struct xfs_inode **ipp, struct cred *credp); 35 mode_t mode, struct xfs_inode **ipp, cred_t *credp);
38int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 36int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
39 xfs_off_t *offset, filldir_t filldir); 37 xfs_off_t *offset, filldir_t filldir);
40int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 38int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
41 const char *target_path, mode_t mode, struct xfs_inode **ipp, 39 const char *target_path, mode_t mode, struct xfs_inode **ipp,
42 struct cred *credp); 40 cred_t *credp);
43int xfs_inode_flush(struct xfs_inode *ip, int flags); 41int xfs_inode_flush(struct xfs_inode *ip, int flags);
44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
45int xfs_reclaim(struct xfs_inode *ip); 43int xfs_reclaim(struct xfs_inode *ip);
46int xfs_change_file_space(struct xfs_inode *ip, int cmd, 44int xfs_change_file_space(struct xfs_inode *ip, int cmd,
47 xfs_flock64_t *bf, xfs_off_t offset, 45 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
48 struct cred *credp, int attr_flags);
49int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 46int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
50 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 47 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
51 struct xfs_name *target_name, struct xfs_inode *target_ip); 48 struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
56int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 53int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
57int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 54int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
58 int flags, struct attrlist_cursor_kern *cursor); 55 int flags, struct attrlist_cursor_kern *cursor);
59int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
60 int ioflags, unsigned int cmd, void __user *arg);
61ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, 56ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
62 const struct iovec *iovp, unsigned int segs, 57 const struct iovec *iovp, unsigned int segs,
63 loff_t *offset, int ioflags); 58 loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
78 xfs_off_t last, int fiopt); 73 xfs_off_t last, int fiopt);
79int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, 74int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
80 xfs_off_t last, uint64_t flags, int fiopt); 75 xfs_off_t last, uint64_t flags, int fiopt);
76int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
81 77
82#endif /* _XFS_VNODEOPS_H */ 78#endif /* _XFS_VNODEOPS_H */